From 3eadd1a9ead7a009a9abfbd603a5efd0560473cc Mon Sep 17 00:00:00 2001
From: Clayton Coleman
Date: Wed, 19 May 2021 13:05:33 -0400
Subject: [PATCH] Keep pod worker running until pod is truly complete

A number of race conditions exist when pods are terminated early in
their lifecycle because components in the kubelet need to know "no
running containers" or "containers can't be started from now on" but
were relying on outdated state.

Only the pod worker knows whether containers are being started for a
given pod, which is required to know when a pod is "terminated" (no
running containers, none coming). Move that responsibility and the
podKiller function into the pod workers, and have everything that was
killing the pod go through the UpdatePod loop. Split syncPod into three
phases - setup, terminate containers, and cleanup pod - and have
transitions between those methods be visible to other components. After
this change, to kill a pod you tell the pod worker to
UpdatePod({UpdateType: SyncPodKill, Pod: pod}).

Several places in the kubelet were incorrect about whether they were
handling terminating (should stop running, might have containers) or
terminated (no running containers) pods. The pod worker exposes methods
that allow other loops to know when to set up or tear down resources
based on the state of the pod - these methods remove the possibility of
race conditions by ensuring a single component is responsible for
knowing each pod's allowed state, while other components simply check
whether a pod is within that window by UID.

Removing containers no longer blocks final pod deletion in the API
server; container removal is handled as background cleanup. Node
shutdown no longer marks pods as failed, as they can be restarted in
the next step.

See https://docs.google.com/document/d/1Pic5TPntdJnYfIpBeZndDelM-AbS4FN9H2GTLFhoJ04/edit# for details
---
 pkg/kubelet/container/runtime.go              | 3 +-
 pkg/kubelet/eviction/eviction_manager.go      | 12 +-
 pkg/kubelet/eviction/eviction_manager_test.go | 10 +-
 pkg/kubelet/eviction/types.go                 | 4 +-
 pkg/kubelet/kubelet.go                        | 329 +++---
 pkg/kubelet/kubelet_pods.go                   | 396 +++----
 pkg/kubelet/kubelet_test.go                   | 174 ++--
 pkg/kubelet/kubelet_volumes.go                | 5 -
 pkg/kubelet/kubelet_volumes_test.go           | 8 +-
 .../kuberuntime/fake_kuberuntime_manager.go   | 20 +-
 pkg/kubelet/kuberuntime/kuberuntime_gc.go     | 17 +-
 .../kuberuntime/kuberuntime_gc_test.go        | 430 ++++----
 .../kuberuntime/kuberuntime_manager.go        | 10 +-
 .../kuberuntime/kuberuntime_manager_test.go   | 11 +-
 .../nodeshutdown_manager_linux.go             | 17 +-
 .../nodeshutdown_manager_linux_test.go        | 6 +-
 pkg/kubelet/pod/pod_manager.go                | 2 +-
 pkg/kubelet/pod/testing/mock_manager.go       | 24 +-
 pkg/kubelet/pod_workers.go                    | 984 +++++++++++++++---
 pkg/kubelet/pod_workers_test.go               | 346 +++++-
 pkg/kubelet/preemption/preemption.go          | 16 +-
 pkg/kubelet/preemption/preemption_test.go     | 4 +-
 pkg/kubelet/runonce.go                        | 10 +-
 pkg/kubelet/runonce_test.go                   | 5 +-
 pkg/kubelet/status/status_manager.go          | 39 +-
 pkg/kubelet/types/pod_update.go               | 4 +-
 .../desired_state_of_world_populator.go       | 63 +-
 .../desired_state_of_world_populator_test.go  | 114 +-
 pkg/kubelet/volumemanager/volume_manager.go   | 65 +-
 .../volumemanager/volume_manager_fake.go      | 5 +
 .../volumemanager/volume_manager_test.go      | 22 +-
 pkg/volume/util/util.go                       | 5 +-
 test/e2e/apps/statefulset.go                  | 9 +-
 test/e2e/node/pods.go                         | 2 +
 34 files changed, 2066 insertions(+), 1105 deletions(-)

diff --git a/pkg/kubelet/container/runtime.go b/pkg/kubelet/container/runtime.go
index b0baa29e4a2..45ed19063bd 100644
---
a/pkg/kubelet/container/runtime.go +++ b/pkg/kubelet/container/runtime.go @@ -92,7 +92,8 @@ type Runtime interface { // file). In this case, garbage collector should refrain itself from aggressive // behavior such as removing all containers of unrecognized pods (yet). // If evictNonDeletedPods is set to true, containers and sandboxes belonging to pods - // that are terminated, but not deleted will be evicted. Otherwise, only deleted pods will be GC'd. + // that are terminated, but not deleted will be evicted. Otherwise, only deleted pods + // will be GC'd. // TODO: Revisit this method and make it cleaner. GarbageCollect(gcPolicy GCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error // SyncPod syncs the running pod into the desired pod. diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index 338858e0dcf..a863e750ca8 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -561,15 +561,15 @@ func (m *managerImpl) evictPod(pod *v1.Pod, gracePeriodOverride int64, evictMsg klog.ErrorS(nil, "Eviction manager: cannot evict a critical pod", "pod", klog.KObj(pod)) return false } - status := v1.PodStatus{ - Phase: v1.PodFailed, - Message: evictMsg, - Reason: Reason, - } // record that we are evicting the pod m.recorder.AnnotatedEventf(pod, annotations, v1.EventTypeWarning, Reason, evictMsg) // this is a blocking call and should only return when the pod and its containers are killed. - err := m.killPodFunc(pod, status, &gracePeriodOverride) + klog.V(3).InfoS("Evicting pod", "pod", klog.KObj(pod), "podUID", pod.UID, "message", evictMsg) + err := m.killPodFunc(pod, true, &gracePeriodOverride, func(status *v1.PodStatus) { + status.Phase = v1.PodFailed + status.Reason = Reason + status.Message = evictMsg + }) if err != nil { klog.ErrorS(err, "Eviction manager: pod failed to evict", "pod", klog.KObj(pod)) } else { diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index afd611530c9..ddda845d3ad 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -21,7 +21,7 @@ import ( "testing" "time" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/clock" @@ -46,14 +46,16 @@ const ( // mockPodKiller is used to testing which pod is killed type mockPodKiller struct { pod *v1.Pod - status v1.PodStatus + evict bool + statusFn func(*v1.PodStatus) gracePeriodOverride *int64 } // killPodNow records the pod that was killed -func (m *mockPodKiller) killPodNow(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int64) error { +func (m *mockPodKiller) killPodNow(pod *v1.Pod, evict bool, gracePeriodOverride *int64, statusFn func(*v1.PodStatus)) error { m.pod = pod - m.status = status + m.statusFn = statusFn + m.evict = evict m.gracePeriodOverride = gracePeriodOverride return nil } diff --git a/pkg/kubelet/eviction/types.go b/pkg/kubelet/eviction/types.go index 90c99809561..95214d84540 100644 --- a/pkg/kubelet/eviction/types.go +++ b/pkg/kubelet/eviction/types.go @@ -19,7 +19,7 @@ package eviction import ( "time" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1" @@ -92,7 +92,7 @@ type ContainerGC interface { // pod - the pod to kill // status - the desired status to associate with 
the pod (i.e. why its killed) // gracePeriodOverride - the grace period override to use instead of what is on the pod spec -type KillPodFunc func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int64) error +type KillPodFunc func(pod *v1.Pod, isEvicted bool, gracePeriodOverride *int64, fn func(*v1.PodStatus)) error // MirrorPodFunc returns the mirror pod for the given static pod and // whether it was known to the pod manager. diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index ebf2fbf5e2f..f0653e1b36a 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -17,6 +17,7 @@ limitations under the License. package kubelet import ( + "context" "crypto/tls" "fmt" "math" @@ -136,6 +137,11 @@ const ( // Period for performing global cleanup tasks. housekeepingPeriod = time.Second * 2 + // Duration at which housekeeping failed to satisfy the invariant that + // housekeeping should be fast to avoid blocking pod config (while + // housekeeping is running no new pods are started or deleted). + housekeepingWarningDuration = time.Second * 15 + // Period for performing eviction monitoring. // ensure this is kept in sync with internal cadvisor housekeeping. evictionMonitoringPeriod = time.Second * 10 @@ -626,6 +632,20 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, klet.containerLogManager = logs.NewStubContainerLogManager() } + klet.reasonCache = NewReasonCache() + klet.workQueue = queue.NewBasicWorkQueue(klet.clock) + klet.podWorkers = newPodWorkers( + klet.syncPod, + klet.syncTerminatingPod, + klet.syncTerminatedPod, + + kubeDeps.Recorder, + klet.workQueue, + klet.resyncInterval, + backOffPeriod, + klet.podCache, + ) + runtime, err := kuberuntime.NewKubeGenericRuntimeManager( kubecontainer.FilterEventRecorder(kubeDeps.Recorder), klet.livenessManager, @@ -633,7 +653,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, klet.startupManager, seccompProfileRoot, machineInfo, - klet, + klet.podWorkers, kubeDeps.OSInterface, klet, httpClient, @@ -764,7 +784,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeCfg.EnableControllerAttachDetach, nodeName, klet.podManager, - klet.statusManager, + klet.podWorkers, klet.kubeClient, klet.volumePluginMgr, klet.containerRuntime, @@ -776,12 +796,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, keepTerminatedPodVolumes, volumepathhandler.NewBlockVolumePathHandler()) - klet.reasonCache = NewReasonCache() - klet.workQueue = queue.NewBasicWorkQueue(klet.clock) - klet.podWorkers = newPodWorkers(klet.syncPod, kubeDeps.Recorder, klet.workQueue, klet.resyncInterval, backOffPeriod, klet.podCache) - klet.backOff = flowcontrol.NewBackOff(backOffPeriod, MaxContainerBackOff) - klet.podKiller = NewPodKiller(klet) // setup eviction manager evictionManager, evictionAdmitHandler := eviction.NewManager(klet.resourceAnalyzer, evictionConfig, killPodNow(klet.podWorkers, kubeDeps.Recorder), klet.podManager.GetMirrorPodByPod, klet.imageManager, klet.containerGC, kubeDeps.Recorder, nodeRef, klet.clock) @@ -1087,9 +1102,6 @@ type Kubelet struct { // Container restart Backoff backOff *flowcontrol.Backoff - // Pod killer handles pods to be killed - podKiller PodKiller - // Information about the ports which are opened by daemons on Node running this Kubelet server. 
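The new KillPodFunc contract above drops the precomputed v1.PodStatus in favor of an isEvicted flag plus a status-mutation callback, matching the eviction manager change earlier in this patch. A minimal caller sketch under that contract (the helper name, grace value, and message below are illustrative, not part of the patch):

	package example

	import (
		v1 "k8s.io/api/core/v1"

		"k8s.io/kubernetes/pkg/kubelet/eviction"
	)

	// evictForExample mirrors how the eviction manager now invokes its kill
	// function: the final phase/reason/message are applied through a callback
	// rather than passed in as a fully built v1.PodStatus.
	func evictForExample(killPod eviction.KillPodFunc, pod *v1.Pod) error {
		grace := int64(0) // illustrative override; real callers derive this from policy
		return killPod(pod, true /* isEvicted */, &grace, func(status *v1.PodStatus) {
			status.Phase = v1.PodFailed
			status.Reason = "Evicted"
			status.Message = "illustrative eviction message"
		})
	}
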
daemonEndpoints *v1.NodeDaemonEndpoints @@ -1452,10 +1464,6 @@ func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) { kl.initNetworkUtil() } - // Start a goroutine responsible for killing pods (that are not properly - // handled by pod workers). - go wait.Until(kl.podKiller.PerformPodKillingWork, 1*time.Second, wait.NeverStop) - // Start component sync loops. kl.statusManager.Start() @@ -1469,7 +1477,12 @@ func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) { kl.syncLoop(updates, kl) } -// syncPod is the transaction script for the sync of a single pod. +// syncPod is the transaction script for the sync of a single pod (setting up) +// a pod. The reverse (teardown) is handled in syncTerminatingPod and +// syncTerminatedPod. If syncPod exits without error, then the pod runtime +// state is in sync with the desired configuration state (pod is running). +// If syncPod exits with a transient error, the next invocation of syncPod +// is expected to make progress towards reaching the runtime state. // // Arguments: // @@ -1481,7 +1494,7 @@ func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) { // * If the pod is being seen as running for the first time, record pod // start latency // * Update the status of the pod in the status manager -// * Kill the pod if it should not be running +// * Kill the pod if it should not be running due to soft admission // * Create a mirror pod if the pod is a static pod, and does not // already have a mirror pod // * Create the data directories for the pod if they do not exist @@ -1496,39 +1509,9 @@ func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) { // This operation writes all events that are dispatched in order to provide // the most accurate information possible about an error situation to aid debugging. // Callers should not throw an event if this operation returns an error. -func (kl *Kubelet) syncPod(o syncPodOptions) error { - // pull out the required options - pod := o.pod - mirrorPod := o.mirrorPod - podStatus := o.podStatus - updateType := o.updateType - - // if we want to kill a pod, do it now! - if updateType == kubetypes.SyncPodKill { - killPodOptions := o.killPodOptions - if killPodOptions == nil || killPodOptions.PodStatusFunc == nil { - return fmt.Errorf("kill pod options are required if update type is kill") - } - apiPodStatus := killPodOptions.PodStatusFunc(pod, podStatus) - kl.statusManager.SetPodStatus(pod, apiPodStatus) - // we kill the pod with the specified grace period since this is a termination - if err := kl.killPod(pod, nil, podStatus, killPodOptions.PodTerminationGracePeriodSecondsOverride); err != nil { - kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) - // there was an error killing the pod, so we return that error directly - utilruntime.HandleError(err) - return err - } - return nil - } - - // If a pod is still gracefully terminating, then we do not want to - // take further action. This mitigates static pods and deleted pods - // from getting rerun prematurely or their cgroups being deleted before - // the runtime cleans up. 
- podFullName := kubecontainer.GetPodFullName(pod) - if kl.podKiller.IsPodPendingTerminationByPodName(podFullName) { - return fmt.Errorf("pod %q is pending termination", podFullName) - } +func (kl *Kubelet) syncPod(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) error { + klog.V(4).InfoS("syncPod enter", "pod", klog.KObj(pod), "podUID", pod.UID) + defer klog.V(4).InfoS("syncPod exit", "pod", klog.KObj(pod), "podUID", pod.UID) // Latency measurements for the main workflow are relative to the // first time the pod was seen by the API server. @@ -1565,16 +1548,15 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { podStatus.IPs = []string{apiPodStatus.PodIP} } - // Record the time it takes for the pod to become running. - existingStatus, ok := kl.statusManager.GetPodStatus(pod.UID) - if !ok || existingStatus.Phase == v1.PodPending && apiPodStatus.Phase == v1.PodRunning && - !firstSeenTime.IsZero() { - metrics.PodStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime)) - } - + // If the pod should not be running, we request the pod's containers be stopped. This is not the same + // as termination (we want to stop the pod, but potentially restart it later if soft admission allows + // it later). Set the status and phase appropriately runnable := kl.canRunPod(pod) if !runnable.Admit { - // Pod is not runnable; update the Pod and Container statuses to why. + // Pod is not runnable; and update the Pod and Container statuses to why. + if apiPodStatus.Phase != v1.PodFailed && apiPodStatus.Phase != v1.PodSucceeded { + apiPodStatus.Phase = v1.PodPending + } apiPodStatus.Reason = runnable.Reason apiPodStatus.Message = runnable.Message // Waiting containers are not creating. @@ -1591,22 +1573,28 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { } } - // Update status in the status manager + // Record the time it takes for the pod to become running. + existingStatus, ok := kl.statusManager.GetPodStatus(pod.UID) + if !ok || existingStatus.Phase == v1.PodPending && apiPodStatus.Phase == v1.PodRunning && + !firstSeenTime.IsZero() { + metrics.PodStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime)) + } + kl.statusManager.SetPodStatus(pod, apiPodStatus) - // Kill pod if it should not be running - if !runnable.Admit || pod.DeletionTimestamp != nil || apiPodStatus.Phase == v1.PodFailed { + // Pods that are not runnable must be stopped - return a typed error to the pod worker + if !runnable.Admit { + klog.V(2).InfoS("Pod is not runnable and must have running containers stopped", "pod", klog.KObj(pod), "podUID", pod.UID, "message", runnable.Message) var syncErr error - if err := kl.killPod(pod, nil, podStatus, nil); err != nil { + p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus) + if err := kl.killPod(pod, p, nil); err != nil { kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) syncErr = fmt.Errorf("error killing pod: %v", err) utilruntime.HandleError(syncErr) } else { - if !runnable.Admit { - // There was no error killing the pod, but the pod cannot be run. - // Return an error to signal that the sync loop should back off. - syncErr = fmt.Errorf("pod cannot be run: %s", runnable.Message) - } + // There was no error killing the pod, but the pod cannot be run. + // Return an error to signal that the sync loop should back off. 
+ syncErr = fmt.Errorf("pod cannot be run: %s", runnable.Message) } return syncErr } @@ -1622,7 +1610,8 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { pcm := kl.containerManager.NewPodContainerManager() // If pod has already been terminated then we need not create // or update the pod's cgroup - if !kl.podIsTerminated(pod) { + // TODO: once context cancellation is added this check can be removed + if !kl.podWorkers.IsPodTerminationRequested(pod.UID) { // When the kubelet is restarted with the cgroups-per-qos // flag enabled, all the pod's running containers // should be killed intermittently and brought back up @@ -1639,7 +1628,8 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { // exists or the pod is running for the first time podKilled := false if !pcm.Exists(pod) && !firstSync { - if err := kl.killPod(pod, nil, podStatus, nil); err == nil { + p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus) + if err := kl.killPod(pod, p, nil); err == nil { podKilled = true } else { klog.ErrorS(err, "KillPod failed", "pod", klog.KObj(pod), "podStatus", podStatus) @@ -1673,6 +1663,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { // The mirror pod is semantically different from the static pod. Remove // it. The mirror pod will get recreated later. klog.InfoS("Trying to delete pod", "pod", klog.KObj(pod), "podUID", mirrorPod.ObjectMeta.UID) + podFullName := kubecontainer.GetPodFullName(pod) var err error deleted, err = kl.podManager.DeleteMirrorPod(podFullName, &mirrorPod.ObjectMeta.UID) if deleted { @@ -1702,8 +1693,9 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { return err } - // Volume manager will not mount volumes for terminated pods - if !kl.podIsTerminated(pod) { + // Volume manager will not mount volumes for terminating pods + // TODO: once context cancellation is added this check can be removed + if !kl.podWorkers.IsPodTerminationRequested(pod.UID) { // Wait for volumes to attach/mount if err := kl.volumeManager.WaitForAttachAndMount(pod); err != nil { kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedMountVolume, "Unable to attach or mount volumes: %v", err) @@ -1734,6 +1726,125 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { return nil } +// syncTerminatingPod is expected to terminate all running containers in a pod. Once this method +// returns without error, the pod's local state can be safely cleaned up. If runningPod is passed, +// we perform no status updates. 
+func (kl *Kubelet) syncTerminatingPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, runningPod *kubecontainer.Pod, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error { + klog.V(4).InfoS("syncTerminatingPod enter", "pod", klog.KObj(pod), "podUID", pod.UID) + defer klog.V(4).InfoS("syncTerminatingPod exit", "pod", klog.KObj(pod), "podUID", pod.UID) + + // when we receive a runtime only pod (runningPod != nil) we don't need to update the status + // manager or refresh the status of the cache, because a successful killPod will ensure we do + // not get invoked again + if runningPod != nil { + // we kill the pod with the specified grace period since this is a termination + if gracePeriod != nil { + klog.V(4).InfoS("Pod terminating with grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "gracePeriod", *gracePeriod) + } else { + klog.V(4).InfoS("Pod terminating with grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "gracePeriod", nil) + } + if err := kl.killPod(pod, *runningPod, gracePeriod); err != nil { + kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) + // there was an error killing the pod, so we return that error directly + utilruntime.HandleError(err) + return err + } + klog.V(4).InfoS("Pod termination stopped all running orphan containers", "pod", klog.KObj(pod), "podUID", pod.UID) + return nil + } + + apiPodStatus := kl.generateAPIPodStatus(pod, podStatus) + if podStatusFn != nil { + podStatusFn(&apiPodStatus) + } + kl.statusManager.SetPodStatus(pod, apiPodStatus) + + if gracePeriod != nil { + klog.V(4).InfoS("Pod terminating with grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "gracePeriod", *gracePeriod) + } else { + klog.V(4).InfoS("Pod terminating with grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "gracePeriod", nil) + } + p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus) + if err := kl.killPod(pod, p, gracePeriod); err != nil { + kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) + // there was an error killing the pod, so we return that error directly + utilruntime.HandleError(err) + return err + } + + // Guard against consistency issues in KillPod implementations by checking that there are no + // running containers. This method is invoked infrequently so this is effectively free and can + // catch race conditions introduced by callers updating pod status out of order. 
+ // TODO: have KillPod return the terminal status of stopped containers and write that into the + // cache immediately + podStatus, err := kl.containerRuntime.GetPodStatus(pod.UID, pod.Name, pod.Namespace) + if err != nil { + klog.ErrorS(err, "Unable to read pod status prior to final pod termination", "pod", klog.KObj(pod), "podUID", pod.UID) + return err + } + var runningContainers []string + var containers []string + for _, s := range podStatus.ContainerStatuses { + if s.State == kubecontainer.ContainerStateRunning { + runningContainers = append(runningContainers, s.ID.String()) + } + containers = append(containers, fmt.Sprintf("(%s state=%s exitCode=%d finishedAt=%s)", s.Name, s.State, s.ExitCode, s.FinishedAt.UTC().Format(time.RFC3339Nano))) + } + if klog.V(4).Enabled() { + sort.Strings(containers) + klog.InfoS("Post-termination container state", "pod", klog.KObj(pod), "podUID", pod.UID, "containers", strings.Join(containers, " ")) + } + if len(runningContainers) > 0 { + return fmt.Errorf("detected running containers after a successful KillPod, CRI violation: %v", runningContainers) + } + + // we have successfully stopped all containers, the pod is terminating, our status is "done" + klog.V(4).InfoS("Pod termination stopped all running containers", "pod", klog.KObj(pod), "podUID", pod.UID) + + return nil +} + +// syncTerminatedPod cleans up a pod that has terminated (has no running containers). +// The invocations in this call are expected to tear down what PodResourcesAreReclaimed checks (which +// gates pod deletion). When this method exits the pod is expected to be ready for cleanup. +// TODO: make this method take a context and exit early +func (kl *Kubelet) syncTerminatedPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error { + klog.V(4).InfoS("syncTerminatedPod enter", "pod", klog.KObj(pod), "podUID", pod.UID) + defer klog.V(4).InfoS("syncTerminatedPod exit", "pod", klog.KObj(pod), "podUID", pod.UID) + + // generate the final status of the pod + // TODO: should we simply fold this into TerminatePod? that would give a single pod update + apiPodStatus := kl.generateAPIPodStatus(pod, podStatus) + kl.statusManager.SetPodStatus(pod, apiPodStatus) + + // volumes are unmounted after the pod worker reports ShouldPodRuntimeBeRemoved (which is satisfied + // before syncTerminatedPod is invoked) + if err := kl.volumeManager.WaitForUnmount(pod); err != nil { + return err + } + klog.V(4).InfoS("Pod termination unmounted volumes", "pod", klog.KObj(pod), "podUID", pod.UID) + + // Note: we leave pod containers to be reclaimed in the background since dockershim requires the + // container for retrieving logs and we want to make sure logs are available until the pod is + // physically deleted. + + // remove any cgroups in the hierarchy for pods that are no longer running. + if kl.cgroupsPerQOS { + pcm := kl.containerManager.NewPodContainerManager() + name, _ := pcm.GetPodContainerName(pod) + if err := pcm.Destroy(name); err != nil { + return err + } + klog.V(4).InfoS("Pod termination removed cgroups", "pod", klog.KObj(pod), "podUID", pod.UID) + } + + // mark the final pod status + kl.statusManager.TerminatePod(pod) + klog.V(4).InfoS("Pod is terminated and will need no more status updates", "pod", klog.KObj(pod), "podUID", pod.UID) + + return nil +} + // Get pods which should be resynchronized. Currently, the following pod should be resynchronized: // * pod whose work is ready. // * internal modules that request sync of a pod. 
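Taken together, syncPod, syncTerminatingPod, and syncTerminatedPod are the three phases the pod worker now drives in order, and nothing else is expected to call them directly. The sketch below shows only the happy-path ordering; the real state machine in pkg/kubelet/pod_workers.go adds backoff, grace-period negotiation, and gating between phases, and retries a failing phase rather than falling through:

	package kubelet

	import (
		"context"

		v1 "k8s.io/api/core/v1"

		kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
		kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
	)

	// podLifecycleSketch is illustrative only and is not part of this patch.
	func (kl *Kubelet) podLifecycleSketch(ctx context.Context, pod *v1.Pod, status *kubecontainer.PodStatus) error {
		// Phase 1: set up the pod and keep it running (invoked repeatedly while the pod is desired).
		if err := kl.syncPod(ctx, kubetypes.SyncPodUpdate, pod, nil, status); err != nil {
			return err
		}
		// Phase 2: stop all containers; returns without error only once nothing is running.
		if err := kl.syncTerminatingPod(ctx, pod, status, nil, nil, nil); err != nil {
			return err
		}
		// Phase 3: release volumes and cgroups and record the final status.
		return kl.syncTerminatedPod(ctx, pod, status)
	}
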
@@ -1776,28 +1887,11 @@ func (kl *Kubelet) deletePod(pod *v1.Pod) error { // for sources that haven't reported yet. return fmt.Errorf("skipping delete because sources aren't ready yet") } - kl.podWorkers.ForgetWorker(pod.UID) - - // make sure our runtimeCache is at least as fresh as the last container started event we observed. - // this ensures we correctly send graceful deletion signals to all containers we've reported started. - if lastContainerStarted, ok := kl.lastContainerStartedTime.Get(pod.UID); ok { - if err := kl.runtimeCache.ForceUpdateIfOlder(lastContainerStarted); err != nil { - return fmt.Errorf("error updating containers: %v", err) - } - } - // Runtime cache may not have been updated to with the pod, but it's okay - // because the periodic cleanup routine will attempt to delete again later. - runningPods, err := kl.runtimeCache.GetPods() - if err != nil { - return fmt.Errorf("error listing containers: %v", err) - } - runningPod := kubecontainer.Pods(runningPods).FindPod("", pod.UID) - if runningPod.IsEmpty() { - return fmt.Errorf("pod not found") - } - podPair := kubecontainer.PodPair{APIPod: pod, RunningPod: &runningPod} - kl.podKiller.KillPod(&podPair) - + klog.V(3).InfoS("Pod has been deleted and must be killed", "pod", klog.KObj(pod), "podUID", pod.UID) + kl.podWorkers.UpdatePod(UpdatePodOptions{ + Pod: pod, + UpdateType: kubetypes.SyncPodKill, + }) // We leave the volume/directory cleanup to the periodic cleanup routine. return nil } @@ -1835,7 +1929,7 @@ func (kl *Kubelet) canAdmitPod(pods []*v1.Pod, pod *v1.Pod) (bool, string, strin func (kl *Kubelet) canRunPod(pod *v1.Pod) lifecycle.PodAdmitResult { attrs := &lifecycle.PodAdmitAttributes{Pod: pod} // Get "OtherPods". Rejected pods are failed, so only include admitted pods that are alive. - attrs.OtherPods = kl.filterOutTerminatedPods(kl.podManager.GetPods()) + attrs.OtherPods = kl.GetActivePods() for _, handler := range kl.softAdmitHandlers { if result := handler.Admit(attrs); !result.Admit { @@ -2025,10 +2119,16 @@ func (kl *Kubelet) syncLoopIteration(configCh <-chan kubetypes.PodUpdate, handle // skip housekeeping, as we may accidentally delete pods from unready sources. klog.V(4).InfoS("SyncLoop (housekeeping, skipped): sources aren't ready yet") } else { + start := time.Now() klog.V(4).InfoS("SyncLoop (housekeeping)") if err := handler.HandlePodCleanups(); err != nil { klog.ErrorS(err, "Failed cleaning pods") } + duration := time.Since(start) + if duration > housekeepingWarningDuration { + klog.ErrorS(fmt.Errorf("housekeeping took too long"), "Housekeeping took longer than 15s", "seconds", duration.Seconds()) + } + klog.V(4).InfoS("SyncLoop (housekeeping) end") } } return true @@ -2049,31 +2149,12 @@ func handleProbeSync(kl *Kubelet, update proberesults.Update, handler SyncHandle // dispatchWork starts the asynchronous sync of the pod in a pod worker. // If the pod has completed termination, dispatchWork will perform no action. 
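deletePod above shows the pattern the rest of the patch repeats: with the podKiller gone, requesting termination means sending one more update to the pod worker, which then walks the terminating and terminated phases on its own. A minimal sketch, assuming the PodWorkers interface and option types defined in pkg/kubelet/pod_workers.go (the helper name is illustrative):

	package kubelet

	import (
		v1 "k8s.io/api/core/v1"

		kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
	)

	// requestPodKill is illustrative: termination is delivered as a SyncPodKill
	// update, optionally with a grace-period override.
	func requestPodKill(workers PodWorkers, pod *v1.Pod, gracePeriodOverride *int64) {
		workers.UpdatePod(UpdatePodOptions{
			Pod:        pod,
			UpdateType: kubetypes.SyncPodKill,
			KillPodOptions: &KillPodOptions{
				PodTerminationGracePeriodSecondsOverride: gracePeriodOverride,
			},
		})
	}
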
func (kl *Kubelet) dispatchWork(pod *v1.Pod, syncType kubetypes.SyncPodType, mirrorPod *v1.Pod, start time.Time) { - // check whether we are ready to delete the pod from the API server (all status up to date) - containersTerminal, podWorkerTerminal := kl.podAndContainersAreTerminal(pod) - if pod.DeletionTimestamp != nil && containersTerminal { - klog.V(4).InfoS("Pod has completed execution and should be deleted from the API server", "pod", klog.KObj(pod), "syncType", syncType) - kl.statusManager.TerminatePod(pod) - return - } - - // optimization: avoid invoking the pod worker if no further changes are possible to the pod definition - // (i.e. the pod has completed and its containers have been terminated) - if podWorkerTerminal && containersTerminal { - klog.V(4).InfoS("Pod has completed and its containers have been terminated, ignoring remaining sync work", "pod", klog.KObj(pod), "syncType", syncType) - return - } - // Run the sync in an async worker. - kl.podWorkers.UpdatePod(&UpdatePodOptions{ + kl.podWorkers.UpdatePod(UpdatePodOptions{ Pod: pod, MirrorPod: mirrorPod, UpdateType: syncType, - OnCompleteFunc: func(err error) { - if err != nil { - metrics.PodWorkerDuration.WithLabelValues(syncType.String()).Observe(metrics.SinceInSeconds(start)) - } - }, + StartTime: start, }) // Note the number of containers for new pods. if syncType == kubetypes.SyncPodCreate { @@ -2109,10 +2190,13 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) { continue } - if !kl.podIsTerminated(pod) { - // Only go through the admission process if the pod is not - // terminated. - + // Only go through the admission process if the pod is not requested + // for termination by another part of the kubelet. If the pod is already + // using resources (previously admitted), the pod worker is going to be + // shutting it down. If the pod hasn't started yet, we know that when + // the pod worker is invoked it will also avoid setting up the pod, so + // we simply avoid doing any work. + if !kl.podWorkers.IsPodTerminationRequested(pod.UID) { // We failed pods that we rejected, so activePods include all admitted // pods that are alive. activePods := kl.filterOutTerminatedPods(existingPods) @@ -2286,13 +2370,8 @@ func (kl *Kubelet) ListenAndServePodResources() { // Delete the eligible dead container instances in a pod. Depending on the configuration, the latest dead containers may be kept around. func (kl *Kubelet) cleanUpContainersInPod(podID types.UID, exitedContainerID string) { if podStatus, err := kl.podCache.Get(podID); err == nil { - removeAll := false - if syncedPod, ok := kl.podManager.GetPodByUID(podID); ok { - // generate the api status using the cached runtime status to get up-to-date ContainerStatuses - apiPodStatus := kl.generateAPIPodStatus(syncedPod, podStatus) - // When an evicted or deleted pod has already synced, all containers can be removed. - removeAll = eviction.PodIsEvicted(syncedPod.Status) || (syncedPod.DeletionTimestamp != nil && notRunning(apiPodStatus.ContainerStatuses)) - } + // When an evicted or deleted pod has already synced, all containers can be removed. 
+ removeAll := kl.podWorkers.ShouldPodContentBeRemoved(podID) kl.containerDeletor.deleteContainersInPod(exitedContainerID, podStatus, removeAll) } } diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go index f78fde8ee3c..eb43d953b97 100644 --- a/pkg/kubelet/kubelet_pods.go +++ b/pkg/kubelet/kubelet_pods.go @@ -30,7 +30,6 @@ import ( "runtime" "sort" "strings" - "sync" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" @@ -54,7 +53,6 @@ import ( "k8s.io/kubernetes/pkg/kubelet/cri/streaming/portforward" remotecommandserver "k8s.io/kubernetes/pkg/kubelet/cri/streaming/remotecommand" "k8s.io/kubernetes/pkg/kubelet/envvars" - "k8s.io/kubernetes/pkg/kubelet/eviction" "k8s.io/kubernetes/pkg/kubelet/images" "k8s.io/kubernetes/pkg/kubelet/status" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" @@ -71,11 +69,6 @@ import ( const ( managedHostsHeader = "# Kubernetes-managed hosts file.\n" managedHostsHeaderWithHostNetwork = "# Kubernetes-managed hosts file (host network).\n" - - // Capacity of the channel for storing pods to kill. A small number should - // suffice because a goroutine is dedicated to check the channel and does - // not block on anything else. - podKillingChannelCapacity = 50 ) // Container state reason list @@ -99,7 +92,9 @@ func (kl *Kubelet) listPodsFromDisk() ([]types.UID, error) { return pods, nil } -// GetActivePods returns non-terminal pods +// GetActivePods returns pods that may have a running container (a +// terminated pod is one that is known to have no running containers and +// will not get any more). func (kl *Kubelet) GetActivePods() []*v1.Pod { allPods := kl.podManager.GetPods() activePods := kl.filterOutTerminatedPods(allPods) @@ -837,18 +832,11 @@ func containerResourceRuntimeValue(fs *v1.ResourceFieldSelector, pod *v1.Pod, co return resource.ExtractResourceValueByContainerName(fs, pod, containerName) } -// One of the following arguments must be non-nil: runningPod, status. -func (kl *Kubelet) killPod(pod *v1.Pod, runningPod *kubecontainer.Pod, status *kubecontainer.PodStatus, gracePeriodOverride *int64) error { - var p kubecontainer.Pod - if runningPod != nil { - p = *runningPod - } else if status != nil { - p = kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), status) - } else { - return fmt.Errorf("one of the two arguments must be non-nil: runningPod, status") - } - - // Call the container runtime KillPod method which stops all running containers of the pod +// killPod instructs the container runtime to kill the pod. This method requires that +// the pod status contains the result of the last syncPod, otherwise it may fail to +// terminate newly created containers and sandboxes. +func (kl *Kubelet) killPod(pod *v1.Pod, p kubecontainer.Pod, gracePeriodOverride *int64) error { + // Call the container runtime KillPod method which stops all known running containers of the pod if err := kl.containerRuntime.KillPod(pod, p, gracePeriodOverride); err != nil { return err } @@ -896,83 +884,38 @@ func (kl *Kubelet) getPullSecretsForPod(pod *v1.Pod) []v1.Secret { return pullSecrets } -// podStatusIsTerminal reports when the specified pod has no running containers or is no longer accepting -// spec changes. -func (kl *Kubelet) podAndContainersAreTerminal(pod *v1.Pod) (containersTerminal, podWorkerTerminal bool) { - // Check the cached pod status which was set after the last sync. - status, ok := kl.statusManager.GetPodStatus(pod.UID) - if !ok { - // If there is no cached status, use the status from the - // apiserver. 
This is useful if kubelet has recently been - // restarted. - status = pod.Status +func countRunningContainerStatus(status v1.PodStatus) int { + var runningContainers int + for _, c := range status.InitContainerStatuses { + if c.State.Running != nil { + runningContainers++ + } } - // A pod transitions into failed or succeeded from either container lifecycle (RestartNever container - // fails) or due to external events like deletion or eviction. A terminal pod *should* have no running - // containers, but to know that the pod has completed its lifecycle you must wait for containers to also - // be terminal. - containersTerminal = notRunning(status.ContainerStatuses) - // The kubelet must accept config changes from the pod spec until it has reached a point where changes would - // have no effect on any running container. - podWorkerTerminal = status.Phase == v1.PodFailed || status.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && containersTerminal) - return -} - -// podIsTerminated returns true if the provided pod is in a terminal phase ("Failed", "Succeeded") or -// has been deleted and has no running containers. This corresponds to when a pod must accept changes to -// its pod spec (e.g. terminating containers allow grace period to be shortened). -func (kl *Kubelet) podIsTerminated(pod *v1.Pod) bool { - _, podWorkerTerminal := kl.podAndContainersAreTerminal(pod) - return podWorkerTerminal -} - -// IsPodTerminated returns true if the pod with the provided UID is in a terminal phase ("Failed", -// "Succeeded") or has been deleted and has no running containers. This corresponds to when a pod must -// accept changes to its pod spec (e.g. terminating containers allow grace period to be shortened) -func (kl *Kubelet) IsPodTerminated(uid types.UID) bool { - pod, podFound := kl.podManager.GetPodByUID(uid) - if !podFound { - return true + for _, c := range status.ContainerStatuses { + if c.State.Running != nil { + runningContainers++ + } } - return kl.podIsTerminated(pod) -} - -// IsPodDeleted returns true if the pod is deleted. For the pod to be deleted, either: -// 1. The pod object is deleted -// 2. The pod's status is evicted -// 3. The pod's deletion timestamp is set, and containers are not running -func (kl *Kubelet) IsPodDeleted(uid types.UID) bool { - pod, podFound := kl.podManager.GetPodByUID(uid) - if !podFound { - return true + for _, c := range status.EphemeralContainerStatuses { + if c.State.Running != nil { + runningContainers++ + } } - status, statusFound := kl.statusManager.GetPodStatus(pod.UID) - if !statusFound { - status = pod.Status - } - return eviction.PodIsEvicted(status) || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses)) + return runningContainers } // PodResourcesAreReclaimed returns true if all required node-level resources that a pod was consuming have // been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the API server. 
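The reworked killPod in the hunk above takes the runtime's view of the pod directly, so call sites that only hold a cached kubecontainer.PodStatus convert it first. A condensed sketch of that recurring caller pattern (the helper name is illustrative):

	package kubelet

	import (
		v1 "k8s.io/api/core/v1"

		kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	)

	// stopPodSketch condenses the pattern used throughout this patch: derive a
	// running-pod view from the cached status, then stop every container the
	// runtime reported in that status.
	func (kl *Kubelet) stopPodSketch(pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriodOverride *int64) error {
		p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus)
		return kl.killPod(pod, p, gracePeriodOverride)
	}
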
func (kl *Kubelet) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bool { - if !notRunning(status.ContainerStatuses) { + if kl.podWorkers.CouldHaveRunningContainers(pod.UID) { // We shouldn't delete pods that still have running containers klog.V(3).InfoS("Pod is terminated, but some containers are still running", "pod", klog.KObj(pod)) return false } - // pod's containers should be deleted - runtimeStatus, err := kl.podCache.Get(pod.UID) - if err != nil { - klog.V(3).InfoS("Pod is terminated, Error getting runtimeStatus from the podCache", "pod", klog.KObj(pod), "err", err) - return false - } - if len(runtimeStatus.ContainerStatuses) > 0 { - var statusStr string - for _, status := range runtimeStatus.ContainerStatuses { - statusStr += fmt.Sprintf("%+v ", *status) - } - klog.V(3).InfoS("Pod is terminated, but some containers have not been cleaned up", "pod", klog.KObj(pod), "status", statusStr) + if count := countRunningContainerStatus(status); count > 0 { + // We shouldn't delete pods until the reported pod status contains no more running containers (the previous + // check ensures no more status can be generated, this check verifies we have seen enough of the status) + klog.V(3).InfoS("Pod is terminated, but some container status has not yet been reported", "pod", klog.KObj(pod), "running", count) return false } if kl.podVolumesExist(pod.UID) && !kl.keepTerminatedPodVolumes { @@ -987,6 +930,12 @@ func (kl *Kubelet) PodResourcesAreReclaimed(pod *v1.Pod, status v1.PodStatus) bo return false } } + + // Note: we leave pod containers to be reclaimed in the background since dockershim requires the + // container for retrieving logs and we want to make sure logs are available until the pod is + // physically deleted. + + klog.V(3).InfoS("Pod is terminated and all resources are reclaimed", "pod", klog.KObj(pod)) return true } @@ -999,23 +948,12 @@ func (kl *Kubelet) podResourcesAreReclaimed(pod *v1.Pod) bool { return kl.PodResourcesAreReclaimed(pod, status) } -// notRunning returns true if every status is terminated or waiting, or the status list -// is empty. -func notRunning(statuses []v1.ContainerStatus) bool { - for _, status := range statuses { - if status.State.Terminated == nil && status.State.Waiting == nil { - return false - } - } - return true -} - -// filterOutTerminatedPods returns the given pods which the status manager -// does not consider failed or succeeded. 
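PodResourcesAreReclaimed and filterOutTerminatedPods now ask the pod worker rather than inferring lifecycle from cached status. A rough sketch of which query gates which cleanup decision, assuming the query methods the pod worker exposes in this patch (the helper is illustrative):

	package kubelet

	import "k8s.io/apimachinery/pkg/types"

	// cleanupDecisionSketch shows how cleanup loops delegate their lifecycle
	// questions to the pod worker instead of re-deriving them.
	func cleanupDecisionSketch(workers PodWorkers, uid types.UID) (canReclaim, removeAllContainers bool) {
		// API deletion and volume teardown wait until no container could be running.
		canReclaim = !workers.CouldHaveRunningContainers(uid)
		// Dead containers and logs may be dropped once the pod content is no longer needed.
		removeAllContainers = workers.ShouldPodContentBeRemoved(uid)
		return canReclaim, removeAllContainers
	}
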
+// filterOutTerminatedPods returns the pods that could still have running +// containers func (kl *Kubelet) filterOutTerminatedPods(pods []*v1.Pod) []*v1.Pod { var filteredPods []*v1.Pod for _, p := range pods { - if kl.podIsTerminated(p) { + if !kl.podWorkers.CouldHaveRunningContainers(p.UID) { continue } filteredPods = append(filteredPods, p) @@ -1040,9 +978,9 @@ func (kl *Kubelet) removeOrphanedPodStatuses(pods []*v1.Pod, mirrorPods []*v1.Po // If pod killing is done, podManager.DeleteMirrorPod() is called to delete mirror pod // from the API server func (kl *Kubelet) deleteOrphanedMirrorPods() { - podFullNames := kl.podManager.GetOrphanedMirrorPodNames() - for _, podFullname := range podFullNames { - if !kl.podKiller.IsPodPendingTerminationByPodName(podFullname) { + mirrorPods := kl.podManager.GetOrphanedMirrorPodNames() + for _, podFullname := range mirrorPods { + if !kl.podWorkers.IsPodForMirrorPodTerminatingByFullName(podFullname) { _, err := kl.podManager.DeleteMirrorPod(podFullname, nil) if err != nil { klog.ErrorS(err, "Encountered error when deleting mirror pod", "podName", podFullname) @@ -1055,7 +993,8 @@ func (kl *Kubelet) deleteOrphanedMirrorPods() { // HandlePodCleanups performs a series of cleanup work, including terminating // pod workers, killing unwanted pods, and removing orphaned volumes/pod -// directories. +// directories. No config changes are sent to pod workers while this method +// is executing which means no new pods can appear. // NOTE: This function is executed by the main sync loop, so it // should not contain any blocking calls. func (kl *Kubelet) HandlePodCleanups() error { @@ -1086,43 +1025,92 @@ func (kl *Kubelet) HandlePodCleanups() error { // to the apiserver, it could still restart the terminated pod (even // though the pod was not considered terminated by the apiserver). // These two conditions could be alleviated by checkpointing kubelet. - activePods := kl.filterOutTerminatedPods(allPods) - desiredPods := make(map[types.UID]sets.Empty) - for _, pod := range activePods { - desiredPods[pod.UID] = sets.Empty{} + // Stop the workers for terminated pods not in the config source + klog.V(3).InfoS("Clean up pod workers for terminated pods") + workingPods := kl.podWorkers.SyncKnownPods(allPods) + + allPodsByUID := make(map[types.UID]*v1.Pod) + for _, pod := range allPods { + allPodsByUID[pod.UID] = pod } - // Stop the workers for no-longer existing pods. - kl.podWorkers.ForgetNonExistingPodWorkers(desiredPods) - kl.probeManager.CleanupPods(desiredPods) - runningPods, err := kl.runtimeCache.GetPods() + // Identify the set of pods that have workers, which should be all pods + // from config that are not terminated, as well as any terminating pods + // that have already been removed from config. Pods that are terminating + // will be added to possiblyRunningPods, to prevent overly aggressive + // cleanup of pod cgroups. + runningPods := make(map[types.UID]sets.Empty) + possiblyRunningPods := make(map[types.UID]sets.Empty) + for uid, sync := range workingPods { + switch sync { + case SyncPodWork: + runningPods[uid] = struct{}{} + possiblyRunningPods[uid] = struct{}{} + case TerminatingPodWork: + possiblyRunningPods[uid] = struct{}{} + } + } + + // Stop probing pods that are not running + klog.V(3).InfoS("Clean up probes for terminating and terminated pods") + kl.probeManager.CleanupPods(runningPods) + + // Terminate any pods that are observed in the runtime but not + // present in the list of known running pods from config. 
+ runningRuntimePods, err := kl.runtimeCache.GetPods() if err != nil { klog.ErrorS(err, "Error listing containers") return err } - for _, pod := range runningPods { - if _, found := desiredPods[pod.ID]; !found { - kl.podKiller.KillPod(&kubecontainer.PodPair{APIPod: nil, RunningPod: pod}) + for _, runningPod := range runningRuntimePods { + switch workType, ok := workingPods[runningPod.ID]; { + case ok && workType == SyncPodWork, ok && workType == TerminatingPodWork: + // if the pod worker is already in charge of this pod, we don't need to do anything + continue + default: + // If the pod isn't in the set that should be running and isn't already terminating, terminate + // now. This termination is aggressive because all known pods should already be in a known state + // (i.e. a removed static pod should already be terminating), so these are pods that were + // orphaned due to kubelet restart or bugs. Since housekeeping blocks other config changes, we + // know that another pod wasn't started in the background so we are safe to terminate the + // unknown pods. + if _, ok := allPodsByUID[runningPod.ID]; !ok { + klog.V(3).InfoS("Clean up orphaned pod containers", "podUID", runningPod.ID) + one := int64(1) + kl.podWorkers.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodKill, + RunningPod: runningPod, + KillPodOptions: &KillPodOptions{ + PodTerminationGracePeriodSecondsOverride: &one, + }, + }) + } } } + // Remove orphaned pod statuses not in the total list of known config pods + klog.V(3).InfoS("Clean up orphaned pod statuses") kl.removeOrphanedPodStatuses(allPods, mirrorPods) // Note that we just killed the unwanted pods. This may not have reflected // in the cache. We need to bypass the cache to get the latest set of // running pods to clean up the volumes. // TODO: Evaluate the performance impact of bypassing the runtime cache. - runningPods, err = kl.containerRuntime.GetPods(false) + runningRuntimePods, err = kl.containerRuntime.GetPods(false) if err != nil { klog.ErrorS(err, "Error listing containers") return err } - // Remove any orphaned volumes. - // Note that we pass all pods (including terminated pods) to the function, - // so that we don't remove volumes associated with terminated but not yet - // deleted pods. - err = kl.cleanupOrphanedPodDirs(allPods, runningPods) + // Remove orphaned volumes from pods that are known not to have any + // containers. Note that we pass all pods (including terminated pods) to + // the function, so that we don't remove volumes associated with terminated + // but not yet deleted pods. + // TODO: this method could more aggressively cleanup terminated pods + // in the future (volumes, mount dirs, logs, and containers could all be + // better separated) + klog.V(3).InfoS("Clean up orphaned pod directories") + err = kl.cleanupOrphanedPodDirs(allPods, runningRuntimePods) if err != nil { // We want all cleanup tasks to be run even if one of them failed. So // we just log an error here and continue other cleanup tasks. @@ -1130,162 +1118,23 @@ func (kl *Kubelet) HandlePodCleanups() error { klog.ErrorS(err, "Failed cleaning up orphaned pod directories") } - // Remove any orphaned mirror pods. + // Remove any orphaned mirror pods (mirror pods are tracked by name via the + // pod worker) + klog.V(3).InfoS("Clean up orphaned mirror pods") kl.deleteOrphanedMirrorPods() - // Remove any cgroups in the hierarchy for pods that are no longer running. 
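HandlePodCleanups now derives its running and possibly-running sets from SyncKnownPods instead of pod phase, which is what keeps cgroup and volume cleanup from racing with terminating pods. A compressed sketch of that classification, assuming SyncKnownPods returns a map keyed by UID to the work-type values used above (the type name PodWorkType is assumed here):

	package kubelet

	import (
		"k8s.io/apimachinery/pkg/types"
		"k8s.io/apimachinery/pkg/util/sets"
	)

	// classifyWorkersSketch mirrors the loop above: syncing pods are running;
	// syncing or terminating pods may still own containers, cgroups, and volumes.
	func classifyWorkersSketch(workingPods map[types.UID]PodWorkType) (running, possiblyRunning map[types.UID]sets.Empty) {
		running = make(map[types.UID]sets.Empty)
		possiblyRunning = make(map[types.UID]sets.Empty)
		for uid, work := range workingPods {
			switch work {
			case SyncPodWork:
				running[uid] = sets.Empty{}
				possiblyRunning[uid] = sets.Empty{}
			case TerminatingPodWork:
				possiblyRunning[uid] = sets.Empty{}
			}
		}
		return running, possiblyRunning
	}
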
+ // Remove any cgroups in the hierarchy for pods that are definitely no longer + // running (not in the container runtime). if kl.cgroupsPerQOS { pcm := kl.containerManager.NewPodContainerManager() - kl.cleanupOrphanedPodCgroups(pcm, cgroupPods, activePods) + klog.V(3).InfoS("Clean up orphaned pod cgroups") + kl.cleanupOrphanedPodCgroups(pcm, cgroupPods, possiblyRunningPods) } kl.backOff.GC() return nil } -// PodKiller handles requests for killing pods -type PodKiller interface { - // KillPod receives pod speficier representing the pod to kill - KillPod(podPair *kubecontainer.PodPair) - // PerformPodKillingWork performs the actual pod killing work via calling CRI - // It returns after its Close() func is called and all outstanding pod killing requests are served - PerformPodKillingWork() - // Close ensures that after it's called, then this pod killer wouldn't accept any more pod killing requests - Close() - // IsPodPendingTerminationByPodName checks whether any pod for the given full pod name is pending termination (thread safe) - IsPodPendingTerminationByPodName(podFullname string) bool - // IsPodPendingTerminationByUID checks whether the mirror pod for the given uid is pending termination (thread safe) - IsPodPendingTerminationByUID(uid types.UID) bool -} - -// podKillerWithChannel is an implementation of PodKiller which receives pod killing requests via channel -type podKillerWithChannel struct { - // Channel for getting pods to kill. - podKillingCh chan *kubecontainer.PodPair - // lock for synchronization between HandlePodCleanups and pod killer - podKillingLock *sync.RWMutex - // mirrorPodTerminationMap keeps track of the progress of mirror pod termination - // The key is the UID of the pod and the value is the full name of the pod - mirrorPodTerminationMap map[string]string - // podTerminationMap keeps track of the progress of pod termination. 
- // The key is the UID of the pod and the value is the full name of the pod - podTerminationMap map[string]string - // killPod is the func which invokes runtime to kill the pod - killPod func(pod *v1.Pod, runningPod *kubecontainer.Pod, status *kubecontainer.PodStatus, gracePeriodOverride *int64) error -} - -// NewPodKiller returns a functional PodKiller -func NewPodKiller(kl *Kubelet) PodKiller { - podKiller := &podKillerWithChannel{ - podKillingCh: make(chan *kubecontainer.PodPair, podKillingChannelCapacity), - podKillingLock: &sync.RWMutex{}, - mirrorPodTerminationMap: make(map[string]string), - podTerminationMap: make(map[string]string), - killPod: kl.killPod, - } - return podKiller -} - -// IsPodPendingTerminationByUID checks whether the pod for the given uid is pending termination -func (pk *podKillerWithChannel) IsPodPendingTerminationByUID(uid types.UID) bool { - pk.podKillingLock.RLock() - defer pk.podKillingLock.RUnlock() - if _, ok := pk.podTerminationMap[string(uid)]; ok { - return ok - } - if _, ok := pk.mirrorPodTerminationMap[string(uid)]; ok { - return ok - } - return false -} - -// IsPodPendingTerminationByPodName checks whether the given pod is in grace period of termination -func (pk *podKillerWithChannel) IsPodPendingTerminationByPodName(podFullname string) bool { - pk.podKillingLock.RLock() - defer pk.podKillingLock.RUnlock() - for _, name := range pk.mirrorPodTerminationMap { - if name == podFullname { - return true - } - } - for _, name := range pk.podTerminationMap { - if name == podFullname { - return true - } - } - return false -} - -func (pk *podKillerWithChannel) markPodTerminated(uid string) { - klog.V(4).InfoS("Marking pod termination", "podUID", uid) - pk.podKillingLock.Lock() - defer pk.podKillingLock.Unlock() - delete(pk.mirrorPodTerminationMap, uid) - delete(pk.podTerminationMap, uid) -} - -// KillPod sends pod killing request to the killer after marks the pod -// unless the given pod has been marked to be killed -func (pk *podKillerWithChannel) KillPod(podPair *kubecontainer.PodPair) { - pk.podKillingLock.Lock() - defer pk.podKillingLock.Unlock() - var apiPodExists bool - var runningPodExists bool - if podPair.APIPod != nil { - uid := string(podPair.APIPod.UID) - _, apiPodExists = pk.mirrorPodTerminationMap[uid] - if !apiPodExists { - fullname := kubecontainer.GetPodFullName(podPair.APIPod) - klog.V(4).InfoS("Marking api pod pending termination", "podName", fullname, "podUID", uid) - pk.mirrorPodTerminationMap[uid] = fullname - } - } - if podPair.RunningPod != nil { - uid := string(podPair.RunningPod.ID) - _, runningPodExists = pk.podTerminationMap[uid] - if !runningPodExists { - fullname := podPair.RunningPod.Name - klog.V(4).InfoS("Marking running pod pending termination", "podName", fullname, "podUID", uid) - pk.podTerminationMap[uid] = fullname - } - } - if apiPodExists || runningPodExists { - if apiPodExists && runningPodExists { - klog.V(4).InfoS("Api pod and running pod are pending termination", "apiPodUID", podPair.APIPod.UID, "runningPodUID", podPair.RunningPod.ID) - } else if apiPodExists { - klog.V(4).InfoS("Api pod is pending termination", "podUID", podPair.APIPod.UID) - } else { - klog.V(4).InfoS("Running pod is pending termination", "podUID", podPair.RunningPod.ID) - } - return - } - // Limit to one request per pod - pk.podKillingCh <- podPair -} - -// Close closes the channel through which requests are delivered -func (pk *podKillerWithChannel) Close() { - close(pk.podKillingCh) -} - -// PerformPodKillingWork launches a goroutine to kill a 
pod received from the channel if -// another goroutine isn't already in action. -func (pk *podKillerWithChannel) PerformPodKillingWork() { - for podPair := range pk.podKillingCh { - runningPod := podPair.RunningPod - apiPod := podPair.APIPod - - go func(apiPod *v1.Pod, runningPod *kubecontainer.Pod) { - klog.V(2).InfoS("Killing unwanted pod", "podName", runningPod.Name) - err := pk.killPod(apiPod, runningPod, nil, nil) - if err != nil { - klog.ErrorS(err, "Failed killing the pod", "podName", runningPod.Name) - } - pk.markPodTerminated(string(runningPod.ID)) - }(apiPod, runningPod) - } -} - // validateContainerLogStatus returns the container ID for the desired container to retrieve logs for, based on the state // of the container. The previous flag will only return the logs for the last terminated container, otherwise, the current // running container is preferred over a previous termination. If info about the container is not available then a specific @@ -1981,25 +1830,14 @@ func (kl *Kubelet) GetPortForward(podName, podNamespace string, podUID types.UID // cleanupOrphanedPodCgroups removes cgroups that should no longer exist. // it reconciles the cached state of cgroupPods with the specified list of runningPods -func (kl *Kubelet) cleanupOrphanedPodCgroups(pcm cm.PodContainerManager, cgroupPods map[types.UID]cm.CgroupName, activePods []*v1.Pod) { - // Add all running pods to the set that we want to preserve - podSet := sets.NewString() - for _, pod := range activePods { - podSet.Insert(string(pod.UID)) - } - +func (kl *Kubelet) cleanupOrphanedPodCgroups(pcm cm.PodContainerManager, cgroupPods map[types.UID]cm.CgroupName, possiblyRunningPods map[types.UID]sets.Empty) { // Iterate over all the found pods to verify if they should be running for uid, val := range cgroupPods { // if the pod is in the running set, its not a candidate for cleanup - if podSet.Has(string(uid)) { + if _, ok := possiblyRunningPods[uid]; ok { continue } - // if the pod is within termination grace period, we shouldn't cleanup the underlying cgroup - if kl.podKiller.IsPodPendingTerminationByUID(uid) { - klog.V(3).InfoS("Pod is pending termination", "podUID", uid) - continue - } // If volumes have not been unmounted/detached, do not delete the cgroup // so any memory backed volumes don't have their charges propagated to the // parent croup. If the volumes still exist, reduce the cpu shares for any diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index 6786e55a460..8f78d672659 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -17,9 +17,11 @@ limitations under the License. package kubelet import ( + "context" "fmt" "io/ioutil" "os" + "reflect" "sort" "strconv" "testing" @@ -119,7 +121,6 @@ type TestKubelet struct { func (tk *TestKubelet) Cleanup() { if tk.kubelet != nil { - tk.kubelet.podKiller.Close() os.RemoveAll(tk.kubelet.rootDirectory) tk.kubelet = nil } @@ -293,12 +294,10 @@ func newTestKubeletWithImageList( fakeClock := clock.NewFakeClock(time.Now()) kubelet.backOff = flowcontrol.NewBackOff(time.Second, time.Minute) kubelet.backOff.Clock = fakeClock - kubelet.podKiller = NewPodKiller(kubelet) - go kubelet.podKiller.PerformPodKillingWork() kubelet.resyncInterval = 10 * time.Second kubelet.workQueue = queue.NewBasicWorkQueue(fakeClock) // Relist period does not affect the tests. 
- kubelet.pleg = pleg.NewGenericPLEG(fakeRuntime, 100, time.Hour, nil, clock.RealClock{}) + kubelet.pleg = pleg.NewGenericPLEG(fakeRuntime, 100, time.Hour, kubelet.podCache, clock.RealClock{}) kubelet.clock = fakeClock nodeRef := &v1.ObjectReference{ @@ -340,7 +339,7 @@ func newTestKubeletWithImageList( controllerAttachDetachEnabled, kubelet.nodeName, kubelet.podManager, - kubelet.statusManager, + kubelet.podWorkers, fakeKubeClient, kubelet.volumePluginMgr, fakeRuntime, @@ -450,10 +449,12 @@ func TestHandlePodCleanupsPerQOS(t *testing.T) { // mark the pod as killed (within this test case). kubelet.HandlePodCleanups() - time.Sleep(2 * time.Second) // assert that unwanted pods were killed - fakeRuntime.AssertKilledPods([]string{"12345678"}) + if actual, expected := kubelet.podWorkers.(*fakePodWorkers).triggeredDeletion, []types.UID{"12345678"}; !reflect.DeepEqual(actual, expected) { + t.Fatalf("expected %v to be deleted, got %v", expected, actual) + } + fakeRuntime.AssertKilledPods([]string(nil)) // simulate Runtime.KillPod fakeRuntime.PodList = nil @@ -461,7 +462,6 @@ func TestHandlePodCleanupsPerQOS(t *testing.T) { kubelet.HandlePodCleanups() kubelet.HandlePodCleanups() kubelet.HandlePodCleanups() - time.Sleep(2 * time.Second) destroyCount := 0 err := wait.Poll(100*time.Millisecond, 10*time.Second, func() (bool, error) { @@ -488,9 +488,11 @@ func TestDispatchWorkOfCompletedPod(t *testing.T) { testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */) defer testKubelet.Cleanup() kubelet := testKubelet.kubelet + var got bool kubelet.podWorkers = &fakePodWorkers{ - syncPodFn: func(options syncPodOptions) error { - return fmt.Errorf("should ignore completed pod %q", options.pod.Name) + syncPodFn: func(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) error { + got = true + return nil }, cache: kubelet.podCache, t: t, @@ -554,6 +556,10 @@ func TestDispatchWorkOfCompletedPod(t *testing.T) { } for _, pod := range pods { kubelet.dispatchWork(pod, kubetypes.SyncPodSync, nil, time.Now()) + if !got { + t.Errorf("Should not skip completed pod %q", pod.Name) + } + got = false } } @@ -563,7 +569,7 @@ func TestDispatchWorkOfActivePod(t *testing.T) { kubelet := testKubelet.kubelet var got bool kubelet.podWorkers = &fakePodWorkers{ - syncPodFn: func(options syncPodOptions) error { + syncPodFn: func(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) error { got = true return nil }, @@ -631,10 +637,12 @@ func TestHandlePodCleanups(t *testing.T) { kubelet := testKubelet.kubelet kubelet.HandlePodCleanups() - time.Sleep(2 * time.Second) - // assert that unwanted pods were killed - fakeRuntime.AssertKilledPods([]string{"12345678"}) + // assert that unwanted pods were queued to kill + if actual, expected := kubelet.podWorkers.(*fakePodWorkers).triggeredDeletion, []types.UID{"12345678"}; !reflect.DeepEqual(actual, expected) { + t.Fatalf("expected %v to be deleted, got %v", expected, actual) + } + fakeRuntime.AssertKilledPods([]string(nil)) } func TestHandlePodRemovesWhenSourcesAreReady(t *testing.T) { @@ -667,44 +675,17 @@ func TestHandlePodRemovesWhenSourcesAreReady(t *testing.T) { time.Sleep(2 * time.Second) // Sources are not ready yet. Don't remove any pods. 
- fakeRuntime.AssertKilledPods(nil) + if expect, actual := []types.UID(nil), kubelet.podWorkers.(*fakePodWorkers).triggeredDeletion; !reflect.DeepEqual(expect, actual) { + t.Fatalf("expected %v kills, got %v", expect, actual) + } ready = true kubelet.HandlePodRemoves(pods) time.Sleep(2 * time.Second) // Sources are ready. Remove unwanted pods. - fakeRuntime.AssertKilledPods([]string{"1"}) -} - -func TestKillPodFollwedByIsPodPendingTermination(t *testing.T) { - testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */) - defer testKubelet.Cleanup() - - pod := &kubecontainer.Pod{ - ID: "12345678", - Name: "foo", - Namespace: "new", - Containers: []*kubecontainer.Container{ - {Name: "bar"}, - }, - } - - fakeRuntime := testKubelet.fakeRuntime - fakeContainerManager := testKubelet.fakeContainerManager - fakeContainerManager.PodContainerManager.AddPodFromCgroups(pod) // add pod to mock cgroup - fakeRuntime.PodList = []*containertest.FakePod{ - {Pod: pod}, - } - - kl := testKubelet.kubelet - kl.podKiller.KillPod(&kubecontainer.PodPair{ - APIPod: nil, - RunningPod: pod, - }) - - if !(kl.podKiller.IsPodPendingTerminationByUID(pod.ID) || fakeRuntime.AssertKilledPods([]string{"12345678"})) { - t.Fatal("Race condition: When KillPod is complete, the pod should be pending termination or be killed") + if expect, actual := []types.UID{"1"}, kubelet.podWorkers.(*fakePodWorkers).triggeredDeletion; !reflect.DeepEqual(expect, actual) { + t.Fatalf("expected %v kills, got %v", expect, actual) } } @@ -726,6 +707,7 @@ func (nl testNodeLister) List(_ labels.Selector) (ret []*v1.Node, err error) { } func checkPodStatus(t *testing.T, kl *Kubelet, pod *v1.Pod, phase v1.PodPhase) { + t.Helper() status, found := kl.statusManager.GetPodStatus(pod.UID) require.True(t, found, "Status of pod %q is not found in the status map", pod.UID) require.Equal(t, phase, status.Phase) @@ -769,6 +751,10 @@ func TestHandlePortConflicts(t *testing.T) { // The newer pod should be rejected. notfittingPod := pods[0] fittingPod := pods[1] + kl.podWorkers.(*fakePodWorkers).running = map[types.UID]bool{ + pods[0].UID: true, + pods[1].UID: true, + } kl.HandlePodAdditions(pods) @@ -904,6 +890,10 @@ func TestHandleMemExceeded(t *testing.T) { // The newer pod should be rejected. 
notfittingPod := pods[0] fittingPod := pods[1] + kl.podWorkers.(*fakePodWorkers).running = map[types.UID]bool{ + pods[0].UID: true, + pods[1].UID: true, + } kl.HandlePodAdditions(pods) @@ -1230,11 +1220,7 @@ func TestCreateMirrorPod(t *testing.T) { pod.Annotations[kubetypes.ConfigSourceAnnotationKey] = "file" pods := []*v1.Pod{pod} kl.podManager.SetPods(pods) - err := kl.syncPod(syncPodOptions{ - pod: pod, - podStatus: &kubecontainer.PodStatus{}, - updateType: updateType, - }) + err := kl.syncPod(context.Background(), updateType, pod, nil, &kubecontainer.PodStatus{}) assert.NoError(t, err) podFullName := kubecontainer.GetPodFullName(pod) assert.True(t, manager.HasPod(podFullName), "Expected mirror pod %q to be created", podFullName) @@ -1266,12 +1252,7 @@ func TestDeleteOutdatedMirrorPod(t *testing.T) { pods := []*v1.Pod{pod, mirrorPod} kl.podManager.SetPods(pods) - err := kl.syncPod(syncPodOptions{ - pod: pod, - mirrorPod: mirrorPod, - podStatus: &kubecontainer.PodStatus{}, - updateType: kubetypes.SyncPodUpdate, - }) + err := kl.syncPod(context.Background(), kubetypes.SyncPodUpdate, pod, mirrorPod, &kubecontainer.PodStatus{}) assert.NoError(t, err) name := kubecontainer.GetPodFullName(pod) creates, deletes := manager.GetCounts(name) @@ -1309,17 +1290,41 @@ func TestDeleteOrphanedMirrorPods(t *testing.T) { }, }, }, + { + ObjectMeta: metav1.ObjectMeta{ + UID: "12345670", + Name: "pod3", + Namespace: "ns", + Annotations: map[string]string{ + kubetypes.ConfigSourceAnnotationKey: "api", + kubetypes.ConfigMirrorAnnotationKey: "mirror", + }, + }, + }, } kl.podManager.SetPods(orphanPods) + + // a static pod that is terminating will not be deleted + kl.podWorkers.(*fakePodWorkers).terminatingStaticPods = map[string]bool{ + kubecontainer.GetPodFullName(orphanPods[2]): true, + } + // Sync with an empty pod list to delete all mirror pods. 
kl.HandlePodCleanups() assert.Len(t, manager.GetPods(), 0, "Expected 0 mirror pods") - for _, pod := range orphanPods { + for i, pod := range orphanPods { name := kubecontainer.GetPodFullName(pod) creates, deletes := manager.GetCounts(name) - if creates != 0 || deletes != 1 { - t.Errorf("expected 0 creation and one deletion of %q, got %d, %d", name, creates, deletes) + switch i { + case 2: + if creates != 0 || deletes != 0 { + t.Errorf("expected 0 creation and 0 deletion of %q, got %d, %d", name, creates, deletes) + } + default: + if creates != 0 || deletes != 1 { + t.Errorf("expected 0 creation and one deletion of %q, got %d, %d", name, creates, deletes) + } } } } @@ -1404,20 +1409,12 @@ func TestNetworkErrorsWithoutHostNetwork(t *testing.T) { }) kubelet.podManager.SetPods([]*v1.Pod{pod}) - err := kubelet.syncPod(syncPodOptions{ - pod: pod, - podStatus: &kubecontainer.PodStatus{}, - updateType: kubetypes.SyncPodUpdate, - }) + err := kubelet.syncPod(context.Background(), kubetypes.SyncPodUpdate, pod, nil, &kubecontainer.PodStatus{}) assert.Error(t, err, "expected pod with hostNetwork=false to fail when network in error") pod.Annotations[kubetypes.ConfigSourceAnnotationKey] = kubetypes.FileSource pod.Spec.HostNetwork = true - err = kubelet.syncPod(syncPodOptions{ - pod: pod, - podStatus: &kubecontainer.PodStatus{}, - updateType: kubetypes.SyncPodUpdate, - }) + err = kubelet.syncPod(context.Background(), kubetypes.SyncPodUpdate, pod, nil, &kubecontainer.PodStatus{}) assert.NoError(t, err, "expected pod with hostNetwork=true to succeed when network in error") } @@ -1442,6 +1439,12 @@ func TestFilterOutTerminatedPods(t *testing.T) { pods[3].Status.Phase = v1.PodPending pods[4].Status.Phase = v1.PodRunning + kubelet.podWorkers.(*fakePodWorkers).running = map[types.UID]bool{ + pods[2].UID: true, + pods[3].UID: true, + pods[4].UID: true, + } + expected := []*v1.Pod{pods[2], pods[3], pods[4]} kubelet.podManager.SetPods(pods) actual := kubelet.filterOutTerminatedPods(pods) @@ -1586,17 +1589,12 @@ func TestDeletePodDirsForDeletedPods(t *testing.T) { } func syncAndVerifyPodDir(t *testing.T, testKubelet *TestKubelet, pods []*v1.Pod, podsToCheck []*v1.Pod, shouldExist bool) { + t.Helper() kl := testKubelet.kubelet kl.podManager.SetPods(pods) kl.HandlePodSyncs(pods) kl.HandlePodCleanups() - // The first time HandlePodCleanups() is run the pod is placed into the - // podKiller, and bypasses the pod directory cleanup. The pod is - // already killed in the second run to HandlePodCleanups() and will - // cleanup the directories. - time.Sleep(2 * time.Second) - kl.HandlePodCleanups() for i, pod := range podsToCheck { exist := dirExists(kl.getPodDir(pod.UID)) assert.Equal(t, shouldExist, exist, "directory of pod %d", i) @@ -1612,7 +1610,6 @@ func TestDoesNotDeletePodDirsForTerminatedPods(t *testing.T) { podWithUIDNameNs("12345679", "pod2", "ns"), podWithUIDNameNs("12345680", "pod3", "ns"), } - syncAndVerifyPodDir(t, testKubelet, pods, pods, true) // Pod 1 failed, and pod 2 succeeded. None of the pod directories should be // deleted. @@ -1634,6 +1631,7 @@ func TestDoesNotDeletePodDirsIfContainerIsRunning(t *testing.T) { // Sync once to create pod directory; confirm that the pod directory has // already been created. pods := []*v1.Pod{apiPod} + testKubelet.kubelet.podWorkers.(*fakePodWorkers).running = map[types.UID]bool{apiPod.UID: true} syncAndVerifyPodDir(t, testKubelet, pods, []*v1.Pod{apiPod}, true) // Pretend the pod is deleted from apiserver, but is still active on the node. 
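Aside for readers following the test changes above: the fake pod workers now carry explicit per-UID state (running, terminatingStaticPods, triggeredDeletion) that each test seeds before exercising a cleanup path, instead of relying on runtime phase alone. A minimal sketch of that idea follows; the function name and the explicit running map are assumptions of the sketch only, since the real kubelet consults the pod workers directly.

package kubelet

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
)

// filterPossiblyRunning keeps only the pods whose UID is still reported as
// possibly running, mirroring how the tests seed fakePodWorkers.running
// before calling filterOutTerminatedPods. The explicit map argument is an
// assumption for this sketch.
func filterPossiblyRunning(pods []*v1.Pod, running map[types.UID]bool) []*v1.Pod {
	filtered := make([]*v1.Pod, 0, len(pods))
	for _, pod := range pods {
		if running[pod.UID] {
			filtered = append(filtered, pod)
		}
	}
	return filtered
}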
@@ -1646,6 +1644,7 @@ func TestDoesNotDeletePodDirsIfContainerIsRunning(t *testing.T) { // should be removed. pods = []*v1.Pod{} testKubelet.fakeRuntime.PodList = []*containertest.FakePod{} + testKubelet.kubelet.podWorkers.(*fakePodWorkers).running = nil syncAndVerifyPodDir(t, testKubelet, pods, []*v1.Pod{apiPod}, false) } @@ -2233,7 +2232,7 @@ func TestGenerateAPIPodStatusInvokesPodSyncHandlers(t *testing.T) { require.Equal(t, "because", apiStatus.Message) } -func TestSyncPodKillPod(t *testing.T) { +func TestSyncTerminatingPodKillPod(t *testing.T) { testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */) defer testKubelet.Cleanup() kl := testKubelet.kubelet @@ -2246,21 +2245,12 @@ func TestSyncPodKillPod(t *testing.T) { } pods := []*v1.Pod{pod} kl.podManager.SetPods(pods) + podStatus := &kubecontainer.PodStatus{ID: pod.UID} gracePeriodOverride := int64(0) - err := kl.syncPod(syncPodOptions{ - pod: pod, - podStatus: &kubecontainer.PodStatus{}, - updateType: kubetypes.SyncPodKill, - killPodOptions: &KillPodOptions{ - PodStatusFunc: func(p *v1.Pod, podStatus *kubecontainer.PodStatus) v1.PodStatus { - return v1.PodStatus{ - Phase: v1.PodFailed, - Reason: "reason", - Message: "message", - } - }, - PodTerminationGracePeriodSecondsOverride: &gracePeriodOverride, - }, + err := kl.syncTerminatingPod(context.Background(), pod, podStatus, nil, &gracePeriodOverride, func(podStatus *v1.PodStatus) { + podStatus.Phase = v1.PodFailed + podStatus.Reason = "reason" + podStatus.Message = "message" }) require.NoError(t, err) diff --git a/pkg/kubelet/kubelet_volumes.go b/pkg/kubelet/kubelet_volumes.go index 32ccdaf51f9..5e6809bdc45 100644 --- a/pkg/kubelet/kubelet_volumes.go +++ b/pkg/kubelet/kubelet_volumes.go @@ -186,11 +186,6 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon if allPods.Has(string(uid)) { continue } - // if the pod is within termination grace period, we shouldn't cleanup the underlying volumes - if kl.podKiller.IsPodPendingTerminationByUID(uid) { - klog.V(3).InfoS("Pod is pending termination", "podUID", uid) - continue - } // If volumes have not been unmounted/detached, do not delete directory. // Doing so may result in corruption of data. 
// TODO: getMountedVolumePathListFromDisk() call may be redundant with diff --git a/pkg/kubelet/kubelet_volumes_test.go b/pkg/kubelet/kubelet_volumes_test.go index 6b7128b4f4a..fa041f709af 100644 --- a/pkg/kubelet/kubelet_volumes_test.go +++ b/pkg/kubelet/kubelet_volumes_test.go @@ -315,9 +315,14 @@ func TestVolumeUnmountAndDetachControllerDisabled(t *testing.T) { 1 /* expectedSetUpCallCount */, testKubelet.volumePlugin)) // Remove pod + // TODO: this may not be threadsafe (technically waitForVolumeUnmount) + kubelet.podWorkers.(*fakePodWorkers).removeRuntime = map[types.UID]bool{pod.UID: true} kubelet.podManager.SetPods([]*v1.Pod{}) - assert.NoError(t, waitForVolumeUnmount(kubelet.volumeManager, pod)) + assert.NoError(t, kubelet.volumeManager.WaitForUnmount(pod)) + if actual := kubelet.volumeManager.GetMountedVolumesForPod(util.GetUniquePodName(pod)); len(actual) > 0 { + t.Fatalf("expected volume unmount to wait for no volumes: %v", actual) + } // Verify volumes unmounted podVolumes = kubelet.volumeManager.GetMountedVolumesForPod( @@ -499,6 +504,7 @@ func TestVolumeUnmountAndDetachControllerEnabled(t *testing.T) { 1 /* expectedSetUpCallCount */, testKubelet.volumePlugin)) // Remove pod + kubelet.podWorkers.(*fakePodWorkers).removeRuntime = map[types.UID]bool{pod.UID: true} kubelet.podManager.SetPods([]*v1.Pod{}) assert.NoError(t, waitForVolumeUnmount(kubelet.volumeManager, pod)) diff --git a/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go b/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go index bf6cec514ea..3e02ec3a3ea 100644 --- a/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go @@ -51,25 +51,25 @@ func (f *fakeHTTP) Get(url string) (*http.Response, error) { } type fakePodStateProvider struct { - existingPods map[types.UID]struct{} - runningPods map[types.UID]struct{} + terminated map[types.UID]struct{} + removed map[types.UID]struct{} } func newFakePodStateProvider() *fakePodStateProvider { return &fakePodStateProvider{ - existingPods: make(map[types.UID]struct{}), - runningPods: make(map[types.UID]struct{}), + terminated: make(map[types.UID]struct{}), + removed: make(map[types.UID]struct{}), } } -func (f *fakePodStateProvider) IsPodDeleted(uid types.UID) bool { - _, found := f.existingPods[uid] - return !found +func (f *fakePodStateProvider) ShouldPodRuntimeBeRemoved(uid types.UID) bool { + _, found := f.terminated[uid] + return found } -func (f *fakePodStateProvider) IsPodTerminated(uid types.UID) bool { - _, found := f.runningPods[uid] - return !found +func (f *fakePodStateProvider) ShouldPodContentBeRemoved(uid types.UID) bool { + _, found := f.removed[uid] + return found } func newFakeKubeRuntimeManager(runtimeService internalapi.RuntimeService, imageService internalapi.ImageManagerService, machineInfo *cadvisorapi.MachineInfo, osInterface kubecontainer.OSInterface, runtimeHelper kubecontainer.RuntimeHelper, keyring credentialprovider.DockerKeyring) (*kubeGenericRuntimeManager, error) { diff --git a/pkg/kubelet/kuberuntime/kuberuntime_gc.go b/pkg/kubelet/kuberuntime/kuberuntime_gc.go index a93cc5a8876..1d522a2faa2 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_gc.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_gc.go @@ -220,7 +220,7 @@ func (cgc *containerGC) evictableContainers(minAge time.Duration) (containersByE } // evict all containers that are evictable -func (cgc *containerGC) evictContainers(gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictTerminatedPods bool) error { +func (cgc *containerGC) 
evictContainers(gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error { // Separate containers by evict units. evictUnits, err := cgc.evictableContainers(gcPolicy.MinAge) if err != nil { @@ -230,7 +230,7 @@ func (cgc *containerGC) evictContainers(gcPolicy kubecontainer.GCPolicy, allSour // Remove deleted pod containers if all sources are ready. if allSourcesReady { for key, unit := range evictUnits { - if cgc.podStateProvider.IsPodDeleted(key.uid) || (cgc.podStateProvider.IsPodTerminated(key.uid) && evictTerminatedPods) { + if cgc.podStateProvider.ShouldPodContentBeRemoved(key.uid) || (evictNonDeletedPods && cgc.podStateProvider.ShouldPodRuntimeBeRemoved(key.uid)) { cgc.removeOldestN(unit, len(unit)) // Remove all. delete(evictUnits, key) } @@ -272,7 +272,7 @@ func (cgc *containerGC) evictContainers(gcPolicy kubecontainer.GCPolicy, allSour // 2. contains no containers. // 3. belong to a non-existent (i.e., already removed) pod, or is not the // most recently created sandbox for the pod. -func (cgc *containerGC) evictSandboxes(evictTerminatedPods bool) error { +func (cgc *containerGC) evictSandboxes(evictNonDeletedPods bool) error { containers, err := cgc.manager.getKubeletContainers(true) if err != nil { return err @@ -311,7 +311,7 @@ func (cgc *containerGC) evictSandboxes(evictTerminatedPods bool) error { } for podUID, sandboxes := range sandboxesByPod { - if cgc.podStateProvider.IsPodDeleted(podUID) || (cgc.podStateProvider.IsPodTerminated(podUID) && evictTerminatedPods) { + if cgc.podStateProvider.ShouldPodContentBeRemoved(podUID) || (evictNonDeletedPods && cgc.podStateProvider.ShouldPodRuntimeBeRemoved(podUID)) { // Remove all evictable sandboxes if the pod has been removed. // Note that the latest dead sandbox is also removed if there is // already an active one. @@ -337,9 +337,10 @@ func (cgc *containerGC) evictPodLogsDirectories(allSourcesReady bool) error { for _, dir := range dirs { name := dir.Name() podUID := parsePodUIDFromLogsDirectory(name) - if !cgc.podStateProvider.IsPodDeleted(podUID) { + if !cgc.podStateProvider.ShouldPodContentBeRemoved(podUID) { continue } + klog.V(4).InfoS("Removing pod logs", "podUID", podUID) err := osInterface.RemoveAll(filepath.Join(podLogsRootDirectory, name)) if err != nil { klog.ErrorS(err, "Failed to remove pod logs directory", "path", name) @@ -397,15 +398,15 @@ func (cgc *containerGC) evictPodLogsDirectories(allSourcesReady bool) error { // * removes oldest dead containers by enforcing gcPolicy.MaxContainers. // * gets evictable sandboxes which are not ready and contains no containers. // * removes evictable sandboxes. 
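The hunks in this file replace the old IsPodDeleted/IsPodTerminated checks with the two new podStateProvider methods and rename evictTerminatedPods to evictNonDeletedPods. A minimal sketch of the shared eviction predicate, assuming an interface shaped like the one declared later in kuberuntime_manager.go; the helper and interface names here are illustrative only.

package kuberuntime

import "k8s.io/apimachinery/pkg/types"

// stateProvider mirrors the podStateProvider interface this patch introduces;
// it is redeclared here only to keep the sketch self-contained.
type stateProvider interface {
	ShouldPodContentBeRemoved(types.UID) bool
	ShouldPodRuntimeBeRemoved(types.UID) bool
}

// shouldEvictForPod sketches the predicate shared by evictContainers and
// evictSandboxes above: content of removed pods is always collectable, while
// pods that are merely terminated are collected only when the caller opts in
// via evictNonDeletedPods.
func shouldEvictForPod(p stateProvider, uid types.UID, evictNonDeletedPods bool) bool {
	return p.ShouldPodContentBeRemoved(uid) ||
		(evictNonDeletedPods && p.ShouldPodRuntimeBeRemoved(uid))
}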
-func (cgc *containerGC) GarbageCollect(gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictTerminatedPods bool) error { +func (cgc *containerGC) GarbageCollect(gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error { errors := []error{} // Remove evictable containers - if err := cgc.evictContainers(gcPolicy, allSourcesReady, evictTerminatedPods); err != nil { + if err := cgc.evictContainers(gcPolicy, allSourcesReady, evictNonDeletedPods); err != nil { errors = append(errors, err) } // Remove sandboxes with zero containers - if err := cgc.evictSandboxes(evictTerminatedPods); err != nil { + if err := cgc.evictSandboxes(evictNonDeletedPods); err != nil { errors = append(errors, err) } diff --git a/pkg/kubelet/kuberuntime/kuberuntime_gc_test.go b/pkg/kubelet/kuberuntime/kuberuntime_gc_test.go index ad6d4a655db..a88db2d442d 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_gc_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_gc_test.go @@ -24,7 +24,8 @@ import ( "github.com/golang/mock/gomock" "github.com/stretchr/testify/assert" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" containertest "k8s.io/kubernetes/pkg/kubelet/container/testing" @@ -35,16 +36,14 @@ func TestSandboxGC(t *testing.T) { assert.NoError(t, err) podStateProvider := m.containerGC.podStateProvider.(*fakePodStateProvider) - makeGCSandbox := func(pod *v1.Pod, attempt uint32, state runtimeapi.PodSandboxState, withPodStateProvider bool, createdAt int64) sandboxTemplate { - if withPodStateProvider { - // initialize the pod getter - podStateProvider.existingPods[pod.UID] = struct{}{} - } + makeGCSandbox := func(pod *v1.Pod, attempt uint32, state runtimeapi.PodSandboxState, hasRunningContainers, isTerminating bool, createdAt int64) sandboxTemplate { return sandboxTemplate{ - pod: pod, - state: state, - attempt: attempt, - createdAt: createdAt, + pod: pod, + state: state, + attempt: attempt, + createdAt: createdAt, + running: hasRunningContainers, + terminating: isTerminating, } } @@ -61,133 +60,138 @@ func TestSandboxGC(t *testing.T) { }), } - for c, test := range []struct { - description string // description of the test case - sandboxes []sandboxTemplate // templates of sandboxes - containers []containerTemplate // templates of containers - remain []int // template indexes of remaining sandboxes - evictTerminatedPods bool + for _, test := range []struct { + description string // description of the test case + sandboxes []sandboxTemplate // templates of sandboxes + containers []containerTemplate // templates of containers + remain []int // template indexes of remaining sandboxes + evictTerminatingPods bool }{ { description: "notready sandboxes without containers for deleted pods should be garbage collected.", sandboxes: []sandboxTemplate{ - makeGCSandbox(pods[2], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, false, 0), + makeGCSandbox(pods[2], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, false, false, 0), }, - containers: []containerTemplate{}, - remain: []int{}, - evictTerminatedPods: false, + containers: []containerTemplate{}, + remain: []int{}, + evictTerminatingPods: false, }, { description: "ready sandboxes without containers for deleted pods should not be garbage collected.", sandboxes: []sandboxTemplate{ - makeGCSandbox(pods[2], 0, runtimeapi.PodSandboxState_SANDBOX_READY, false, 0), + makeGCSandbox(pods[2], 0, 
runtimeapi.PodSandboxState_SANDBOX_READY, false, false, 0), }, - containers: []containerTemplate{}, - remain: []int{0}, - evictTerminatedPods: false, + containers: []containerTemplate{}, + remain: []int{0}, + evictTerminatingPods: false, }, { description: "sandboxes for existing pods should not be garbage collected.", sandboxes: []sandboxTemplate{ - makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_READY, true, 0), - makeGCSandbox(pods[1], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, 0), + makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_READY, true, false, 0), + makeGCSandbox(pods[1], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, false, 0), }, - containers: []containerTemplate{}, - remain: []int{0, 1}, - evictTerminatedPods: false, + containers: []containerTemplate{}, + remain: []int{0, 1}, + evictTerminatingPods: false, }, { description: "older exited sandboxes without containers for existing pods should be garbage collected if there are more than one exited sandboxes.", sandboxes: []sandboxTemplate{ - makeGCSandbox(pods[0], 1, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, 1), - makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, 0), + makeGCSandbox(pods[0], 1, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, false, 1), + makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, false, 0), }, - containers: []containerTemplate{}, - remain: []int{0}, - evictTerminatedPods: false, + containers: []containerTemplate{}, + remain: []int{0}, + evictTerminatingPods: false, }, { description: "older exited sandboxes with containers for existing pods should not be garbage collected even if there are more than one exited sandboxes.", sandboxes: []sandboxTemplate{ - makeGCSandbox(pods[0], 1, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, 1), - makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, 0), + makeGCSandbox(pods[0], 1, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, false, 1), + makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, false, 0), }, containers: []containerTemplate{ {pod: pods[0], container: &pods[0].Spec.Containers[0], sandboxAttempt: 0, state: runtimeapi.ContainerState_CONTAINER_EXITED}, }, - remain: []int{0, 1}, - evictTerminatedPods: false, + remain: []int{0, 1}, + evictTerminatingPods: false, }, { - description: "non-running sandboxes for existing pods should be garbage collected if evictTerminatedPods is set.", + description: "non-running sandboxes for existing pods should be garbage collected if evictTerminatingPods is set.", sandboxes: []sandboxTemplate{ - makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_READY, true, 0), - makeGCSandbox(pods[1], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, 0), + makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_READY, true, true, 0), + makeGCSandbox(pods[1], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, true, 0), }, - containers: []containerTemplate{}, - remain: []int{0}, - evictTerminatedPods: true, + containers: []containerTemplate{}, + remain: []int{0}, + evictTerminatingPods: true, }, { description: "sandbox with containers should not be garbage collected.", sandboxes: []sandboxTemplate{ - makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, false, 0), + makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, false, false, 0), }, containers: []containerTemplate{ {pod: pods[0], container: &pods[0].Spec.Containers[0], state: 
runtimeapi.ContainerState_CONTAINER_EXITED}, }, - remain: []int{0}, - evictTerminatedPods: false, + remain: []int{0}, + evictTerminatingPods: false, }, { description: "multiple sandboxes should be handled properly.", sandboxes: []sandboxTemplate{ // running sandbox. - makeGCSandbox(pods[0], 1, runtimeapi.PodSandboxState_SANDBOX_READY, true, 1), + makeGCSandbox(pods[0], 1, runtimeapi.PodSandboxState_SANDBOX_READY, true, false, 1), // exited sandbox without containers. - makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, 0), + makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, false, 0), // exited sandbox with containers. - makeGCSandbox(pods[1], 1, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, 1), + makeGCSandbox(pods[1], 1, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, false, 1), // exited sandbox without containers. - makeGCSandbox(pods[1], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, 0), + makeGCSandbox(pods[1], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, false, 0), // exited sandbox without containers for deleted pods. - makeGCSandbox(pods[2], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, false, 0), + makeGCSandbox(pods[2], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, false, true, 0), }, containers: []containerTemplate{ {pod: pods[1], container: &pods[1].Spec.Containers[0], sandboxAttempt: 1, state: runtimeapi.ContainerState_CONTAINER_EXITED}, }, - remain: []int{0, 2}, - evictTerminatedPods: false, + remain: []int{0, 2}, + evictTerminatingPods: false, }, } { - t.Logf("TestCase #%d: %+v", c, test) - fakeSandboxes := makeFakePodSandboxes(t, m, test.sandboxes) - fakeContainers := makeFakeContainers(t, m, test.containers) - fakeRuntime.SetFakeSandboxes(fakeSandboxes) - fakeRuntime.SetFakeContainers(fakeContainers) + t.Run(test.description, func(t *testing.T) { + podStateProvider.removed = make(map[types.UID]struct{}) + podStateProvider.terminated = make(map[types.UID]struct{}) + fakeSandboxes := makeFakePodSandboxes(t, m, test.sandboxes) + fakeContainers := makeFakeContainers(t, m, test.containers) + for _, s := range test.sandboxes { + if !s.running && s.pod.Name == "deleted" { + podStateProvider.removed[s.pod.UID] = struct{}{} + } + if s.terminating { + podStateProvider.terminated[s.pod.UID] = struct{}{} + } + } + fakeRuntime.SetFakeSandboxes(fakeSandboxes) + fakeRuntime.SetFakeContainers(fakeContainers) - err := m.containerGC.evictSandboxes(test.evictTerminatedPods) - assert.NoError(t, err) - realRemain, err := fakeRuntime.ListPodSandbox(nil) - assert.NoError(t, err) - assert.Len(t, realRemain, len(test.remain)) - for _, remain := range test.remain { - status, err := fakeRuntime.PodSandboxStatus(fakeSandboxes[remain].Id) + err := m.containerGC.evictSandboxes(test.evictTerminatingPods) assert.NoError(t, err) - assert.Equal(t, &fakeSandboxes[remain].PodSandboxStatus, status) - } + realRemain, err := fakeRuntime.ListPodSandbox(nil) + assert.NoError(t, err) + assert.Len(t, realRemain, len(test.remain)) + for _, remain := range test.remain { + status, err := fakeRuntime.PodSandboxStatus(fakeSandboxes[remain].Id) + assert.NoError(t, err) + assert.Equal(t, &fakeSandboxes[remain].PodSandboxStatus, status) + } + }) } } -func makeGCContainer(podStateProvider *fakePodStateProvider, podName, containerName string, attempt int, createdAt int64, state runtimeapi.ContainerState) containerTemplate { +func makeGCContainer(podName, containerName string, attempt int, createdAt int64, state runtimeapi.ContainerState) 
containerTemplate { container := makeTestContainer(containerName, "test-image") pod := makeTestPod(podName, "test-ns", podName, []v1.Container{container}) - if podName == "running" { - podStateProvider.runningPods[pod.UID] = struct{}{} - } - if podName != "deleted" { - podStateProvider.existingPods[pod.UID] = struct{}{} - } return containerTemplate{ pod: pod, container: &container, @@ -204,201 +208,212 @@ func TestContainerGC(t *testing.T) { podStateProvider := m.containerGC.podStateProvider.(*fakePodStateProvider) defaultGCPolicy := kubecontainer.GCPolicy{MinAge: time.Hour, MaxPerPodContainer: 2, MaxContainers: 6} - for c, test := range []struct { - description string // description of the test case - containers []containerTemplate // templates of containers - policy *kubecontainer.GCPolicy // container gc policy - remain []int // template indexes of remaining containers - evictTerminatedPods bool - allSourcesReady bool + for _, test := range []struct { + description string // description of the test case + containers []containerTemplate // templates of containers + policy *kubecontainer.GCPolicy // container gc policy + remain []int // template indexes of remaining containers + evictTerminatingPods bool + allSourcesReady bool }{ { description: "all containers should be removed when max container limit is 0", containers: []containerTemplate{ - makeGCContainer(podStateProvider, "foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), }, - policy: &kubecontainer.GCPolicy{MinAge: time.Minute, MaxPerPodContainer: 1, MaxContainers: 0}, - remain: []int{}, - evictTerminatedPods: false, - allSourcesReady: true, + policy: &kubecontainer.GCPolicy{MinAge: time.Minute, MaxPerPodContainer: 1, MaxContainers: 0}, + remain: []int{}, + evictTerminatingPods: false, + allSourcesReady: true, }, { description: "max containers should be complied when no max per pod container limit is set", containers: []containerTemplate{ - makeGCContainer(podStateProvider, "foo", "bar", 4, 4, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 3, 3, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 4, 4, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 3, 3, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), }, - policy: &kubecontainer.GCPolicy{MinAge: time.Minute, MaxPerPodContainer: -1, MaxContainers: 4}, - remain: []int{0, 1, 2, 3}, - evictTerminatedPods: false, - allSourcesReady: true, + policy: &kubecontainer.GCPolicy{MinAge: time.Minute, MaxPerPodContainer: -1, MaxContainers: 4}, + remain: []int{0, 1, 2, 3}, + evictTerminatingPods: false, + allSourcesReady: true, }, { description: "no containers should be removed if both max container and per pod container limits are not set", containers: []containerTemplate{ - makeGCContainer(podStateProvider, "foo", "bar", 2, 2, 
runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), }, - policy: &kubecontainer.GCPolicy{MinAge: time.Minute, MaxPerPodContainer: -1, MaxContainers: -1}, - remain: []int{0, 1, 2}, - evictTerminatedPods: false, - allSourcesReady: true, + policy: &kubecontainer.GCPolicy{MinAge: time.Minute, MaxPerPodContainer: -1, MaxContainers: -1}, + remain: []int{0, 1, 2}, + evictTerminatingPods: false, + allSourcesReady: true, }, { description: "recently started containers should not be removed", containers: []containerTemplate{ - makeGCContainer(podStateProvider, "foo", "bar", 2, time.Now().UnixNano(), runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 1, time.Now().UnixNano(), runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 0, time.Now().UnixNano(), runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 2, time.Now().UnixNano(), runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 1, time.Now().UnixNano(), runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 0, time.Now().UnixNano(), runtimeapi.ContainerState_CONTAINER_EXITED), }, - remain: []int{0, 1, 2}, - evictTerminatedPods: false, - allSourcesReady: true, + remain: []int{0, 1, 2}, + evictTerminatingPods: false, + allSourcesReady: true, }, { description: "oldest containers should be removed when per pod container limit exceeded", containers: []containerTemplate{ - makeGCContainer(podStateProvider, "foo", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), }, - remain: []int{0, 1}, - evictTerminatedPods: false, - allSourcesReady: true, + remain: []int{0, 1}, + evictTerminatingPods: false, + allSourcesReady: true, }, { description: "running containers should not be removed", containers: []containerTemplate{ - makeGCContainer(podStateProvider, "foo", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_RUNNING), + makeGCContainer("foo", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_RUNNING), }, - remain: []int{0, 1, 2}, - evictTerminatedPods: false, - allSourcesReady: true, + remain: []int{0, 1, 2}, + evictTerminatingPods: false, + allSourcesReady: true, }, { description: "no containers should be removed when limits are not exceeded", containers: 
[]containerTemplate{ - makeGCContainer(podStateProvider, "foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), }, - remain: []int{0, 1}, - evictTerminatedPods: false, - allSourcesReady: true, + remain: []int{0, 1}, + evictTerminatingPods: false, + allSourcesReady: true, }, { description: "max container count should apply per (UID, container) pair", containers: []containerTemplate{ - makeGCContainer(podStateProvider, "foo", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo1", "baz", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo1", "baz", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo1", "baz", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo2", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo2", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo2", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo1", "baz", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo1", "baz", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo1", "baz", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo2", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo2", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo2", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), }, - remain: []int{0, 1, 3, 4, 6, 7}, - evictTerminatedPods: false, - allSourcesReady: true, + remain: []int{0, 1, 3, 4, 6, 7}, + evictTerminatingPods: false, + allSourcesReady: true, }, { description: "max limit should apply and try to keep from every pod", containers: []containerTemplate{ - makeGCContainer(podStateProvider, "foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo1", "bar1", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo1", "bar1", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo2", "bar2", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo2", "bar2", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo3", "bar3", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo3", "bar3", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo4", "bar4", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - 
makeGCContainer(podStateProvider, "foo4", "bar4", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo1", "bar1", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo1", "bar1", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo2", "bar2", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo2", "bar2", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo3", "bar3", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo3", "bar3", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo4", "bar4", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo4", "bar4", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), }, - remain: []int{0, 2, 4, 6, 8}, - evictTerminatedPods: false, - allSourcesReady: true, + remain: []int{0, 2, 4, 6, 8}, + evictTerminatingPods: false, + allSourcesReady: true, }, { description: "oldest pods should be removed if limit exceeded", containers: []containerTemplate{ - makeGCContainer(podStateProvider, "foo", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo1", "bar1", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo1", "bar1", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo2", "bar2", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo3", "bar3", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo4", "bar4", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo5", "bar5", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo6", "bar6", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo7", "bar7", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo1", "bar1", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo1", "bar1", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo2", "bar2", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo3", "bar3", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo4", "bar4", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo5", "bar5", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo6", "bar6", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo7", "bar7", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), }, - remain: []int{0, 2, 4, 6, 8, 9}, - evictTerminatedPods: false, - allSourcesReady: true, + remain: []int{0, 2, 4, 6, 8, 9}, + evictTerminatingPods: false, + allSourcesReady: true, }, { - description: "all non-running containers should be removed when evictTerminatedPods is set", + description: "all non-running containers should be removed when evictTerminatingPods is set", containers: []containerTemplate{ - makeGCContainer(podStateProvider, "foo", 
"bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo1", "bar1", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo1", "bar1", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "running", "bar2", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo3", "bar3", 0, 0, runtimeapi.ContainerState_CONTAINER_RUNNING), + makeGCContainer("foo", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo1", "bar1", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo1", "bar1", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("running", "bar2", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo3", "bar3", 0, 0, runtimeapi.ContainerState_CONTAINER_RUNNING), }, - remain: []int{4, 5}, - evictTerminatedPods: true, - allSourcesReady: true, + remain: []int{4, 5}, + evictTerminatingPods: true, + allSourcesReady: true, }, { description: "containers for deleted pods should be removed", containers: []containerTemplate{ - makeGCContainer(podStateProvider, "foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), // deleted pods still respect MinAge. - makeGCContainer(podStateProvider, "deleted", "bar1", 2, time.Now().UnixNano(), runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "deleted", "bar1", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), - makeGCContainer(podStateProvider, "deleted", "bar1", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("deleted", "bar1", 2, time.Now().UnixNano(), runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("deleted", "bar1", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("deleted", "bar1", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), }, - remain: []int{0, 1, 2}, - evictTerminatedPods: false, - allSourcesReady: true, + remain: []int{0, 1, 2}, + evictTerminatingPods: false, + allSourcesReady: true, }, { description: "containers for deleted pods may not be removed if allSourcesReady is set false ", containers: []containerTemplate{ - makeGCContainer(podStateProvider, "deleted", "bar1", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), + makeGCContainer("deleted", "bar1", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED), }, - remain: []int{0}, - evictTerminatedPods: true, - allSourcesReady: false, + remain: []int{0}, + evictTerminatingPods: true, + allSourcesReady: false, }, } { - t.Logf("TestCase #%d: %+v", c, test) - fakeContainers := makeFakeContainers(t, m, test.containers) - fakeRuntime.SetFakeContainers(fakeContainers) + t.Run(test.description, func(t *testing.T) { + podStateProvider.removed = make(map[types.UID]struct{}) + podStateProvider.terminated = make(map[types.UID]struct{}) + fakeContainers := makeFakeContainers(t, m, test.containers) + for _, s := range test.containers { + if s.pod.Name == "deleted" { + podStateProvider.removed[s.pod.UID] = struct{}{} + } + if s.pod.Name != 
"running" { + podStateProvider.terminated[s.pod.UID] = struct{}{} + } + } + fakeRuntime.SetFakeContainers(fakeContainers) - if test.policy == nil { - test.policy = &defaultGCPolicy - } - err := m.containerGC.evictContainers(*test.policy, test.allSourcesReady, test.evictTerminatedPods) - assert.NoError(t, err) - realRemain, err := fakeRuntime.ListContainers(nil) - assert.NoError(t, err) - assert.Len(t, realRemain, len(test.remain)) - for _, remain := range test.remain { - status, err := fakeRuntime.ContainerStatus(fakeContainers[remain].Id) + if test.policy == nil { + test.policy = &defaultGCPolicy + } + err := m.containerGC.evictContainers(*test.policy, test.allSourcesReady, test.evictTerminatingPods) assert.NoError(t, err) - assert.Equal(t, &fakeContainers[remain].ContainerStatus, status) - } + realRemain, err := fakeRuntime.ListContainers(nil) + assert.NoError(t, err) + assert.Len(t, realRemain, len(test.remain)) + for _, remain := range test.remain { + status, err := fakeRuntime.ContainerStatus(fakeContainers[remain].Id) + assert.NoError(t, err) + assert.Equal(t, &fakeContainers[remain].ContainerStatus, status) + } + }) } } @@ -410,18 +425,15 @@ func TestPodLogDirectoryGC(t *testing.T) { podStateProvider := m.containerGC.podStateProvider.(*fakePodStateProvider) // pod log directories without corresponding pods should be removed. - podStateProvider.existingPods["123"] = struct{}{} - podStateProvider.existingPods["456"] = struct{}{} - podStateProvider.existingPods["321"] = struct{}{} - podStateProvider.runningPods["123"] = struct{}{} - podStateProvider.runningPods["456"] = struct{}{} - podStateProvider.existingPods["321"] = struct{}{} files := []string{"123", "456", "789", "012", "name_namespace_321", "name_namespace_654"} removed := []string{ filepath.Join(podLogsRootDirectory, "789"), filepath.Join(podLogsRootDirectory, "012"), filepath.Join(podLogsRootDirectory, "name_namespace_654"), } + podStateProvider.removed["012"] = struct{}{} + podStateProvider.removed["789"] = struct{}{} + podStateProvider.removed["654"] = struct{}{} ctrl := gomock.NewController(t) defer ctrl.Finish() @@ -452,11 +464,11 @@ func TestUnknownStateContainerGC(t *testing.T) { fakeRuntime, _, m, err := createTestRuntimeManager() assert.NoError(t, err) - podStateProvider := m.containerGC.podStateProvider.(*fakePodStateProvider) + // podStateProvider := m.containerGC.podStateProvider.(*fakePodStateProvider) defaultGCPolicy := kubecontainer.GCPolicy{MinAge: time.Hour, MaxPerPodContainer: 0, MaxContainers: 0} fakeContainers := makeFakeContainers(t, m, []containerTemplate{ - makeGCContainer(podStateProvider, "foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_UNKNOWN), + makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_UNKNOWN), }) fakeRuntime.SetFakeContainers(fakeContainers) diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go index c9bf6a847ee..ee0173904f5 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go @@ -74,10 +74,11 @@ var ( ErrVersionNotSupported = errors.New("runtime api version is not supported") ) -// podStateProvider can determine if a pod is deleted ir terminated +// podStateProvider can determine if none of the elements are necessary to retain (pod content) +// or if none of the runtime elements are necessary to retain (containers) type podStateProvider interface { - IsPodDeleted(kubetypes.UID) bool - IsPodTerminated(kubetypes.UID) bool + 
ShouldPodContentBeRemoved(kubetypes.UID) bool + ShouldPodRuntimeBeRemoved(kubetypes.UID) bool } type kubeGenericRuntimeManager struct { @@ -787,7 +788,8 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine // or CRI if the Pod has been deleted while the POD is // being created. If the pod has been deleted then it's // not a real error. - if m.podStateProvider.IsPodDeleted(pod.UID) { + // TODO: this is probably not needed now that termination is part of the sync loop + if m.podStateProvider.ShouldPodContentBeRemoved(pod.UID) { klog.V(4).InfoS("Pod was deleted and sandbox failed to be created", "pod", klog.KObj(pod), "podUID", pod.UID) return } diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go b/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go index 2859457614d..91f6ab5d85d 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go @@ -66,10 +66,12 @@ func customTestRuntimeManager(keyring *credentialprovider.BasicDockerKeyring) (* // sandboxTemplate is a sandbox template to create fake sandbox. type sandboxTemplate struct { - pod *v1.Pod - attempt uint32 - createdAt int64 - state runtimeapi.PodSandboxState + pod *v1.Pod + attempt uint32 + createdAt int64 + state runtimeapi.PodSandboxState + running bool + terminating bool } // containerTemplate is a container template to create fake container. @@ -1397,6 +1399,7 @@ func TestSyncPodWithSandboxAndDeletedPod(t *testing.T) { } backOff := flowcontrol.NewBackOff(time.Second, time.Minute) + m.podStateProvider.(*fakePodStateProvider).removed = map[types.UID]struct{}{pod.UID: {}} // GetPodStatus and the following SyncPod will not return errors in the // case where the pod has been deleted. We are not adding any pods into diff --git a/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go index 3c33f56f5c6..27c2cee090f 100644 --- a/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go +++ b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go @@ -62,7 +62,7 @@ type Manager struct { shutdownGracePeriodCriticalPods time.Duration getPods eviction.ActivePodsFunc - killPod eviction.KillPodFunc + killPodFunc eviction.KillPodFunc syncNodeStatus func() dbusCon dbusInhibiter @@ -78,7 +78,7 @@ type Manager struct { func NewManager(getPodsFunc eviction.ActivePodsFunc, killPodFunc eviction.KillPodFunc, syncNodeStatus func(), shutdownGracePeriodRequested, shutdownGracePeriodCriticalPods time.Duration) (*Manager, lifecycle.PodAdmitHandler) { manager := &Manager{ getPods: getPodsFunc, - killPod: killPodFunc, + killPodFunc: killPodFunc, syncNodeStatus: syncNodeStatus, shutdownGracePeriodRequested: shutdownGracePeriodRequested, shutdownGracePeriodCriticalPods: shutdownGracePeriodCriticalPods, @@ -268,15 +268,10 @@ func (m *Manager) processShutdownEvent() error { } klog.V(1).InfoS("Shutdown manager killing pod with gracePeriod", "pod", klog.KObj(pod), "gracePeriod", gracePeriodOverride) - - status := v1.PodStatus{ - Phase: v1.PodFailed, - Reason: nodeShutdownReason, - Message: nodeShutdownMessage, - } - - err := m.killPod(pod, status, &gracePeriodOverride) - if err != nil { + if err := m.killPodFunc(pod, false, &gracePeriodOverride, func(status *v1.PodStatus) { + status.Message = nodeShutdownMessage + status.Reason = nodeShutdownReason + }); err != nil { klog.V(1).InfoS("Shutdown manager failed killing pod", "pod", klog.KObj(pod), "err", err) } else { klog.V(1).InfoS("Shutdown manager finished 
killing pod", "pod", klog.KObj(pod)) diff --git a/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go index 871f491c1c0..b8cef9807d1 100644 --- a/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go +++ b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go @@ -212,7 +212,7 @@ func TestManager(t *testing.T) { } podKillChan := make(chan PodKillInfo, 1) - killPodsFunc := func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int64) error { + killPodsFunc := func(pod *v1.Pod, evict bool, gracePeriodOverride *int64, fn func(podStatus *v1.PodStatus)) error { var gracePeriod int64 if gracePeriodOverride != nil { gracePeriod = *gracePeriodOverride @@ -297,7 +297,7 @@ func TestFeatureEnabled(t *testing.T) { activePodsFunc := func() []*v1.Pod { return nil } - killPodsFunc := func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int64) error { + killPodsFunc := func(pod *v1.Pod, evict bool, gracePeriodOverride *int64, fn func(*v1.PodStatus)) error { return nil } defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, pkgfeatures.GracefulNodeShutdown, tc.featureGateEnabled)() @@ -323,7 +323,7 @@ func TestRestart(t *testing.T) { activePodsFunc := func() []*v1.Pod { return nil } - killPodsFunc := func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int64) error { + killPodsFunc := func(pod *v1.Pod, isEvicted bool, gracePeriodOverride *int64, fn func(*v1.PodStatus)) error { return nil } syncNodeStatus := func() {} diff --git a/pkg/kubelet/pod/pod_manager.go b/pkg/kubelet/pod/pod_manager.go index 215a7a155d6..31ceb6ae8cc 100644 --- a/pkg/kubelet/pod/pod_manager.go +++ b/pkg/kubelet/pod/pod_manager.go @@ -19,7 +19,7 @@ package pod import ( "sync" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/kubernetes/pkg/kubelet/configmap" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" diff --git a/pkg/kubelet/pod/testing/mock_manager.go b/pkg/kubelet/pod/testing/mock_manager.go index 5bfffd864a4..b3781c8a0cf 100644 --- a/pkg/kubelet/pod/testing/mock_manager.go +++ b/pkg/kubelet/pod/testing/mock_manager.go @@ -19,11 +19,13 @@ package testing import ( kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" -) -import mock "github.com/stretchr/testify/mock" -import types "k8s.io/apimachinery/pkg/types" -import v1 "k8s.io/api/core/v1" + mock "github.com/stretchr/testify/mock" + + types "k8s.io/apimachinery/pkg/types" + + v1 "k8s.io/api/core/v1" +) // MockManager is an autogenerated mock type for the Manager type type MockManager struct { @@ -77,6 +79,20 @@ func (_m *MockManager) GetOrphanedMirrorPodNames() []string { return r0 } +func (_m *MockManager) GetOrphanedMirrorPodNamesAndUIDs() map[string]kubelettypes.ResolvedPodUID { + ret := _m.Called() + + var r0 map[string]kubelettypes.ResolvedPodUID + if rf, ok := ret.Get(0).(func() map[string]kubelettypes.ResolvedPodUID); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(map[string]kubelettypes.ResolvedPodUID) + } + } + return r0 +} + // DeletePod provides a mock function with given fields: _a0 func (_m *MockManager) DeletePod(_a0 *v1.Pod) { _m.Called(_a0) diff --git a/pkg/kubelet/pod_workers.go b/pkg/kubelet/pod_workers.go index f39f7b73f1a..9cfb169aca9 100644 --- a/pkg/kubelet/pod_workers.go +++ b/pkg/kubelet/pod_workers.go @@ -17,21 +17,23 @@ limitations under the License. 
package kubelet import ( + "context" "fmt" "strings" "sync" "time" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" "k8s.io/kubernetes/pkg/kubelet/events" "k8s.io/kubernetes/pkg/kubelet/eviction" + "k8s.io/kubernetes/pkg/kubelet/metrics" + kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/kubelet/util/queue" ) @@ -40,12 +42,21 @@ import ( // If err is non-nil, the operation did not complete successfully. type OnCompleteFunc func(err error) -// PodStatusFunc is a function that is invoked to generate a pod status. -type PodStatusFunc func(pod *v1.Pod, podStatus *kubecontainer.PodStatus) v1.PodStatus +// PodStatusFunc is a function that is invoked to override the pod status when a pod is killed. +type PodStatusFunc func(podStatus *v1.PodStatus) // KillPodOptions are options when performing a pod update whose update type is kill. type KillPodOptions struct { - // PodStatusFunc is the function to invoke to set pod status in response to a kill request. + // CompletedCh is closed when the kill request completes (syncTerminatingPod has completed + // without error) or if the pod does not exist, or if the pod has already terminated. This + // could take an arbitrary amount of time to be closed, but is never left open once + // CouldHaveRunningContainers() returns false. + CompletedCh chan<- struct{} + // Evict is true if this is a pod triggered eviction - once a pod is evicted some resources are + // more aggressively reaped than during normal pod operation (stopped containers). + Evict bool + // PodStatusFunc is invoked (if set) and overrides the status of the pod at the time the pod is killed. + // The provided status is populated from the latest state. PodStatusFunc PodStatusFunc // PodTerminationGracePeriodSecondsOverride is optional override to use if a pod is being killed as part of kill operation. PodTerminationGracePeriodSecondsOverride *int64 @@ -53,45 +64,139 @@ type KillPodOptions struct { // UpdatePodOptions is an options struct to pass to a UpdatePod operation. type UpdatePodOptions struct { - // pod to update - Pod *v1.Pod - // the mirror pod for the pod to update, if it is a static pod - MirrorPod *v1.Pod - // the type of update (create, update, sync, kill) + // The type of update (create, update, sync, kill). UpdateType kubetypes.SyncPodType - // optional callback function when operation completes - // this callback is not guaranteed to be completed since a pod worker may - // drop update requests if it was fulfilling a previous request. this is - // only guaranteed to be invoked in response to a kill pod request which is - // always delivered. - OnCompleteFunc OnCompleteFunc - // if update type is kill, use the specified options to kill the pod. + // StartTime is an optional timestamp for when this update was created. If set, + // when this update is fully realized by the pod worker it will be recorded in + // the PodWorkerDuration metric. + StartTime time.Time + // Pod to update. Required. + Pod *v1.Pod + // MirrorPod is the mirror pod if Pod is a static pod. Optional when UpdateType + // is kill or terminated. + MirrorPod *v1.Pod + // RunningPod is a runtime pod that is no longer present in config. Required + // if Pod is nil, ignored if Pod is set. 
+ RunningPod *kubecontainer.Pod + // KillPodOptions is used to override the default termination behavior of the + // pod or to update the pod status after an operation is completed. Since a + // pod can be killed for multiple reasons, PodStatusFunc is invoked in order + // and later kills have an opportunity to override the status (i.e. a preemption + // may be later turned into an eviction). KillPodOptions *KillPodOptions } +// PodWorkType classifies the three phases of pod lifecycle - setup (sync), +// teardown of containers (terminating), cleanup (terminated). +type PodWorkType int + +const ( + // SyncPodSync is when the pod is expected to be started and running. + SyncPodWork PodWorkType = iota + // TerminatingPodWork is when the pod is no longer being set up, but some + // containers may be running and are being torn down. + TerminatingPodWork + // TerminatedPodWork indicates the pod is stopped, can have no more running + // containers, and any foreground cleanup can be executed. + TerminatedPodWork +) + +// podWork is the internal changes +type podWork struct { + // WorkType is the type of sync to perform - sync (create), terminating (stop + // containers), terminated (clean up and write status). + WorkType PodWorkType + + // Options contains the data to sync. + Options UpdatePodOptions +} + // PodWorkers is an abstract interface for testability. type PodWorkers interface { - UpdatePod(options *UpdatePodOptions) - ForgetNonExistingPodWorkers(desiredPods map[types.UID]sets.Empty) - ForgetWorker(uid types.UID) + // UpdatePod notifies the pod worker of a change to a pod, which will then + // be processed in FIFO order by a goroutine per pod UID. The state of the + // pod will be passed to the syncPod method until either the pod is marked + // as deleted, it reaches a terminal phase (Succeeded/Failed), or the pod + // is evicted by the kubelet. Once that occurs the syncTerminatingPod method + // will be called until it exits successfully, and after that all further + // UpdatePod() calls will be ignored for that pod until it has been forgotten + // due to significant time passing. A pod that is terminated will never be + // restarted. + UpdatePod(options UpdatePodOptions) + // SyncKnownPods removes workers for pods that are not in the desiredPods set + // and have been terminated for a significant period of time. Once this method + // has been called once, the workers are assumed to be fully initialized and + // subsequent calls to ShouldPodContentBeRemoved on unknown pods will return + // true. + SyncKnownPods(desiredPods []*v1.Pod) map[types.UID]PodWorkType + + // CouldHaveRunningContainers returns true before the pod workers have synced, + // once the pod workers see the pod (syncPod could be called), and returns false + // after the pod has been terminated (running containers guaranteed stopped). + // + // Intended for use by the kubelet config loops, but not subsystems, which should + // use ShouldPod*(). + CouldHaveRunningContainers(uid types.UID) bool + // IsPodTerminationRequested returns true when pod termination has been requested + // until the termination completes and the pod is removed from config. This should + // not be used in cleanup loops because it will return false if the pod has already + // been cleaned up - use ShouldPodContainersBeTerminating instead. Also, this method + // may return true while containers are still being initialized by the pod worker. + // + // Intended for use by the kubelet sync* methods, but not subsystems, which should + // use ShouldPod*(). 
+ IsPodTerminationRequested(uid types.UID) bool + + // ShouldPodContainersBeTerminating returns false before pod workers have synced, + // or once a pod has started terminating. This check is similar to + // ShouldPodRuntimeBeRemoved but is also true after pod termination is requested. + // + // Intended for use by subsystem sync loops to avoid performing background setup + // after termination has been requested for a pod. Callers must ensure that the + // syncPod method is non-blocking when their data is absent. + ShouldPodContainersBeTerminating(uid types.UID) bool + // ShouldPodRuntimeBeRemoved returns true if runtime managers within the Kubelet + // should aggressively cleanup pod resources that are not containers or on disk + // content, like attached volumes. This is true when a pod is not yet observed + // by a worker after the first sync (meaning it can't be running yet) or after + // all running containers are stopped. + // TODO: Once pod logs are separated from running containers, this method should + // be used to gate whether containers are kept. + // + // Intended for use by subsystem sync loops to know when to start tearing down + // resources that are used by running containers. Callers should ensure that + // runtime content they own is not required for post-termination - for instance + // containers are required in docker to preserve pod logs until after the pod + // is deleted. + ShouldPodRuntimeBeRemoved(uid types.UID) bool + // ShouldPodContentBeRemoved returns true if resource managers within the Kubelet + // should aggressively cleanup all content related to the pod. This is true + // during pod eviction (when we wish to remove that content to free resources) + // as well as after the request to delete a pod has resulted in containers being + // stopped (which is a more graceful action). Note that a deleting pod can still + // be evicted. + // + // Intended for use by subsystem sync loops to know when to start tearing down + // resources that are used by non-deleted pods. Content is generally preserved + // until deletion+removal_from_etcd or eviction, although garbage collection + // can free content when this method returns false. + ShouldPodContentBeRemoved(uid types.UID) bool + // IsPodForMirrorPodTerminatingByFullName returns true if a static pod with the + // provided pod name is currently terminating and has yet to complete. It is + // intended to be used only during orphan mirror pod cleanup to prevent us from + // deleting a terminating static pod from the apiserver before the pod is shut + // down. + IsPodForMirrorPodTerminatingByFullName(podFullname string) bool } -// syncPodOptions provides the arguments to a SyncPod operation. -type syncPodOptions struct { - // the mirror pod for the pod to sync, if it is a static pod - mirrorPod *v1.Pod - // pod to sync - pod *v1.Pod - // the type of update (create, update, sync) - updateType kubetypes.SyncPodType - // the current status - podStatus *kubecontainer.PodStatus - // if update type is kill, use the specified options to kill the pod. - killPodOptions *KillPodOptions -} +// the function to invoke to perform a sync (reconcile the kubelet state to the desired shape of the pod) +type syncPodFnType func(ctx context.Context, updateType kubetypes.SyncPodType, pod *v1.Pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) error -// the function to invoke to perform a sync. 
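As an illustrative sketch only (not part of this patch), a cleanup loop inside the kubelet package could consume these query methods like this; removePodDir is a hypothetical helper standing in for whatever teardown the subsystem owns:

// cleanupOrphanedPodDirsSketch is a hypothetical cleanup pass that must not
// race with pods that can still start or run containers, so it delegates all
// lifecycle questions to the pod workers instead of inspecting pod phase.
func cleanupOrphanedPodDirsSketch(pw PodWorkers, podUIDs []types.UID) {
	for _, uid := range podUIDs {
		// Leave the pod alone while containers could still be running; before
		// the first SyncKnownPods call this is conservatively true for UIDs
		// the workers have never seen.
		if pw.CouldHaveRunningContainers(uid) {
			continue
		}
		// Only remove on-disk content once the pod is evicted, or deleted and
		// fully terminated.
		if pw.ShouldPodContentBeRemoved(uid) {
			removePodDir(uid) // hypothetical helper owned by the subsystem
		}
	}
}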
-type syncPodFnType func(options syncPodOptions) error +// the function to invoke to terminate a pod (ensure no running processes are present) +type syncTerminatingPodFnType func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, runningPod *kubecontainer.Pod, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error + +// the function to invoke to cleanup a pod that is terminated +type syncTerminatedPodFnType func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error const ( // jitter factor for resyncInterval @@ -104,27 +209,152 @@ const ( backOffOnTransientErrorPeriod = time.Second ) +// podSyncStatus tracks per-pod transitions through the three phases of pod +// worker sync (setup, terminating, terminated). +type podSyncStatus struct { + // ctx is the context that is associated with the current pod sync. + ctx context.Context + // cancelFn if set is expected to cancel the current sync*Pod operation. + cancelFn context.CancelFunc + // working is true if a pod worker is currently in a sync method. + working bool + + // syncedAt is the time at which the pod worker first observed this pod. + syncedAt time.Time + // terminatingAt is set once the pod is requested to be killed - note that + // this can be set before the pod worker starts terminating the pod, see + // terminating. + terminatingAt time.Time + // startedTerminating is true once the pod worker has observed the request to + // stop a pod (exited syncPod and observed a podWork with WorkType + // TerminatingPodWork). Once this is set, it is safe for other components + // of the kubelet to assume that no other containers may be started. + startedTerminating bool + // deleted is true if the pod has been marked for deletion on the apiserver + // or has no configuration represented (was deleted before). + deleted bool + // gracePeriod is the requested gracePeriod once terminatingAt is nonzero. + gracePeriod int64 + // evicted is true if the kill indicated this was an eviction (an evicted + // pod can be more aggressively cleaned up). + evicted bool + // terminatedAt is set once the pod worker has completed a successful + // syncTerminatingPod call and means all running containers are stopped. + terminatedAt time.Time + // finished is true once the pod worker completes for a pod + // (syncTerminatedPod exited with no errors) until SyncKnownPods is invoked + // to remove the pod. A terminal pod (Succeeded/Failed) will have + // termination status until the pod is deleted. + finished bool + // notifyPostTerminating will be closed once the pod transitions to + // terminated. After the pod is in terminated state, nothing should be + // added to this list. + notifyPostTerminating []chan<- struct{} + // statusPostTerminating is a list of the status changes associated + // with kill pod requests. After the pod is in terminated state, nothing + // should be added to this list. The worker will execute the last function + // in this list on each termination attempt. 
+ statusPostTerminating []PodStatusFunc +} + +func (s *podSyncStatus) IsWorking() bool { return s.working } +func (s *podSyncStatus) IsTerminationRequested() bool { return !s.terminatingAt.IsZero() } +func (s *podSyncStatus) IsTerminationStarted() bool { return s.startedTerminating } +func (s *podSyncStatus) IsTerminated() bool { return !s.terminatedAt.IsZero() } +func (s *podSyncStatus) IsFinished() bool { return s.finished } +func (s *podSyncStatus) IsEvicted() bool { return s.evicted } +func (s *podSyncStatus) IsDeleted() bool { return s.deleted } + +// podWorkers keeps track of operations on pods and ensures each pod is +// reconciled with the container runtime and other subsystems. The worker +// also tracks which pods are in flight for starting, which pods are +// shutting down but still have running containers, and which pods have +// terminated recently and are guaranteed to have no running containers. +// +// A pod passed to a pod worker is either being synced (expected to be +// running), terminating (has running containers but no new containers are +// expected to start), terminated (has no running containers but may still +// have resources being consumed), or cleaned up (no resources remaining). +// Once a pod is set to be "torn down" it cannot be started again for that +// UID (corresponding to a delete or eviction) until: +// +// 1. The pod worker is finalized (syncTerminatingPod and +// syncTerminatedPod exit without error sequentially) +// 2. The SyncKnownPods method is invoked by kubelet housekeeping and the pod +// is not part of the known config. +// +// Pod workers provide a consistent source of information to other kubelet +// loops about the status of the pod and whether containers can be +// running. The ShouldPodContentBeRemoved() method tracks whether a pod's +// contents should still exist, which includes non-existent pods after +// SyncKnownPods() has been called once (as per the contract, all existing +// pods should be provided via UpdatePod before SyncKnownPods is invoked). +// Generally other sync loops are expected to separate "setup" and +// "teardown" responsibilities and the information methods here assist in +// each by centralizing that state. A simple visualization of the time +// intervals involved might look like: +// +// ---| = kubelet config has synced at least once +// -------| |- = pod exists in apiserver config +// --------| |---------------- = CouldHaveRunningContainers() is true +// ^- pod is observed by pod worker . +// . . +// ----------| |------------------------- = syncPod is running +// . ^- pod worker loop sees change and invokes syncPod +// . . . +// --------------| |------- = ShouldPodContainersBeTerminating() returns true +// --------------| |------- = IsPodTerminationRequested() returns true (pod is known) +// . . ^- Kubelet evicts pod . +// . . . +// -------------------| |---------------- = syncTerminatingPod runs then exits without error +// . . ^ pod worker loop exits syncPod, sees pod is terminating, +// . . invokes syncTerminatingPod +// . . . +// ---| |------------------| . = ShouldPodRuntimeBeRemoved() returns true (post-sync) +// . ^ syncTerminatingPod has exited successfully +// . . +// ----------------------------| |------- = syncTerminatedPod runs then exits without error +// . ^ other loops can tear down +// . . +// ------------------------------------| |---- = status manager is waiting for PodResourcesAreReclaimed() +// . ^ . 
+// ----------| |- = status manager can be writing pod status +// ^ status manager deletes pod because no longer exists in config +// +// Other components in the Kubelet can request a termination of the pod +// via the UpdatePod method or the killPodNow wrapper - this will ensure +// the components of the pod are stopped until the kubelet is restarted +// or permanently (if the phase of the pod is set to a terminal phase +// in the pod status change). +// type podWorkers struct { // Protects all per worker fields. podLock sync.Mutex - + // podsSynced is true once the pod worker has been synced at least once, + // which means that all working pods have been started via UpdatePod(). + podsSynced bool // Tracks all running per-pod goroutines - per-pod goroutine will be // processing updates received through its corresponding channel. - podUpdates map[types.UID]chan UpdatePodOptions - // Track the current state of per-pod goroutines. - // Currently all update request for a given pod coming when another - // update of this pod is being processed are ignored. - isWorking map[types.UID]bool + podUpdates map[types.UID]chan podWork // Tracks the last undelivered work item for this pod - a work item is // undelivered if it comes in while the worker is working. - lastUndeliveredWorkUpdate map[types.UID]UpdatePodOptions + lastUndeliveredWorkUpdate map[types.UID]podWork + // Tracks by UID the termination status of a pod - syncing, terminating, + // terminated, and evicted. + podSyncStatuses map[types.UID]*podSyncStatus + // Tracks when a static pod is being killed and is removed when the + // static pod transitions to the killed state. + terminatingStaticPodFullnames map[string]struct{} workQueue queue.WorkQueue - // This function is run to sync the desired stated of pod. + // This function is run to sync the desired state of pod. // NOTE: This function has to be thread-safe - it can be called for // different pods at the same time. 
- syncPodFn syncPodFnType + + syncPodFn syncPodFnType + syncTerminatingPodFn syncTerminatingPodFnType + syncTerminatedPodFn syncTerminatedPodFnType // The EventRecorder to use recorder record.EventRecorder @@ -139,77 +369,236 @@ type podWorkers struct { podCache kubecontainer.Cache } -func newPodWorkers(syncPodFn syncPodFnType, recorder record.EventRecorder, workQueue queue.WorkQueue, - resyncInterval, backOffPeriod time.Duration, podCache kubecontainer.Cache) *podWorkers { +func newPodWorkers( + syncPodFn syncPodFnType, + syncTerminatingPodFn syncTerminatingPodFnType, + syncTerminatedPodFn syncTerminatedPodFnType, + recorder record.EventRecorder, + workQueue queue.WorkQueue, + resyncInterval, backOffPeriod time.Duration, + podCache kubecontainer.Cache, +) PodWorkers { return &podWorkers{ - podUpdates: map[types.UID]chan UpdatePodOptions{}, - isWorking: map[types.UID]bool{}, - lastUndeliveredWorkUpdate: map[types.UID]UpdatePodOptions{}, - syncPodFn: syncPodFn, - recorder: recorder, - workQueue: workQueue, - resyncInterval: resyncInterval, - backOffPeriod: backOffPeriod, - podCache: podCache, + podSyncStatuses: map[types.UID]*podSyncStatus{}, + podUpdates: map[types.UID]chan podWork{}, + lastUndeliveredWorkUpdate: map[types.UID]podWork{}, + terminatingStaticPodFullnames: map[string]struct{}{}, + syncPodFn: syncPodFn, + syncTerminatingPodFn: syncTerminatingPodFn, + syncTerminatedPodFn: syncTerminatedPodFn, + recorder: recorder, + workQueue: workQueue, + resyncInterval: resyncInterval, + backOffPeriod: backOffPeriod, + podCache: podCache, } } -func (p *podWorkers) managePodLoop(podUpdates <-chan UpdatePodOptions) { - var lastSyncTime time.Time - for update := range podUpdates { - err := func() error { - podUID := update.Pod.UID - // This is a blocking call that would return only if the cache - // has an entry for the pod that is newer than minRuntimeCache - // Time. This ensures the worker doesn't start syncing until - // after the cache is at least newer than the finished time of - // the previous sync. - status, err := p.podCache.GetNewerThan(podUID, lastSyncTime) - if err != nil { - // This is the legacy event thrown by manage pod loop - // all other events are now dispatched from syncPodFn - p.recorder.Eventf(update.Pod, v1.EventTypeWarning, events.FailedSync, "error determining status: %v", err) - return err - } - err = p.syncPodFn(syncPodOptions{ - mirrorPod: update.MirrorPod, - pod: update.Pod, - podStatus: status, - killPodOptions: update.KillPodOptions, - updateType: update.UpdateType, - }) - lastSyncTime = time.Now() - return err - }() - // notify the call-back function if the operation succeeded or not - if update.OnCompleteFunc != nil { - update.OnCompleteFunc(err) - } - if err != nil { - // IMPORTANT: we do not log errors here, the syncPodFn is responsible for logging errors - klog.ErrorS(err, "Error syncing pod, skipping", "pod", klog.KObj(update.Pod), "podUID", update.Pod.UID) - } - p.wrapUp(update.Pod.UID, err) +func (p *podWorkers) CouldHaveRunningContainers(uid types.UID) bool { + p.podLock.Lock() + defer p.podLock.Unlock() + if status, ok := p.podSyncStatuses[uid]; ok { + return !status.IsTerminated() } + // once all pods are synced, any pod without sync status is known to not be running. + return !p.podsSynced } -// UpdatePod apply the new setting to the specified pod. -// If the options provide an OnCompleteFunc, the function is invoked if the update is accepted. -// Update requests are ignored if a kill pod request is pending. 
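A minimal sketch, not part of this patch, of how the three phase callbacks plug into the new constructor; the wrapper name and interval values are placeholders:

// newPodWorkersSketch shows the shape of the new constructor: one callback per
// lifecycle phase (sync, terminating, terminated) plus the existing plumbing.
func newPodWorkersSketch(
	syncPod syncPodFnType,
	syncTerminatingPod syncTerminatingPodFnType,
	syncTerminatedPod syncTerminatedPodFnType,
	recorder record.EventRecorder,
	workQueue queue.WorkQueue,
	podCache kubecontainer.Cache,
) PodWorkers {
	// resync every minute, back off ten seconds after a sync error (illustrative values)
	return newPodWorkers(
		syncPod, syncTerminatingPod, syncTerminatedPod,
		recorder, workQueue,
		time.Minute, 10*time.Second,
		podCache,
	)
}

Every subsequent change, including kill requests, then flows through UpdatePod on the returned PodWorkers.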
-func (p *podWorkers) UpdatePod(options *UpdatePodOptions) { +func (p *podWorkers) IsPodTerminationRequested(uid types.UID) bool { + p.podLock.Lock() + defer p.podLock.Unlock() + if status, ok := p.podSyncStatuses[uid]; ok { + // the pod may still be setting up at this point. + return status.IsTerminationRequested() + } + // an unknown pod is considered not to be terminating (use ShouldPodContainersBeTerminating in + // cleanup loops to avoid failing to cleanup pods that have already been removed from config) + return false +} + +func (p *podWorkers) ShouldPodContainersBeTerminating(uid types.UID) bool { + p.podLock.Lock() + defer p.podLock.Unlock() + if status, ok := p.podSyncStatuses[uid]; ok { + // we wait until the pod worker goroutine observes the termination, which means syncPod will not + // be executed again, which means no new containers can be started + return status.IsTerminationStarted() + } + // once we've synced, if the pod isn't known to the workers we should be tearing them + // down + return p.podsSynced +} + +func (p *podWorkers) ShouldPodRuntimeBeRemoved(uid types.UID) bool { + p.podLock.Lock() + defer p.podLock.Unlock() + if status, ok := p.podSyncStatuses[uid]; ok { + return status.IsTerminated() + } + // a pod that hasn't been sent to the pod worker yet should have no runtime components once we have + // synced all content. + return p.podsSynced +} + +func (p *podWorkers) ShouldPodContentBeRemoved(uid types.UID) bool { + p.podLock.Lock() + defer p.podLock.Unlock() + if status, ok := p.podSyncStatuses[uid]; ok { + return status.IsEvicted() || (status.IsDeleted() && status.IsTerminated()) + } + // a pod that hasn't been sent to the pod worker yet should have no content on disk once we have + // synced all content. + return p.podsSynced +} + +func (p *podWorkers) IsPodForMirrorPodTerminatingByFullName(podFullName string) bool { + p.podLock.Lock() + defer p.podLock.Unlock() + _, ok := p.terminatingStaticPodFullnames[podFullName] + return ok +} + +// UpdatePod carries a configuration change or termination state to a pod. A pod is either runnable, +// terminating, or terminated, and will transition to terminating if deleted on the apiserver, it is +// discovered to have a terminal phase (Succeeded or Failed), or if it is evicted by the kubelet. 
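To show how the method below is meant to be driven, a hedged sketch of a caller that requests termination and waits for it to complete; the function name is hypothetical, and unlike killPodNow further down it applies no timeout:

// requestPodKillSketch asks the pod worker to tear a pod down and waits until
// syncTerminatingPod has completed (or the pod was already terminated).
func requestPodKillSketch(pw PodWorkers, pod *v1.Pod, gracePeriod int64) {
	completed := make(chan struct{}, 1)
	pw.UpdatePod(UpdatePodOptions{
		UpdateType: kubetypes.SyncPodKill,
		StartTime:  time.Now(),
		Pod:        pod,
		KillPodOptions: &KillPodOptions{
			CompletedCh: completed,
			Evict:       false, // true only for kubelet-initiated evictions
			PodStatusFunc: func(status *v1.PodStatus) {
				// optionally override the final status written for the pod
			},
			PodTerminationGracePeriodSecondsOverride: &gracePeriod,
		},
	})
	<-completed // closed once the pod can no longer have running containers
}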
+func (p *podWorkers) UpdatePod(options UpdatePodOptions) { + // handle when the pod is an orphan (no config) and we only have runtime status by running only + // the terminating part of the lifecycle pod := options.Pod + var isRuntimePod bool + if options.RunningPod != nil { + if options.Pod == nil { + pod = options.RunningPod.ToAPIPod() + if options.UpdateType != kubetypes.SyncPodKill { + klog.InfoS("Pod update is ignored, runtime pods can only be killed", "pod", klog.KObj(pod), "podUID", pod.UID) + return + } + options.Pod = pod + isRuntimePod = true + } else { + options.RunningPod = nil + klog.InfoS("Pod update included RunningPod which is only valid when Pod is not specified", "pod", klog.KObj(options.Pod), "podUID", options.Pod.UID) + } + } uid := pod.UID - var podUpdates chan UpdatePodOptions - var exists bool p.podLock.Lock() defer p.podLock.Unlock() + + // decide what to do with this pod - we are either setting it up, tearing it down, or ignoring it + now := time.Now() + status, ok := p.podSyncStatuses[uid] + if !ok { + klog.V(4).InfoS("Pod is being synced for the first time", "pod", klog.KObj(pod), "podUID", pod.UID) + status = &podSyncStatus{ + syncedAt: now, + } + p.podSyncStatuses[uid] = status + } + + // once a pod is terminated by UID, it cannot reenter the pod worker (until the UID is purged by housekeeping) + if status.IsFinished() { + klog.V(4).InfoS("Pod is finished processing, no further updates", "pod", klog.KObj(pod), "podUID", pod.UID) + return + } + + // check for a transition to terminating + var becameTerminating bool + if !status.IsTerminationRequested() { + switch { + case isRuntimePod: + klog.V(4).InfoS("Pod is orphaned and must be torn down", "pod", klog.KObj(pod), "podUID", pod.UID) + status.deleted = true + status.terminatingAt = now + becameTerminating = true + case pod.DeletionTimestamp != nil: + klog.V(4).InfoS("Pod is marked for graceful deletion, begin teardown", "pod", klog.KObj(pod), "podUID", pod.UID) + status.deleted = true + status.terminatingAt = now + becameTerminating = true + case pod.Status.Phase == v1.PodFailed, pod.Status.Phase == v1.PodSucceeded: + klog.V(4).InfoS("Pod is in a terminal phase (success/failed), begin teardown", "pod", klog.KObj(pod), "podUID", pod.UID) + status.terminatingAt = now + becameTerminating = true + case options.UpdateType == kubetypes.SyncPodKill: + if options.KillPodOptions != nil && options.KillPodOptions.Evict { + klog.V(4).InfoS("Pod is being evicted by the kubelet, begin teardown", "pod", klog.KObj(pod), "podUID", pod.UID) + status.evicted = true + } else { + klog.V(4).InfoS("Pod is being removed by the kubelet, begin teardown", "pod", klog.KObj(pod), "podUID", pod.UID) + } + status.terminatingAt = now + becameTerminating = true + } + } + + // once a pod is terminating, all updates are kills and the grace period can only decrease + var workType PodWorkType + var wasGracePeriodShortened bool + switch { + case status.IsTerminated(): + workType = TerminatedPodWork + + if options.KillPodOptions != nil { + if ch := options.KillPodOptions.CompletedCh; ch != nil { + close(ch) + } + } + options.KillPodOptions = nil + + case status.IsTerminationRequested(): + workType = TerminatingPodWork + if options.KillPodOptions == nil { + options.KillPodOptions = &KillPodOptions{} + } + + if ch := options.KillPodOptions.CompletedCh; ch != nil { + status.notifyPostTerminating = append(status.notifyPostTerminating, ch) + } + if fn := options.KillPodOptions.PodStatusFunc; fn != nil { + status.statusPostTerminating = 
append(status.statusPostTerminating, fn) + } + + gracePeriod, gracePeriodShortened := calculateEffectiveGracePeriod(status, pod, options.KillPodOptions) + + wasGracePeriodShortened = gracePeriodShortened + status.gracePeriod = gracePeriod + // always set the grace period for syncTerminatingPod so we don't have to recalculate, + // will never be zero. + options.KillPodOptions.PodTerminationGracePeriodSecondsOverride = &gracePeriod + + // if a static pod comes through, start tracking it explicitly (cleared by the pod worker loop) + if kubelettypes.IsStaticPod(pod) { + p.terminatingStaticPodFullnames[kubecontainer.GetPodFullName(pod)] = struct{}{} + } + + default: + workType = SyncPodWork + + // KillPodOptions is not valid for sync actions outside of the terminating phase + if options.KillPodOptions != nil { + if ch := options.KillPodOptions.CompletedCh; ch != nil { + close(ch) + } + options.KillPodOptions = nil + } + } + + // the desired work we want to be performing + work := podWork{ + WorkType: workType, + Options: options, + } + + // start the pod worker goroutine if it doesn't exist + var podUpdates chan podWork + var exists bool if podUpdates, exists = p.podUpdates[uid]; !exists { // We need to have a buffer here, because checkForUpdates() method that // puts an update into channel is called from the same goroutine where // the channel is consumed. However, it is guaranteed that in such case // the channel is empty, so buffer of size 1 is enough. - podUpdates = make(chan UpdatePodOptions, 1) + podUpdates = make(chan podWork, 1) p.podUpdates[uid] = podUpdates // Creating a new pod worker either means this is a new pod, or that the @@ -221,76 +610,389 @@ func (p *podWorkers) UpdatePod(options *UpdatePodOptions) { p.managePodLoop(podUpdates) }() } - if !p.isWorking[pod.UID] { - p.isWorking[pod.UID] = true - podUpdates <- *options - } else { - // if a request to kill a pod is pending, we do not let anything overwrite that request. - update, found := p.lastUndeliveredWorkUpdate[pod.UID] - if !found || update.UpdateType != kubetypes.SyncPodKill { - p.lastUndeliveredWorkUpdate[pod.UID] = *options + + // dispatch a request to the pod worker if none are running + if !status.IsWorking() { + status.working = true + podUpdates <- work + return + } + + // capture the maximum latency between a requested update and when the pod + // worker observes it + if undelivered, ok := p.lastUndeliveredWorkUpdate[pod.UID]; ok { + // track the max latency between when a config change is requested and when it is realized + // NOTE: this undercounts the latency when multiple requests are queued, but captures max latency + if !undelivered.Options.StartTime.IsZero() && undelivered.Options.StartTime.Before(work.Options.StartTime) { + work.Options.StartTime = undelivered.Options.StartTime } } + + // always sync the most recent data + p.lastUndeliveredWorkUpdate[pod.UID] = work + + if (becameTerminating || wasGracePeriodShortened) && status.cancelFn != nil { + klog.V(3).InfoS("Cancelling current pod sync", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", work.WorkType) + status.cancelFn() + return + } } -func (p *podWorkers) removeWorker(uid types.UID) { - if ch, ok := p.podUpdates[uid]; ok { +// calculateEffectiveGracePeriod sets the initial grace period for a newly terminating pod or allows a +// shorter grace period to be provided, returning the desired value. 
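To make the grace period precedence concrete, an illustrative test-style sketch of the function defined below; it is not part of this patch and the test name is hypothetical, but the expected values follow the rules the function implements:

func TestCalculateEffectiveGracePeriodSketch(t *testing.T) {
	seconds := func(i int64) *int64 { return &i }

	// Pod requests 30s in its spec; nothing else is set, so the spec wins.
	pod := newPod("1", "fake")
	pod.Spec.TerminationGracePeriodSeconds = seconds(30)
	if gp, shortened := calculateEffectiveGracePeriod(&podSyncStatus{}, pod, nil); gp != 30 || shortened {
		t.Fatalf("unexpected: %d %t", gp, shortened)
	}

	// The apiserver's deletion grace period takes precedence over the spec.
	pod.DeletionGracePeriodSeconds = seconds(10)
	if gp, _ := calculateEffectiveGracePeriod(&podSyncStatus{}, pod, nil); gp != 10 {
		t.Fatalf("unexpected: %d", gp)
	}

	// A kubelet-side override (for example eviction) may only shorten it further.
	opts := &KillPodOptions{PodTerminationGracePeriodSecondsOverride: seconds(2)}
	if gp, _ := calculateEffectiveGracePeriod(&podSyncStatus{}, pod, opts); gp != 2 {
		t.Fatalf("unexpected: %d", gp)
	}

	// Once a grace period has been recorded, a smaller request reports "shortened".
	if gp, shortened := calculateEffectiveGracePeriod(&podSyncStatus{gracePeriod: 30}, pod, opts); gp != 2 || !shortened {
		t.Fatalf("unexpected: %d %t", gp, shortened)
	}

	// With no information at all the kubelet still uses a minimum of one second.
	if gp, _ := calculateEffectiveGracePeriod(&podSyncStatus{}, newPod("2", "fake"), nil); gp != 1 {
		t.Fatalf("unexpected: %d", gp)
	}
}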
+func calculateEffectiveGracePeriod(status *podSyncStatus, pod *v1.Pod, options *KillPodOptions) (int64, bool) {
+	// enforce the restriction that a grace period can only decrease and track whatever our value is,
+	// then ensure a calculated value is passed down to lower levels
+	gracePeriod := status.gracePeriod
+	// this value is bedrock truth - the apiserver owns this value and has already calculated it
+	if override := pod.DeletionGracePeriodSeconds; override != nil {
+		if gracePeriod == 0 || *override < gracePeriod {
+			gracePeriod = *override
+		}
+	}
+	// we allow other parts of the kubelet (namely eviction) to request this pod be terminated faster
+	if options != nil {
+		if override := options.PodTerminationGracePeriodSecondsOverride; override != nil {
+			if gracePeriod == 0 || *override < gracePeriod {
+				gracePeriod = *override
+			}
+		}
+	}
+	// make a best effort to default this value to the pod's desired intent, in the event
+	// the kubelet provided no requested value (a normal graceful termination)
+	if gracePeriod == 0 && pod.Spec.TerminationGracePeriodSeconds != nil {
+		gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
+	}
+	// no matter what, we always supply a grace period of at least 1
+	if gracePeriod < 1 {
+		gracePeriod = 1
+	}
+	return gracePeriod, status.gracePeriod != 0 && status.gracePeriod != gracePeriod
+}
+
+func (p *podWorkers) managePodLoop(podUpdates <-chan podWork) {
+	var lastSyncTime time.Time
+	for update := range podUpdates {
+		pod := update.Options.Pod
+
+		klog.V(4).InfoS("Processing pod event", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType)
+		err := func() error {
+			// The worker is responsible for ensuring the sync method sees the appropriate
+			// status updates on resyncs (the result of the last sync), transitions to
+			// terminating (no wait), or on terminated (whatever the most recent state is).
+			// Only syncing and terminating can generate pod status changes, while terminated
+			// pods ensure the most recent status makes it to the api server.
+			var status *kubecontainer.PodStatus
+			var err error
+			switch {
+			case update.Options.RunningPod != nil:
+				// when we receive a running pod, we don't need status at all
+			default:
+				// wait until we see the next refresh from the PLEG via the cache (max 2s)
+				// TODO: this adds ~1s of latency on all transitions from sync to terminating
+				// to terminated, and on all termination retries (including evictions). We should
+				// improve latency by making the PLEG continuous and by allowing pod status
+				// changes to be refreshed when key events happen (killPod, sync->terminating).
+				// Improving this latency also reduces the possibility that a terminated
+				// container's status is garbage collected before we have a chance to update the
+				// API server (thus losing the exit code).
+ status, err = p.podCache.GetNewerThan(pod.UID, lastSyncTime) + } + if err != nil { + // This is the legacy event thrown by manage pod loop all other events are now dispatched + // from syncPodFn + p.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedSync, "error determining status: %v", err) + return err + } + + ctx := p.contextForWorker(pod.UID) + + // Take the appropriate action (illegal phases are prevented by UpdatePod) + switch { + case update.WorkType == TerminatedPodWork: + err = p.syncTerminatedPodFn(ctx, pod, status) + + case update.WorkType == TerminatingPodWork: + var gracePeriod *int64 + if opt := update.Options.KillPodOptions; opt != nil { + gracePeriod = opt.PodTerminationGracePeriodSecondsOverride + } + podStatusFn := p.acknowledgeTerminating(pod) + + err = p.syncTerminatingPodFn(ctx, pod, status, update.Options.RunningPod, gracePeriod, podStatusFn) + + default: + err = p.syncPodFn(ctx, update.Options.UpdateType, pod, update.Options.MirrorPod, status) + } + + lastSyncTime = time.Now() + return err + }() + + switch { + case err == context.Canceled: + // when the context is cancelled we expect an update to already be queued + klog.V(2).InfoS("Sync exited with context cancellation error", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType) + + case err != nil: + // we will queue a retry + klog.ErrorS(err, "Error syncing pod, skipping", "pod", klog.KObj(pod), "podUID", pod.UID) + + case update.WorkType == TerminatedPodWork: + // we can shut down the worker + p.completeTerminated(pod) + if start := update.Options.StartTime; !start.IsZero() { + metrics.PodWorkerDuration.WithLabelValues("terminated").Observe(metrics.SinceInSeconds(start)) + } + klog.V(4).InfoS("Processing pod event done", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType) + return + + case update.WorkType == TerminatingPodWork: + // pods that don't exist in config don't need to be terminated, garbage collection will cover them + if update.Options.RunningPod != nil { + p.completeTerminatingRuntimePod(pod) + if start := update.Options.StartTime; !start.IsZero() { + metrics.PodWorkerDuration.WithLabelValues(update.Options.UpdateType.String()).Observe(metrics.SinceInSeconds(start)) + } + klog.V(4).InfoS("Processing pod event done", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType) + return + } + // otherwise we move to the terminating phase + p.completeTerminating(pod) + } + + // queue a retry for errors if necessary, then put the next event in the channel if any + p.completeWork(pod, err) + if start := update.Options.StartTime; !start.IsZero() { + metrics.PodWorkerDuration.WithLabelValues(update.Options.UpdateType.String()).Observe(metrics.SinceInSeconds(start)) + } + klog.V(4).InfoS("Processing pod event done", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType) + } +} + +// acknowledgeTerminating sets the terminating flag on the pod status once the pod worker sees +// the termination state so that other components know no new containers will be started in this +// pod. It then returns the status function, if any, that applies to this pod. 
+func (p *podWorkers) acknowledgeTerminating(pod *v1.Pod) PodStatusFunc { + p.podLock.Lock() + defer p.podLock.Unlock() + + status, ok := p.podSyncStatuses[pod.UID] + if !ok { + return nil + } + + if !status.terminatingAt.IsZero() && !status.startedTerminating { + klog.V(4).InfoS("Pod worker has observed request to terminate", "pod", klog.KObj(pod), "podUID", pod.UID) + status.startedTerminating = true + } + + if l := len(status.statusPostTerminating); l > 0 { + return status.statusPostTerminating[l-1] + } + return nil +} + +// completeTerminating is invoked when syncTerminatingPod completes successfully, which means +// no container is running, no container will be started in the future, and we are ready for +// cleanup. This updates the termination state which prevents future syncs and will ensure +// other kubelet loops know this pod is not running any containers. +func (p *podWorkers) completeTerminating(pod *v1.Pod) { + p.podLock.Lock() + defer p.podLock.Unlock() + + klog.V(4).InfoS("Pod terminated all containers successfully", "pod", klog.KObj(pod), "podUID", pod.UID) + + // if a static pod is being tracked, forget it + delete(p.terminatingStaticPodFullnames, kubecontainer.GetPodFullName(pod)) + + if status, ok := p.podSyncStatuses[pod.UID]; ok { + if status.terminatingAt.IsZero() { + klog.V(4).InfoS("Pod worker was terminated but did not have terminatingAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID) + } + status.terminatedAt = time.Now() + for _, ch := range status.notifyPostTerminating { + close(ch) + } + status.notifyPostTerminating = nil + status.statusPostTerminating = nil + } + + p.lastUndeliveredWorkUpdate[pod.UID] = podWork{ + WorkType: TerminatedPodWork, + Options: UpdatePodOptions{ + Pod: pod, + }, + } +} + +// completeTerminatingRuntimePod is invoked when syncTerminatingPod completes successfully, +// which means an orphaned pod (no config) is terminated and we can exit. Since orphaned +// pods have no API representation, we want to exit the loop at this point +// cleanup. This updates the termination state which prevents future syncs and will ensure +// other kubelet loops know this pod is not running any containers. +func (p *podWorkers) completeTerminatingRuntimePod(pod *v1.Pod) { + p.podLock.Lock() + defer p.podLock.Unlock() + + klog.V(4).InfoS("Pod terminated all orphaned containers successfully and worker can now stop", "pod", klog.KObj(pod), "podUID", pod.UID) + + // if a static pod is being tracked, forget it + delete(p.terminatingStaticPodFullnames, kubecontainer.GetPodFullName(pod)) + + if status, ok := p.podSyncStatuses[pod.UID]; ok { + if status.terminatingAt.IsZero() { + klog.V(4).InfoS("Pod worker was terminated but did not have terminatingAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID) + } + status.terminatedAt = time.Now() + status.finished = true + status.working = false + } + + ch, ok := p.podUpdates[pod.UID] + if ok { close(ch) - delete(p.podUpdates, uid) - // If there is an undelivered work update for this pod we need to remove it - // since per-pod goroutine won't be able to put it to the already closed - // channel when it finishes processing the current work update. 
- delete(p.lastUndeliveredWorkUpdate, uid) } -} -func (p *podWorkers) ForgetWorker(uid types.UID) { - p.podLock.Lock() - defer p.podLock.Unlock() - p.removeWorker(uid) + delete(p.podUpdates, pod.UID) + delete(p.lastUndeliveredWorkUpdate, pod.UID) + delete(p.terminatingStaticPodFullnames, kubecontainer.GetPodFullName(pod)) } -func (p *podWorkers) ForgetNonExistingPodWorkers(desiredPods map[types.UID]sets.Empty) { +// completeTerminated is invoked after syncTerminatedPod completes successfully and means we +// can stop the pod worker. The pod is finalized at this point. +func (p *podWorkers) completeTerminated(pod *v1.Pod) { p.podLock.Lock() defer p.podLock.Unlock() - for key := range p.podUpdates { - if _, exists := desiredPods[key]; !exists { - p.removeWorker(key) + + klog.V(4).InfoS("Pod is complete and the worker can now stop", "pod", klog.KObj(pod), "podUID", pod.UID) + + ch, ok := p.podUpdates[pod.UID] + if ok { + close(ch) + } + delete(p.podUpdates, pod.UID) + delete(p.lastUndeliveredWorkUpdate, pod.UID) + delete(p.terminatingStaticPodFullnames, kubecontainer.GetPodFullName(pod)) + + if status, ok := p.podSyncStatuses[pod.UID]; ok { + if status.terminatingAt.IsZero() { + klog.V(4).InfoS("Pod worker is complete but did not have terminatingAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID) } + if status.terminatedAt.IsZero() { + klog.V(4).InfoS("Pod worker is complete but did not have terminatedAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID) + } + status.finished = true + status.working = false } } -func (p *podWorkers) wrapUp(uid types.UID, syncErr error) { +// completeWork requeues on error or the next sync interval and then immediately executes any pending +// work. +func (p *podWorkers) completeWork(pod *v1.Pod, syncErr error) { // Requeue the last update if the last sync returned error. switch { case syncErr == nil: // No error; requeue at the regular resync interval. - p.workQueue.Enqueue(uid, wait.Jitter(p.resyncInterval, workerResyncIntervalJitterFactor)) + p.workQueue.Enqueue(pod.UID, wait.Jitter(p.resyncInterval, workerResyncIntervalJitterFactor)) case strings.Contains(syncErr.Error(), NetworkNotReadyErrorMsg): // Network is not ready; back off for short period of time and retry as network might be ready soon. - p.workQueue.Enqueue(uid, wait.Jitter(backOffOnTransientErrorPeriod, workerBackOffPeriodJitterFactor)) + p.workQueue.Enqueue(pod.UID, wait.Jitter(backOffOnTransientErrorPeriod, workerBackOffPeriodJitterFactor)) default: // Error occurred during the sync; back off and then retry. - p.workQueue.Enqueue(uid, wait.Jitter(p.backOffPeriod, workerBackOffPeriodJitterFactor)) + p.workQueue.Enqueue(pod.UID, wait.Jitter(p.backOffPeriod, workerBackOffPeriodJitterFactor)) } - p.checkForUpdates(uid) + p.completeWorkQueueNext(pod.UID) } -func (p *podWorkers) checkForUpdates(uid types.UID) { +// completeWorkQueueNext holds the lock and either queues the next work item for the worker or +// clears the working status. +func (p *podWorkers) completeWorkQueueNext(uid types.UID) { p.podLock.Lock() defer p.podLock.Unlock() if workUpdate, exists := p.lastUndeliveredWorkUpdate[uid]; exists { p.podUpdates[uid] <- workUpdate delete(p.lastUndeliveredWorkUpdate, uid) } else { - // put this line in removeWorker may cause dead lock, so keep reset it here - delete(p.isWorking, uid) + p.podSyncStatuses[uid].working = false } } +// contextForWorker returns or initializes the appropriate context for a known +// worker. 
If the current context is expired, it is reset. If no worker is
+// present, no context is returned.
+func (p *podWorkers) contextForWorker(uid types.UID) context.Context {
+	p.podLock.Lock()
+	defer p.podLock.Unlock()
+
+	status, ok := p.podSyncStatuses[uid]
+	if !ok {
+		return nil
+	}
+	if status.ctx == nil || status.ctx.Err() == context.Canceled {
+		status.ctx, status.cancelFn = context.WithCancel(context.Background())
+	}
+	return status.ctx
+}
+
+// SyncKnownPods will purge any fully terminated pods that are not in the desiredPods
+// list, which means SyncKnownPods must be called in a threadsafe manner relative to
+// UpdatePod calls for new pods. It returns a map of known workers that are not finished,
+// with a value of TerminatedPodWork, TerminatingPodWork, or SyncPodWork depending on
+// whether the pod is terminated, terminating, or syncing.
+func (p *podWorkers) SyncKnownPods(desiredPods []*v1.Pod) map[types.UID]PodWorkType {
+	workers := make(map[types.UID]PodWorkType)
+	known := make(map[types.UID]struct{})
+	for _, pod := range desiredPods {
+		known[pod.UID] = struct{}{}
+	}
+
+	p.podLock.Lock()
+	defer p.podLock.Unlock()
+
+	p.podsSynced = true
+	for uid, status := range p.podSyncStatuses {
+		if _, exists := known[uid]; !exists {
+			p.removeTerminatedWorker(uid)
+		}
+		switch {
+		case !status.terminatedAt.IsZero():
+			workers[uid] = TerminatedPodWork
+		case !status.terminatingAt.IsZero():
+			workers[uid] = TerminatingPodWork
+		default:
+			workers[uid] = SyncPodWork
+		}
+	}
+	return workers
+}
+
+// removeTerminatedWorker cleans up and removes the worker status for a worker that
+// has reached a terminal state of "finished" - has successfully exited
+// syncTerminatedPod. This "forgets" a pod by UID and allows another pod to be recreated
+// with the same UID.
+func (p *podWorkers) removeTerminatedWorker(uid types.UID) {
+	status, ok := p.podSyncStatuses[uid]
+	if !ok {
+		// already forgotten, or forgotten too early
+		klog.V(4).InfoS("Pod worker has been requested for removal but is not a known pod", "podUID", uid)
+		return
+	}
+
+	if !status.finished {
+		klog.V(4).InfoS("Pod worker has been requested for removal but is still not fully terminated", "podUID", uid)
+		return
+	}
+
+	klog.V(4).InfoS("Pod has been terminated and is no longer known to the kubelet, remove all history", "podUID", uid)
+	delete(p.podSyncStatuses, uid)
+	delete(p.podUpdates, uid)
+	delete(p.lastUndeliveredWorkUpdate, uid)
+}
+
 // killPodNow returns a KillPodFunc that can be used to kill a pod.
 // It is intended to be injected into other modules that need to kill a pod.
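As a hedged usage sketch for the function below (not part of this patch), a caller holding the returned KillPodFunc would invoke it roughly like this; the wrapper name, reason, and message are placeholders:

// killPodAndFailSketch kills a pod via the KillPodFunc produced by killPodNow,
// overriding the grace period and the final status written for the pod. The
// call blocks until the pod worker reports termination or the derived timeout
// expires.
func killPodAndFailSketch(pw PodWorkers, recorder record.EventRecorder, pod *v1.Pod) error {
	killPod := killPodNow(pw, recorder)
	gracePeriodOverride := int64(5)
	return killPod(pod, false /* not an eviction */, &gracePeriodOverride, func(status *v1.PodStatus) {
		status.Phase = v1.PodFailed
		status.Reason = "IllustrativeReason"
		status.Message = "killed by an illustrative caller"
	})
}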
func killPodNow(podWorkers PodWorkers, recorder record.EventRecorder) eviction.KillPodFunc { - return func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int64) error { + return func(pod *v1.Pod, isEvicted bool, gracePeriodOverride *int64, statusFn func(*v1.PodStatus)) error { // determine the grace period to use when killing the pod gracePeriod := int64(0) if gracePeriodOverride != nil { @@ -309,28 +1011,22 @@ func killPodNow(podWorkers PodWorkers, recorder record.EventRecorder) eviction.K timeoutDuration := time.Duration(timeout) * time.Second // open a channel we block against until we get a result - type response struct { - err error - } - ch := make(chan response, 1) - podWorkers.UpdatePod(&UpdatePodOptions{ + ch := make(chan struct{}, 1) + podWorkers.UpdatePod(UpdatePodOptions{ Pod: pod, UpdateType: kubetypes.SyncPodKill, - OnCompleteFunc: func(err error) { - ch <- response{err: err} - }, KillPodOptions: &KillPodOptions{ - PodStatusFunc: func(p *v1.Pod, podStatus *kubecontainer.PodStatus) v1.PodStatus { - return status - }, + CompletedCh: ch, + Evict: isEvicted, + PodStatusFunc: statusFn, PodTerminationGracePeriodSecondsOverride: gracePeriodOverride, }, }) // wait for either a response, or a timeout select { - case r := <-ch: - return r.err + case <-ch: + return nil case <-time.After(timeoutDuration): recorder.Eventf(pod, v1.EventTypeWarning, events.ExceededGracePeriod, "Container runtime did not kill the pod within specified grace period.") return fmt.Errorf("timeout waiting to kill pod") diff --git a/pkg/kubelet/pod_workers_test.go b/pkg/kubelet/pod_workers_test.go index 59eaae48fe1..a23daa8ca6b 100644 --- a/pkg/kubelet/pod_workers_test.go +++ b/pkg/kubelet/pod_workers_test.go @@ -17,6 +17,7 @@ limitations under the License. package kubelet import ( + "context" "reflect" "strconv" "sync" @@ -38,30 +39,82 @@ import ( // fakePodWorkers runs sync pod function in serial, so we can have // deterministic behaviour in testing. 
type fakePodWorkers struct { + lock sync.Mutex syncPodFn syncPodFnType cache kubecontainer.Cache t TestingInterface + + triggeredDeletion []types.UID + + statusLock sync.Mutex + running map[types.UID]bool + terminating map[types.UID]bool + terminationRequested map[types.UID]bool + removeRuntime map[types.UID]bool + removeContent map[types.UID]bool + terminatingStaticPods map[string]bool } -func (f *fakePodWorkers) UpdatePod(options *UpdatePodOptions) { - status, err := f.cache.Get(options.Pod.UID) +func (f *fakePodWorkers) UpdatePod(options UpdatePodOptions) { + f.lock.Lock() + defer f.lock.Unlock() + var uid types.UID + switch { + case options.Pod != nil: + uid = options.Pod.UID + case options.RunningPod != nil: + uid = options.RunningPod.ID + default: + return + } + status, err := f.cache.Get(uid) if err != nil { f.t.Errorf("Unexpected error: %v", err) } - if err := f.syncPodFn(syncPodOptions{ - mirrorPod: options.MirrorPod, - pod: options.Pod, - podStatus: status, - updateType: options.UpdateType, - killPodOptions: options.KillPodOptions, - }); err != nil { - f.t.Errorf("Unexpected error: %v", err) + switch options.UpdateType { + case kubetypes.SyncPodKill: + f.triggeredDeletion = append(f.triggeredDeletion, uid) + default: + if err := f.syncPodFn(context.Background(), options.UpdateType, options.Pod, options.MirrorPod, status); err != nil { + f.t.Errorf("Unexpected error: %v", err) + } } } -func (f *fakePodWorkers) ForgetNonExistingPodWorkers(desiredPods map[types.UID]sets.Empty) {} +func (f *fakePodWorkers) SyncKnownPods(desiredPods []*v1.Pod) map[types.UID]PodWorkType { + return nil +} -func (f *fakePodWorkers) ForgetWorker(uid types.UID) {} +func (f *fakePodWorkers) CouldHaveRunningContainers(uid types.UID) bool { + f.statusLock.Lock() + defer f.statusLock.Unlock() + return f.running[uid] +} +func (f *fakePodWorkers) IsPodTerminationRequested(uid types.UID) bool { + f.statusLock.Lock() + defer f.statusLock.Unlock() + return f.terminationRequested[uid] +} +func (f *fakePodWorkers) ShouldPodContainersBeTerminating(uid types.UID) bool { + f.statusLock.Lock() + defer f.statusLock.Unlock() + return f.terminating[uid] +} +func (f *fakePodWorkers) ShouldPodRuntimeBeRemoved(uid types.UID) bool { + f.statusLock.Lock() + defer f.statusLock.Unlock() + return f.removeRuntime[uid] +} +func (f *fakePodWorkers) ShouldPodContentBeRemoved(uid types.UID) bool { + f.statusLock.Lock() + defer f.statusLock.Unlock() + return f.removeContent[uid] +} +func (f *fakePodWorkers) IsPodForMirrorPodTerminatingByFullName(podFullname string) bool { + f.statusLock.Lock() + defer f.statusLock.Unlock() + return f.terminatingStaticPods[podFullname] +} type TestingInterface interface { Errorf(format string, args ...interface{}) @@ -80,6 +133,8 @@ func newPod(uid, name string) *v1.Pod { type syncPodRecord struct { name string updateType kubetypes.SyncPodType + runningPod *kubecontainer.Pod + terminated bool } func createPodWorkers() (*podWorkers, map[types.UID][]syncPodRecord) { @@ -88,15 +143,38 @@ func createPodWorkers() (*podWorkers, map[types.UID][]syncPodRecord) { fakeRecorder := &record.FakeRecorder{} fakeRuntime := &containertest.FakeRuntime{} fakeCache := containertest.NewFakeCache(fakeRuntime) - podWorkers := newPodWorkers( - func(options syncPodOptions) error { + w := newPodWorkers( + func(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) error { func() { lock.Lock() defer lock.Unlock() - pod := options.pod + pod := pod processed[pod.UID] = 
append(processed[pod.UID], syncPodRecord{ name: pod.Name, - updateType: options.updateType, + updateType: updateType, + }) + }() + return nil + }, + func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, runningPod *kubecontainer.Pod, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error { + func() { + lock.Lock() + defer lock.Unlock() + processed[pod.UID] = append(processed[pod.UID], syncPodRecord{ + name: pod.Name, + updateType: kubetypes.SyncPodKill, + runningPod: runningPod, + }) + }() + return nil + }, + func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error { + func() { + lock.Lock() + defer lock.Unlock() + processed[pod.UID] = append(processed[pod.UID], syncPodRecord{ + name: pod.Name, + terminated: true, }) }() return nil @@ -107,7 +185,7 @@ func createPodWorkers() (*podWorkers, map[types.UID][]syncPodRecord) { time.Second, fakeCache, ) - return podWorkers, processed + return w.(*podWorkers), processed } func drainWorkers(podWorkers *podWorkers, numPods int) { @@ -115,8 +193,27 @@ func drainWorkers(podWorkers *podWorkers, numPods int) { stillWorking := false podWorkers.podLock.Lock() for i := 0; i < numPods; i++ { - if podWorkers.isWorking[types.UID(strconv.Itoa(i))] { + if s, ok := podWorkers.podSyncStatuses[types.UID(strconv.Itoa(i))]; ok && s.working { stillWorking = true + break + } + } + podWorkers.podLock.Unlock() + if !stillWorking { + break + } + time.Sleep(50 * time.Millisecond) + } +} + +func drainAllWorkers(podWorkers *podWorkers) { + for { + stillWorking := false + podWorkers.podLock.Lock() + for _, worker := range podWorkers.podSyncStatuses { + if worker.working { + stillWorking = true + break } } podWorkers.podLock.Unlock() @@ -133,7 +230,7 @@ func TestUpdatePod(t *testing.T) { numPods := 20 for i := 0; i < numPods; i++ { for j := i; j < numPods; j++ { - podWorkers.UpdatePod(&UpdatePodOptions{ + podWorkers.UpdatePod(UpdatePodOptions{ Pod: newPod(strconv.Itoa(j), strconv.Itoa(i)), UpdateType: kubetypes.SyncPodCreate, }) @@ -165,20 +262,51 @@ func TestUpdatePod(t *testing.T) { } } +func TestUpdatePodForRuntimePod(t *testing.T) { + podWorkers, processed := createPodWorkers() + + // ignores running pod of wrong sync type + podWorkers.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + RunningPod: &kubecontainer.Pod{ID: "1", Name: "1", Namespace: "test"}, + }) + drainAllWorkers(podWorkers) + if len(processed) != 0 { + t.Fatalf("Not all pods processed: %v", len(processed)) + } + + // creates synthetic pod + podWorkers.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodKill, + RunningPod: &kubecontainer.Pod{ID: "1", Name: "1", Namespace: "test"}, + }) + drainAllWorkers(podWorkers) + if len(processed) != 1 { + t.Fatalf("Not all pods processed: %v", processed) + } + updates := processed["1"] + if len(updates) != 1 { + t.Fatalf("unexpected updates: %v", updates) + } + if updates[0].runningPod == nil || updates[0].updateType != kubetypes.SyncPodKill || updates[0].name != "1" { + t.Fatalf("unexpected update: %v", updates) + } +} + func TestUpdatePodDoesNotForgetSyncPodKill(t *testing.T) { podWorkers, processed := createPodWorkers() numPods := 20 for i := 0; i < numPods; i++ { pod := newPod(strconv.Itoa(i), strconv.Itoa(i)) - podWorkers.UpdatePod(&UpdatePodOptions{ + podWorkers.UpdatePod(UpdatePodOptions{ Pod: pod, UpdateType: kubetypes.SyncPodCreate, }) - podWorkers.UpdatePod(&UpdatePodOptions{ + podWorkers.UpdatePod(UpdatePodOptions{ Pod: pod, UpdateType: kubetypes.SyncPodKill, }) - 
podWorkers.UpdatePod(&UpdatePodOptions{ + podWorkers.UpdatePod(UpdatePodOptions{ Pod: pod, UpdateType: kubetypes.SyncPodUpdate, }) @@ -205,12 +333,12 @@ func TestUpdatePodDoesNotForgetSyncPodKill(t *testing.T) { } } -func TestForgetNonExistingPodWorkers(t *testing.T) { +func TestSyncKnownPods(t *testing.T) { podWorkers, _ := createPodWorkers() numPods := 20 for i := 0; i < numPods; i++ { - podWorkers.UpdatePod(&UpdatePodOptions{ + podWorkers.UpdatePod(UpdatePodOptions{ Pod: newPod(strconv.Itoa(i), "name"), UpdateType: kubetypes.SyncPodUpdate, }) @@ -224,7 +352,72 @@ func TestForgetNonExistingPodWorkers(t *testing.T) { desiredPods := map[types.UID]sets.Empty{} desiredPods[types.UID("2")] = sets.Empty{} desiredPods[types.UID("14")] = sets.Empty{} - podWorkers.ForgetNonExistingPodWorkers(desiredPods) + desiredPodList := []*v1.Pod{newPod("2", "name"), newPod("14", "name")} + + // kill all but the requested pods + for i := 0; i < numPods; i++ { + pod := newPod(strconv.Itoa(i), "name") + if _, ok := desiredPods[pod.UID]; ok { + continue + } + if (i % 2) == 0 { + now := metav1.Now() + pod.DeletionTimestamp = &now + } + podWorkers.UpdatePod(UpdatePodOptions{ + Pod: pod, + UpdateType: kubetypes.SyncPodKill, + }) + } + drainWorkers(podWorkers, numPods) + + if !podWorkers.ShouldPodContainersBeTerminating(types.UID("0")) { + t.Errorf("Expected pod to be terminating") + } + if !podWorkers.ShouldPodContainersBeTerminating(types.UID("1")) { + t.Errorf("Expected pod to be terminating") + } + if podWorkers.ShouldPodContainersBeTerminating(types.UID("2")) { + t.Errorf("Expected pod to not be terminating") + } + if !podWorkers.IsPodTerminationRequested(types.UID("0")) { + t.Errorf("Expected pod to be terminating") + } + if podWorkers.IsPodTerminationRequested(types.UID("2")) { + t.Errorf("Expected pod to not be terminating") + } + + if podWorkers.CouldHaveRunningContainers(types.UID("0")) { + t.Errorf("Expected pod to be terminated (deleted and terminated)") + } + if podWorkers.CouldHaveRunningContainers(types.UID("1")) { + t.Errorf("Expected pod to be terminated") + } + if !podWorkers.CouldHaveRunningContainers(types.UID("2")) { + t.Errorf("Expected pod to not be terminated") + } + + if !podWorkers.ShouldPodContentBeRemoved(types.UID("0")) { + t.Errorf("Expected pod to be suitable for removal (deleted and terminated)") + } + if podWorkers.ShouldPodContentBeRemoved(types.UID("1")) { + t.Errorf("Expected pod to not be suitable for removal (terminated but not deleted)") + } + if podWorkers.ShouldPodContentBeRemoved(types.UID("2")) { + t.Errorf("Expected pod to not be suitable for removal (not terminated)") + } + + if podWorkers.ShouldPodContainersBeTerminating(types.UID("abc")) { + t.Errorf("Expected pod to not be known to be terminating (does not exist but not yet synced)") + } + if !podWorkers.CouldHaveRunningContainers(types.UID("abc")) { + t.Errorf("Expected pod to potentially have running containers (does not exist but not yet synced)") + } + if podWorkers.ShouldPodContentBeRemoved(types.UID("abc")) { + t.Errorf("Expected pod to not be suitable for removal (does not exist but not yet synced)") + } + + podWorkers.SyncKnownPods(desiredPodList) if len(podWorkers.podUpdates) != 2 { t.Errorf("Incorrect number of open channels %v", len(podWorkers.podUpdates)) } @@ -234,34 +427,52 @@ func TestForgetNonExistingPodWorkers(t *testing.T) { if _, exists := podWorkers.podUpdates[types.UID("14")]; !exists { t.Errorf("No updates channel for pod 14") } + if podWorkers.IsPodTerminationRequested(types.UID("2")) { + 
t.Errorf("Expected pod termination request to be cleared after sync") + } - podWorkers.ForgetNonExistingPodWorkers(map[types.UID]sets.Empty{}) - if len(podWorkers.podUpdates) != 0 { + if !podWorkers.ShouldPodContainersBeTerminating(types.UID("abc")) { + t.Errorf("Expected pod to be expected to terminate containers (does not exist and synced at least once)") + } + if podWorkers.CouldHaveRunningContainers(types.UID("abc")) { + t.Errorf("Expected pod to be known not to have running containers (does not exist and synced at least once)") + } + if !podWorkers.ShouldPodContentBeRemoved(types.UID("abc")) { + t.Errorf("Expected pod to be suitable for removal (does not exist and synced at least once)") + } + + // verify workers that are not terminated stay open even if config no longer + // sees them + podWorkers.SyncKnownPods(nil) + if len(podWorkers.podUpdates) != 2 { t.Errorf("Incorrect number of open channels %v", len(podWorkers.podUpdates)) } -} + if len(podWorkers.podSyncStatuses) != 2 { + t.Errorf("Incorrect number of tracked statuses: %#v", podWorkers.podSyncStatuses) + } + if len(podWorkers.lastUndeliveredWorkUpdate) != 0 { + t.Errorf("Incorrect number of tracked statuses: %#v", podWorkers.lastUndeliveredWorkUpdate) + } -func TestIsWorkingClearedAfterForgetPodWorkers(t *testing.T) { - podWorkers, _ := createPodWorkers() - - numPods := 20 - for i := 0; i < numPods; i++ { - podWorkers.UpdatePod(&UpdatePodOptions{ - Pod: newPod(strconv.Itoa(i), "name"), - UpdateType: kubetypes.SyncPodUpdate, + for uid := range desiredPods { + pod := newPod(string(uid), "name") + podWorkers.UpdatePod(UpdatePodOptions{ + Pod: pod, + UpdateType: kubetypes.SyncPodKill, }) } drainWorkers(podWorkers, numPods) - if len(podWorkers.podUpdates) != numPods { - t.Errorf("Incorrect number of open channels %v", len(podWorkers.podUpdates)) - } - podWorkers.ForgetNonExistingPodWorkers(map[types.UID]sets.Empty{}) + // verify once those pods terminate (via some other flow) the workers are cleared + podWorkers.SyncKnownPods(nil) if len(podWorkers.podUpdates) != 0 { t.Errorf("Incorrect number of open channels %v", len(podWorkers.podUpdates)) } - if len(podWorkers.isWorking) != 0 { - t.Errorf("Incorrect number of isWorking %v", len(podWorkers.isWorking)) + if len(podWorkers.podSyncStatuses) != 0 { + t.Errorf("Incorrect number of tracked statuses: %#v", podWorkers.podSyncStatuses) + } + if len(podWorkers.lastUndeliveredWorkUpdate) != 0 { + t.Errorf("Incorrect number of tracked statuses: %#v", podWorkers.lastUndeliveredWorkUpdate) } } @@ -272,17 +483,25 @@ type simpleFakeKubelet struct { wg sync.WaitGroup } -func (kl *simpleFakeKubelet) syncPod(options syncPodOptions) error { - kl.pod, kl.mirrorPod, kl.podStatus = options.pod, options.mirrorPod, options.podStatus +func (kl *simpleFakeKubelet) syncPod(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) error { + kl.pod, kl.mirrorPod, kl.podStatus = pod, mirrorPod, podStatus return nil } -func (kl *simpleFakeKubelet) syncPodWithWaitGroup(options syncPodOptions) error { - kl.pod, kl.mirrorPod, kl.podStatus = options.pod, options.mirrorPod, options.podStatus +func (kl *simpleFakeKubelet) syncPodWithWaitGroup(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) error { + kl.pod, kl.mirrorPod, kl.podStatus = pod, mirrorPod, podStatus kl.wg.Done() return nil } +func (kl *simpleFakeKubelet) syncTerminatingPod(ctx context.Context, pod *v1.Pod, podStatus 
*kubecontainer.PodStatus, runningPod *kubecontainer.Pod, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error { + return nil +} + +func (kl *simpleFakeKubelet) syncTerminatedPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error { + return nil +} + // TestFakePodWorkers verifies that the fakePodWorkers behaves the same way as the real podWorkers // for their invocation of the syncPodFn. func TestFakePodWorkers(t *testing.T) { @@ -293,8 +512,16 @@ func TestFakePodWorkers(t *testing.T) { kubeletForRealWorkers := &simpleFakeKubelet{} kubeletForFakeWorkers := &simpleFakeKubelet{} - realPodWorkers := newPodWorkers(kubeletForRealWorkers.syncPodWithWaitGroup, fakeRecorder, queue.NewBasicWorkQueue(&clock.RealClock{}), time.Second, time.Second, fakeCache) - fakePodWorkers := &fakePodWorkers{kubeletForFakeWorkers.syncPod, fakeCache, t} + realPodWorkers := newPodWorkers( + kubeletForRealWorkers.syncPodWithWaitGroup, + kubeletForRealWorkers.syncTerminatingPod, + kubeletForRealWorkers.syncTerminatedPod, + fakeRecorder, queue.NewBasicWorkQueue(&clock.RealClock{}), time.Second, time.Second, fakeCache) + fakePodWorkers := &fakePodWorkers{ + syncPodFn: kubeletForFakeWorkers.syncPod, + cache: fakeCache, + t: t, + } tests := []struct { pod *v1.Pod @@ -316,12 +543,12 @@ func TestFakePodWorkers(t *testing.T) { for i, tt := range tests { kubeletForRealWorkers.wg.Add(1) - realPodWorkers.UpdatePod(&UpdatePodOptions{ + realPodWorkers.UpdatePod(UpdatePodOptions{ Pod: tt.pod, MirrorPod: tt.mirrorPod, UpdateType: kubetypes.SyncPodUpdate, }) - fakePodWorkers.UpdatePod(&UpdatePodOptions{ + fakePodWorkers.UpdatePod(UpdatePodOptions{ Pod: tt.pod, MirrorPod: tt.mirrorPod, UpdateType: kubetypes.SyncPodUpdate, @@ -350,19 +577,26 @@ func TestKillPodNowFunc(t *testing.T) { killPodFunc := killPodNow(podWorkers, fakeRecorder) pod := newPod("test", "test") gracePeriodOverride := int64(0) - err := killPodFunc(pod, v1.PodStatus{Phase: v1.PodFailed, Reason: "reason", Message: "message"}, &gracePeriodOverride) + err := killPodFunc(pod, false, &gracePeriodOverride, func(status *v1.PodStatus) { + status.Phase = v1.PodFailed + status.Reason = "reason" + status.Message = "message" + }) if err != nil { - t.Errorf("Unexpected error: %v", err) + t.Fatalf("Unexpected error: %v", err) } + drainAllWorkers(podWorkers) if len(processed) != 1 { - t.Errorf("len(processed) expected: %v, actual: %v", 1, len(processed)) - return + t.Fatalf("len(processed) expected: %v, actual: %#v", 1, processed) } syncPodRecords := processed[pod.UID] - if len(syncPodRecords) != 1 { - t.Errorf("Pod processed %v times, but expected %v", len(syncPodRecords), 1) + if len(syncPodRecords) != 2 { + t.Fatalf("Pod processed expected %v times, got %#v", 1, syncPodRecords) } if syncPodRecords[0].updateType != kubetypes.SyncPodKill { t.Errorf("Pod update type was %v, but expected %v", syncPodRecords[0].updateType, kubetypes.SyncPodKill) } + if !syncPodRecords[1].terminated { + t.Errorf("Pod terminated %v, but expected %v", syncPodRecords[1].terminated, true) + } } diff --git a/pkg/kubelet/preemption/preemption.go b/pkg/kubelet/preemption/preemption.go index 89ee669dcc0..ca1b1770f9d 100644 --- a/pkg/kubelet/preemption/preemption.go +++ b/pkg/kubelet/preemption/preemption.go @@ -20,7 +20,7 @@ import ( "fmt" "math" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/api/v1/resource" @@ -94,18 +94,16 @@ func (c *CriticalPodAdmissionHandler) evictPodsToFreeRequests(admitPod *v1.Pod, 
if err != nil { return fmt.Errorf("preemption: error finding a set of pods to preempt: %v", err) } - klog.InfoS("Attempting to evict pods in order to free up resources", "pods", podsToPreempt, - "insufficientResources", insufficientResources.toString()) for _, pod := range podsToPreempt { - status := v1.PodStatus{ - Phase: v1.PodFailed, - Message: message, - Reason: events.PreemptContainer, - } // record that we are evicting the pod c.recorder.Eventf(pod, v1.EventTypeWarning, events.PreemptContainer, message) // this is a blocking call and should only return when the pod and its containers are killed. - err := c.killPodFunc(pod, status, nil) + klog.V(3).InfoS("Preempting pod to free up resources", "pod", klog.KObj(pod), "podUID", pod.UID, "insufficientResources", insufficientResources.toString()) + err := c.killPodFunc(pod, true, nil, func(status *v1.PodStatus) { + status.Phase = v1.PodFailed + status.Reason = events.PreemptContainer + status.Message = message + }) if err != nil { klog.ErrorS(err, "Failed to evict pod", "pod", klog.KObj(pod)) // In future syncPod loops, the kubelet will retry the pod deletion steps that it was stuck on. diff --git a/pkg/kubelet/preemption/preemption_test.go b/pkg/kubelet/preemption/preemption_test.go index 00a0c2e5475..f643dd4a936 100644 --- a/pkg/kubelet/preemption/preemption_test.go +++ b/pkg/kubelet/preemption/preemption_test.go @@ -20,7 +20,7 @@ import ( "fmt" "testing" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/record" @@ -57,7 +57,7 @@ func (f *fakePodKiller) getKilledPods() []*v1.Pod { return f.killedPods } -func (f *fakePodKiller) killPodNow(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int64) error { +func (f *fakePodKiller) killPodNow(pod *v1.Pod, evict bool, gracePeriodOverride *int64, fn func(status *v1.PodStatus)) error { if f.errDuringPodKilling { f.killedPods = []*v1.Pod{} return fmt.Errorf("problem killing pod %v", pod) diff --git a/pkg/kubelet/runonce.go b/pkg/kubelet/runonce.go index 3272bccfda7..19b8a4f6a7b 100644 --- a/pkg/kubelet/runonce.go +++ b/pkg/kubelet/runonce.go @@ -17,11 +17,12 @@ limitations under the License. 
package kubelet import ( + "context" "fmt" "os" "time" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" "k8s.io/klog/v2" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" @@ -130,12 +131,7 @@ func (kl *Kubelet) runPod(pod *v1.Pod, retryDelay time.Duration) error { klog.ErrorS(err, "Failed creating a mirror pod", "pod", klog.KObj(pod)) } mirrorPod, _ := kl.podManager.GetMirrorPodByPod(pod) - if err = kl.syncPod(syncPodOptions{ - pod: pod, - mirrorPod: mirrorPod, - podStatus: status, - updateType: kubetypes.SyncPodUpdate, - }); err != nil { + if err = kl.syncPod(context.Background(), kubetypes.SyncPodUpdate, pod, mirrorPod, status); err != nil { return fmt.Errorf("error syncing pod %q: %v", format.Pod(pod), err) } if retry >= runOnceMaxRetries { diff --git a/pkg/kubelet/runonce_test.go b/pkg/kubelet/runonce_test.go index 75f695560d0..3a18e3d6c95 100644 --- a/pkg/kubelet/runonce_test.go +++ b/pkg/kubelet/runonce_test.go @@ -79,6 +79,7 @@ func TestRunOnce(t *testing.T) { nodeLister: testNodeLister{}, statusManager: status.NewManager(nil, podManager, &statustest.FakePodDeletionSafetyProvider{}), podManager: podManager, + podWorkers: &fakePodWorkers{}, os: &containertest.FakeOS{}, containerRuntime: fakeRuntime, reasonCache: NewReasonCache(), @@ -101,7 +102,7 @@ func TestRunOnce(t *testing.T) { true, kb.nodeName, kb.podManager, - kb.statusManager, + kb.podWorkers, kb.kubeClient, kb.volumePluginMgr, fakeRuntime, @@ -122,7 +123,7 @@ func TestRunOnce(t *testing.T) { UID: types.UID(kb.nodeName), Namespace: "", } - fakeKillPodFunc := func(pod *v1.Pod, podStatus v1.PodStatus, gracePeriodOverride *int64) error { + fakeKillPodFunc := func(pod *v1.Pod, evict bool, gracePeriodOverride *int64, fn func(*v1.PodStatus)) error { return nil } fakeMirrodPodFunc := func(*v1.Pod) (*v1.Pod, bool) { return nil, false } diff --git a/pkg/kubelet/status/status_manager.go b/pkg/kubelet/status/status_manager.go index c8ee37a5222..f61fd526293 100644 --- a/pkg/kubelet/status/status_manager.go +++ b/pkg/kubelet/status/status_manager.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "sort" + "strings" "sync" "time" @@ -356,6 +357,7 @@ func (m *manager) TerminatePod(pod *v1.Pod) { } } + klog.V(5).InfoS("TerminatePod calling updateStatusInternal", "pod", klog.KObj(pod), "podUID", pod.UID) m.updateStatusInternal(pod, status, true) } @@ -431,10 +433,43 @@ func (m *manager) updateStatusInternal(pod *v1.Pod, status v1.PodStatus, forceUp } normalizeStatus(pod, &status) + + // Perform some more extensive logging of container termination state to assist in + // debugging production races (generally not needed). + if klog.V(5).Enabled() { + var containers []string + for _, s := range append(append([]v1.ContainerStatus(nil), status.InitContainerStatuses...), status.ContainerStatuses...) 
{ + var current, previous string + switch { + case s.State.Running != nil: + current = "running" + case s.State.Waiting != nil: + current = "waiting" + case s.State.Terminated != nil: + current = fmt.Sprintf("terminated=%d", s.State.Terminated.ExitCode) + default: + current = "unknown" + } + switch { + case s.LastTerminationState.Running != nil: + previous = "running" + case s.LastTerminationState.Waiting != nil: + previous = "waiting" + case s.LastTerminationState.Terminated != nil: + previous = fmt.Sprintf("terminated=%d", s.LastTerminationState.Terminated.ExitCode) + default: + previous = "" + } + containers = append(containers, fmt.Sprintf("(%s state=%s previous=%s)", s.Name, current, previous)) + } + sort.Strings(containers) + klog.InfoS("updateStatusInternal", "version", cachedStatus.version+1, "pod", klog.KObj(pod), "podUID", pod.UID, "containers", strings.Join(containers, " ")) + } + // The intent here is to prevent concurrent updates to a pod's status from // clobbering each other so the phase of a pod progresses monotonically. if isCached && isPodStatusByKubeletEqual(&cachedStatus.status, &status) && !forceUpdate { - klog.V(5).InfoS("Ignoring same status for pod", "pod", klog.KObj(pod), "status", status) + klog.V(3).InfoS("Ignoring same status for pod", "pod", klog.KObj(pod), "status", status) return false // No new status. } @@ -583,7 +618,7 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) { oldStatus := pod.Status.DeepCopy() newPod, patchBytes, unchanged, err := statusutil.PatchPodStatus(m.kubeClient, pod.Namespace, pod.Name, pod.UID, *oldStatus, mergePodStatus(*oldStatus, status.status)) - klog.V(3).InfoS("Patch status for pod", "pod", klog.KObj(pod), "patchBytes", patchBytes) + klog.V(3).InfoS("Patch status for pod", "pod", klog.KObj(pod), "patch", string(patchBytes)) if err != nil { klog.InfoS("Failed to update status for pod", "pod", klog.KObj(pod), "err", err) diff --git a/pkg/kubelet/types/pod_update.go b/pkg/kubelet/types/pod_update.go index 9420eef1eb3..81abf6dcb19 100644 --- a/pkg/kubelet/types/pod_update.go +++ b/pkg/kubelet/types/pod_update.go @@ -120,8 +120,8 @@ const ( SyncPodUpdate // SyncPodCreate is when the pod is created from source SyncPodCreate - // SyncPodKill is when the pod is killed based on a trigger internal to the kubelet for eviction. - // If a SyncPodKill request is made to pod workers, the request is never dropped, and will always be processed. + // SyncPodKill is when the pod should have no running containers. A pod stopped in this way could be + // restarted in the future due config changes. SyncPodKill ) diff --git a/pkg/kubelet/volumemanager/populator/desired_state_of_world_populator.go b/pkg/kubelet/volumemanager/populator/desired_state_of_world_populator.go index f9c4f46f312..d9f6c1026b6 100644 --- a/pkg/kubelet/volumemanager/populator/desired_state_of_world_populator.go +++ b/pkg/kubelet/volumemanager/populator/desired_state_of_world_populator.go @@ -40,7 +40,6 @@ import ( "k8s.io/kubernetes/pkg/kubelet/config" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" "k8s.io/kubernetes/pkg/kubelet/pod" - "k8s.io/kubernetes/pkg/kubelet/status" "k8s.io/kubernetes/pkg/kubelet/volumemanager/cache" "k8s.io/kubernetes/pkg/volume" "k8s.io/kubernetes/pkg/volume/csimigration" @@ -70,6 +69,12 @@ type DesiredStateOfWorldPopulator interface { HasAddedPods() bool } +// podStateProvider can determine if a pod is going to be terminated. 
+type podStateProvider interface { + ShouldPodContainersBeTerminating(types.UID) bool + ShouldPodRuntimeBeRemoved(types.UID) bool +} + // NewDesiredStateOfWorldPopulator returns a new instance of // DesiredStateOfWorldPopulator. // @@ -84,7 +89,7 @@ func NewDesiredStateOfWorldPopulator( loopSleepDuration time.Duration, getPodStatusRetryDuration time.Duration, podManager pod.Manager, - podStatusProvider status.PodStatusProvider, + podStateProvider podStateProvider, desiredStateOfWorld cache.DesiredStateOfWorld, actualStateOfWorld cache.ActualStateOfWorld, kubeContainerRuntime kubecontainer.Runtime, @@ -97,7 +102,7 @@ func NewDesiredStateOfWorldPopulator( loopSleepDuration: loopSleepDuration, getPodStatusRetryDuration: getPodStatusRetryDuration, podManager: podManager, - podStatusProvider: podStatusProvider, + podStateProvider: podStateProvider, desiredStateOfWorld: desiredStateOfWorld, actualStateOfWorld: actualStateOfWorld, pods: processedPods{ @@ -117,7 +122,7 @@ type desiredStateOfWorldPopulator struct { loopSleepDuration time.Duration getPodStatusRetryDuration time.Duration podManager pod.Manager - podStatusProvider status.PodStatusProvider + podStateProvider podStateProvider desiredStateOfWorld cache.DesiredStateOfWorld actualStateOfWorld cache.ActualStateOfWorld pods processedPods @@ -177,14 +182,6 @@ func (dswp *desiredStateOfWorldPopulator) populatorLoop() { dswp.findAndRemoveDeletedPods() } -func (dswp *desiredStateOfWorldPopulator) isPodTerminated(pod *v1.Pod) bool { - podStatus, found := dswp.podStatusProvider.GetPodStatus(pod.UID) - if !found { - podStatus = pod.Status - } - return util.IsPodTerminated(pod, podStatus) -} - // Iterate through all pods and add to desired state of world if they don't // exist but should func (dswp *desiredStateOfWorldPopulator) findAndAddNewPods() { @@ -203,8 +200,8 @@ func (dswp *desiredStateOfWorldPopulator) findAndAddNewPods() { processedVolumesForFSResize := sets.NewString() for _, pod := range dswp.podManager.GetPods() { - if dswp.isPodTerminated(pod) { - // Do not (re)add volumes for terminated pods + if dswp.podStateProvider.ShouldPodContainersBeTerminating(pod.UID) { + // Do not (re)add volumes for pods that can't also be starting containers continue } dswp.processPodVolumes(pod, mountedVolumesForPod, processedVolumesForFSResize) @@ -214,9 +211,6 @@ func (dswp *desiredStateOfWorldPopulator) findAndAddNewPods() { // Iterate through all pods in desired state of world, and remove if they no // longer exist func (dswp *desiredStateOfWorldPopulator) findAndRemoveDeletedPods() { - var runningPods []*kubecontainer.Pod - - runningPodsFetched := false for _, volumeToMount := range dswp.desiredStateOfWorld.GetVolumesToMount() { pod, podExists := dswp.podManager.GetPodByUID(volumeToMount.Pod.UID) if podExists { @@ -234,8 +228,8 @@ func (dswp *desiredStateOfWorldPopulator) findAndRemoveDeletedPods() { } } - // Skip running pods - if !dswp.isPodTerminated(pod) { + // Exclude known pods that we expect to be running + if !dswp.podStateProvider.ShouldPodRuntimeBeRemoved(pod.UID) { continue } if dswp.keepTerminatedPodVolumes { @@ -245,36 +239,9 @@ func (dswp *desiredStateOfWorldPopulator) findAndRemoveDeletedPods() { // Once a pod has been deleted from kubelet pod manager, do not delete // it immediately from volume manager. Instead, check the kubelet - // containerRuntime to verify that all containers in the pod have been + // pod state provider to verify that all containers in the pod have been // terminated. 
- if !runningPodsFetched { - var getPodsErr error - runningPods, getPodsErr = dswp.kubeContainerRuntime.GetPods(false) - if getPodsErr != nil { - klog.ErrorS(getPodsErr, "kubeContainerRuntime.findAndRemoveDeletedPods returned error") - continue - } - - runningPodsFetched = true - dswp.timeOfLastGetPodStatus = time.Now() - } - - runningContainers := false - for _, runningPod := range runningPods { - if runningPod.ID == volumeToMount.Pod.UID { - // runningPod.Containers only include containers in the running state, - // excluding containers in the creating process. - // By adding a non-empty judgment for runningPod.Sandboxes, - // ensure that all containers of the pod have been terminated. - if len(runningPod.Sandboxes) > 0 || len(runningPod.Containers) > 0 { - runningContainers = true - } - - break - } - } - - if runningContainers { + if !dswp.podStateProvider.ShouldPodRuntimeBeRemoved(volumeToMount.Pod.UID) { klog.V(4).InfoS("Pod still has one or more containers in the non-exited state and will not be removed from desired state", "pod", klog.KObj(volumeToMount.Pod)) continue } diff --git a/pkg/kubelet/volumemanager/populator/desired_state_of_world_populator_test.go b/pkg/kubelet/volumemanager/populator/desired_state_of_world_populator_test.go index 006f2fa8c60..1691b57c195 100644 --- a/pkg/kubelet/volumemanager/populator/desired_state_of_world_populator_test.go +++ b/pkg/kubelet/volumemanager/populator/desired_state_of_world_populator_test.go @@ -27,6 +27,7 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + kubetypes "k8s.io/apimachinery/pkg/types" utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/kubernetes/fake" core "k8s.io/client-go/testing" @@ -34,13 +35,10 @@ import ( csitrans "k8s.io/csi-translation-lib" "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/configmap" - kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" containertest "k8s.io/kubernetes/pkg/kubelet/container/testing" kubepod "k8s.io/kubernetes/pkg/kubelet/pod" podtest "k8s.io/kubernetes/pkg/kubelet/pod/testing" "k8s.io/kubernetes/pkg/kubelet/secret" - "k8s.io/kubernetes/pkg/kubelet/status" - statustest "k8s.io/kubernetes/pkg/kubelet/status/testing" "k8s.io/kubernetes/pkg/kubelet/volumemanager/cache" "k8s.io/kubernetes/pkg/volume" "k8s.io/kubernetes/pkg/volume/csimigration" @@ -81,7 +79,7 @@ func prepareDswpWithVolume(t *testing.T) (*desiredStateOfWorldPopulator, kubepod Phase: v1.ClaimBound, }, } - dswp, fakePodManager, _, _ := createDswpWithVolume(t, pv, pvc) + dswp, fakePodManager, _, _, _ := createDswpWithVolume(t, pv, pvc) return dswp, fakePodManager } @@ -156,7 +154,7 @@ func TestFindAndAddNewPods_WithVolumeRetrievalError(t *testing.T) { } func TestFindAndAddNewPods_FindAndRemoveDeletedPods(t *testing.T) { - dswp, fakeRuntime, pod, expectedVolumeName, _ := prepareDSWPWithPodPV(t) + dswp, fakePodState, pod, expectedVolumeName, _ := prepareDSWPWithPodPV(t) podName := util.GetUniquePodName(pod) //let the pod be terminated @@ -167,29 +165,15 @@ func TestFindAndAddNewPods_FindAndRemoveDeletedPods(t *testing.T) { podGet.Status.Phase = v1.PodFailed dswp.podManager.DeletePod(pod) - fakeRuntime.PodList = []*containertest.FakePod{ - { - Pod: &kubecontainer.Pod{ - Name: pod.Name, - ID: pod.UID, - Sandboxes: []*kubecontainer.Container{ - { - Name: "dswp-test-pod-sandbox", - }, - }, - }, - }, - } - dswp.findAndRemoveDeletedPods() if !dswp.pods.processedPods[podName] { - t.Fatalf("Pod should not been removed from 
desired state of world since sandbox exist") + t.Fatalf("Pod should not been removed from desired state of world since pod state still thinks it exists") } - fakeRuntime.PodList = nil + fakePodState.removed = map[kubetypes.UID]struct{}{pod.UID: {}} - // fakeRuntime can not get the pod,so here findAndRemoveDeletedPods() will remove the pod and volumes it is mounted + // the pod state is marked as removed, so here findAndRemoveDeletedPods() will remove the pod and volumes it is mounted dswp.findAndRemoveDeletedPods() if dswp.pods.processedPods[podName] { t.Fatalf("Failed to remove pods from desired state of world since they no longer exist") @@ -221,7 +205,8 @@ func TestFindAndAddNewPods_FindAndRemoveDeletedPods(t *testing.T) { } func TestFindAndRemoveDeletedPodsWithActualState(t *testing.T) { - dswp, _, pod, expectedVolumeName, _ := prepareDSWPWithPodPV(t) + dswp, fakePodState, pod, expectedVolumeName, _ := prepareDSWPWithPodPV(t) + fakeASW := dswp.actualStateOfWorld podName := util.GetUniquePodName(pod) //let the pod be terminated @@ -251,7 +236,19 @@ func TestFindAndRemoveDeletedPodsWithActualState(t *testing.T) { // reconcile with actual state so that volume is added into the actual state // desired state populator now can successfully delete the pod and volume - fakeASW := dswp.actualStateOfWorld + reconcileASW(fakeASW, dswp.desiredStateOfWorld, t) + dswp.findAndRemoveDeletedPods() + if !dswp.desiredStateOfWorld.VolumeExists(expectedVolumeName) { + t.Fatalf( + "VolumeExists(%q) failed. Expected: Actual: <%v>", + expectedVolumeName, + volumeExists) + } + + fakePodState.removed = map[kubetypes.UID]struct{}{pod.UID: {}} + + // reconcile with actual state so that volume is added into the actual state + // desired state populator now can successfully delete the pod and volume reconcileASW(fakeASW, dswp.desiredStateOfWorld, t) dswp.findAndRemoveDeletedPods() volumeExists = dswp.desiredStateOfWorld.VolumeExists(expectedVolumeName) @@ -271,7 +268,7 @@ func TestFindAndRemoveDeletedPodsWithActualState(t *testing.T) { } func TestFindAndRemoveDeletedPodsWithUncertain(t *testing.T) { - dswp, fakeRuntime, pod, expectedVolumeName, pv := prepareDSWPWithPodPV(t) + dswp, fakePodState, pod, expectedVolumeName, pv := prepareDSWPWithPodPV(t) podName := util.GetUniquePodName(pod) //let the pod be terminated @@ -281,7 +278,7 @@ func TestFindAndRemoveDeletedPodsWithUncertain(t *testing.T) { } podGet.Status.Phase = v1.PodFailed dswp.podManager.DeletePod(pod) - fakeRuntime.PodList = nil + fakePodState.removed = map[kubetypes.UID]struct{}{pod.UID: {}} // Add the volume to ASW by reconciling. 
fakeASW := dswp.actualStateOfWorld @@ -302,7 +299,7 @@ func TestFindAndRemoveDeletedPodsWithUncertain(t *testing.T) { t.Fatalf("Failed to set the volume as uncertain: %s", err) } - // fakeRuntime can not get the pod,so here findAndRemoveDeletedPods() will remove the pod and volumes it is mounted + // the pod state now lists the pod as removed, so here findAndRemoveDeletedPods() will remove the pod and volumes it is mounted dswp.findAndRemoveDeletedPods() if dswp.pods.processedPods[podName] { t.Fatalf("Failed to remove pods from desired state of world since they no longer exist") @@ -333,7 +330,7 @@ func TestFindAndRemoveDeletedPodsWithUncertain(t *testing.T) { } } -func prepareDSWPWithPodPV(t *testing.T) (*desiredStateOfWorldPopulator, *containertest.FakeRuntime, *v1.Pod, v1.UniqueVolumeName, *v1.PersistentVolume) { +func prepareDSWPWithPodPV(t *testing.T) (*desiredStateOfWorldPopulator, *fakePodStateProvider, *v1.Pod, v1.UniqueVolumeName, *v1.PersistentVolume) { // create dswp mode := v1.PersistentVolumeFilesystem pv := &v1.PersistentVolume{ @@ -353,7 +350,7 @@ func prepareDSWPWithPodPV(t *testing.T) (*desiredStateOfWorldPopulator, *contain Phase: v1.ClaimBound, }, } - dswp, fakePodManager, fakesDSW, fakeRuntime := createDswpWithVolume(t, pv, pvc) + dswp, fakePodManager, fakesDSW, _, fakePodState := createDswpWithVolume(t, pv, pvc) // create pod containers := []v1.Container{ @@ -399,7 +396,7 @@ func prepareDSWPWithPodPV(t *testing.T) (*desiredStateOfWorldPopulator, *contain verifyVolumeExistsInVolumesToMount( t, v1.UniqueVolumeName(generatedVolumeName), false /* expectReportedInUse */, fakesDSW) - return dswp, fakeRuntime, pod, expectedVolumeName, pv + return dswp, fakePodState, pod, expectedVolumeName, pv } func TestFindAndRemoveNonattachableVolumes(t *testing.T) { @@ -424,7 +421,7 @@ func TestFindAndRemoveNonattachableVolumes(t *testing.T) { } fakeVolumePluginMgr, fakeVolumePlugin := volumetesting.GetTestKubeletVolumePluginMgr(t) - dswp, fakePodManager, fakesDSW, _ := createDswpWithVolumeWithCustomPluginMgr(t, pv, pvc, fakeVolumePluginMgr) + dswp, fakePodManager, fakesDSW, _, _ := createDswpWithVolumeWithCustomPluginMgr(t, pv, pvc, fakeVolumePluginMgr) // create pod containers := []v1.Container{ @@ -487,7 +484,7 @@ func TestEphemeralVolumeOwnerCheck(t *testing.T) { // create dswp pod, pv, pvc := createEphemeralVolumeObjects("dswp-test-pod", "dswp-test-volume-name", false /* not owned */) - dswp, fakePodManager, _, _ := createDswpWithVolume(t, pv, pvc) + dswp, fakePodManager, _, _, _ := createDswpWithVolume(t, pv, pvc) fakePodManager.AddPod(pod) podName := util.GetUniquePodName(pod) @@ -505,7 +502,7 @@ func TestEphemeralVolumeOwnerCheck(t *testing.T) { func TestEphemeralVolumeEnablement(t *testing.T) { // create dswp pod, pv, pvc := createEphemeralVolumeObjects("dswp-test-pod", "dswp-test-volume-name", true /* owned */) - dswp, fakePodManager, fakesDSW, _ := createDswpWithVolume(t, pv, pvc) + dswp, fakePodManager, fakesDSW, _, fakePodState := createDswpWithVolume(t, pv, pvc) fakePodManager.AddPod(pod) podName := util.GetUniquePodName(pod) @@ -555,7 +552,7 @@ func TestEphemeralVolumeEnablement(t *testing.T) { if !exist { t.Fatalf("Failed to get pod by pod name: %s and namespace: %s", pod.Name, pod.Namespace) } - podGet.Status.Phase = v1.PodFailed + fakePodState.removed = map[kubetypes.UID]struct{}{podGet.UID: {}} // Pretend again that the feature is disabled. // Removal of the pod and volumes is expected to work. 
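
Note: the populator hunks above replace the old "is the pod terminated according to status?" check with queries against the pod worker's state provider, and the tests now drive teardown through a fake. Below is a minimal, self-contained sketch of that gating, assuming only the interface shape from this patch; fakeProvider, syncVolumesForPod, and the example UID are illustrative stand-ins, not code from the change.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/types"
)

// podStateProvider mirrors the interface the populator now consumes: only the
// pod worker knows whether containers may still be started or have all exited.
type podStateProvider interface {
	ShouldPodContainersBeTerminating(types.UID) bool
	ShouldPodRuntimeBeRemoved(types.UID) bool
}

// fakeProvider is a stand-in for the pod workers, in the spirit of the
// fakePodStateProvider used by the tests in this patch.
type fakeProvider struct {
	terminating map[types.UID]struct{}
	removed     map[types.UID]struct{}
}

func (f *fakeProvider) ShouldPodContainersBeTerminating(uid types.UID) bool {
	_, ok := f.terminating[uid]
	return ok
}

func (f *fakeProvider) ShouldPodRuntimeBeRemoved(uid types.UID) bool {
	_, ok := f.removed[uid]
	return ok
}

// syncVolumesForPod shows the gating the populator applies: never (re)add
// volumes once termination has been requested, and only tear them down once
// the pod worker reports the runtime can be removed.
func syncVolumesForPod(state podStateProvider, uid types.UID) string {
	switch {
	case state.ShouldPodRuntimeBeRemoved(uid):
		return "remove volumes"
	case state.ShouldPodContainersBeTerminating(uid):
		return "leave volumes mounted; skip re-adding"
	default:
		return "add/keep volumes"
	}
}

func main() {
	uid := types.UID("pod-1")
	state := &fakeProvider{terminating: map[types.UID]struct{}{uid: {}}}
	fmt.Println(syncVolumesForPod(state, uid)) // leave volumes mounted; skip re-adding

	state.removed = map[types.UID]struct{}{uid: {}}
	fmt.Println(syncVolumesForPod(state, uid)) // remove volumes
}

As in the tests above, marking a UID as removed is what finally allows volume teardown; a pod that is merely terminating keeps its volumes mounted but is skipped when (re)adding.
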
@@ -620,7 +617,7 @@ func TestFindAndAddNewPods_FindAndRemoveDeletedPods_Valid_Block_VolumeDevices(t Phase: v1.ClaimBound, }, } - dswp, fakePodManager, fakesDSW, _ := createDswpWithVolume(t, pv, pvc) + dswp, fakePodManager, fakesDSW, _, fakePodState := createDswpWithVolume(t, pv, pvc) // create pod containers := []v1.Container{ @@ -674,9 +671,10 @@ func TestFindAndAddNewPods_FindAndRemoveDeletedPods_Valid_Block_VolumeDevices(t } podGet.Status.Phase = v1.PodFailed fakePodManager.DeletePod(pod) - //pod is added to fakePodManager but fakeRuntime can not get the pod,so here findAndRemoveDeletedPods() will remove the pod and volumes it is mounted - dswp.findAndRemoveDeletedPods() + fakePodState.removed = map[kubetypes.UID]struct{}{pod.UID: {}} + //pod is added to fakePodManager but pod state knows the pod is removed, so here findAndRemoveDeletedPods() will remove the pod and volumes it is mounted + dswp.findAndRemoveDeletedPods() if dswp.pods.processedPods[podName] { t.Fatalf("Failed to remove pods from desired state of world since they no longer exist") } @@ -727,7 +725,7 @@ func TestCreateVolumeSpec_Valid_File_VolumeMounts(t *testing.T) { Phase: v1.ClaimBound, }, } - dswp, fakePodManager, _, _ := createDswpWithVolume(t, pv, pvc) + dswp, fakePodManager, _, _, _ := createDswpWithVolume(t, pv, pvc) // create pod containers := []v1.Container{ @@ -773,7 +771,7 @@ func TestCreateVolumeSpec_Valid_Nil_VolumeMounts(t *testing.T) { Phase: v1.ClaimBound, }, } - dswp, fakePodManager, _, _ := createDswpWithVolume(t, pv, pvc) + dswp, fakePodManager, _, _, _ := createDswpWithVolume(t, pv, pvc) // create pod containers := []v1.Container{ @@ -819,7 +817,7 @@ func TestCreateVolumeSpec_Valid_Block_VolumeDevices(t *testing.T) { Phase: v1.ClaimBound, }, } - dswp, fakePodManager, _, _ := createDswpWithVolume(t, pv, pvc) + dswp, fakePodManager, _, _, _ := createDswpWithVolume(t, pv, pvc) // create pod containers := []v1.Container{ @@ -865,7 +863,7 @@ func TestCreateVolumeSpec_Invalid_File_VolumeDevices(t *testing.T) { Phase: v1.ClaimBound, }, } - dswp, fakePodManager, _, _ := createDswpWithVolume(t, pv, pvc) + dswp, fakePodManager, _, _, _ := createDswpWithVolume(t, pv, pvc) // create pod containers := []v1.Container{ @@ -911,7 +909,7 @@ func TestCreateVolumeSpec_Invalid_Block_VolumeMounts(t *testing.T) { Phase: v1.ClaimBound, }, } - dswp, fakePodManager, _, _ := createDswpWithVolume(t, pv, pvc) + dswp, fakePodManager, _, _, _ := createDswpWithVolume(t, pv, pvc) // create pod containers := []v1.Container{ @@ -1068,7 +1066,7 @@ func TestCheckVolumeFSResize(t *testing.T) { }, } - dswp, fakePodManager, fakeDSW, _ := createDswpWithVolume(t, pv, pvc) + dswp, fakePodManager, fakeDSW, _, _ := createDswpWithVolume(t, pv, pvc) fakeASW := dswp.actualStateOfWorld containers := []v1.Container{} @@ -1290,14 +1288,29 @@ func createEphemeralVolumeObjects(podName, volumeName string, owned bool) (pod * return } -func createDswpWithVolume(t *testing.T, pv *v1.PersistentVolume, pvc *v1.PersistentVolumeClaim) (*desiredStateOfWorldPopulator, kubepod.Manager, cache.DesiredStateOfWorld, *containertest.FakeRuntime) { +func createDswpWithVolume(t *testing.T, pv *v1.PersistentVolume, pvc *v1.PersistentVolumeClaim) (*desiredStateOfWorldPopulator, kubepod.Manager, cache.DesiredStateOfWorld, *containertest.FakeRuntime, *fakePodStateProvider) { fakeVolumePluginMgr, _ := volumetesting.GetTestKubeletVolumePluginMgr(t) - dswp, fakePodManager, fakesDSW, fakeRuntime := createDswpWithVolumeWithCustomPluginMgr(t, pv, pvc, fakeVolumePluginMgr) - return 
dswp, fakePodManager, fakesDSW, fakeRuntime + dswp, fakePodManager, fakesDSW, fakeRuntime, fakeStateProvider := createDswpWithVolumeWithCustomPluginMgr(t, pv, pvc, fakeVolumePluginMgr) + return dswp, fakePodManager, fakesDSW, fakeRuntime, fakeStateProvider +} + +type fakePodStateProvider struct { + terminating map[kubetypes.UID]struct{} + removed map[kubetypes.UID]struct{} +} + +func (p *fakePodStateProvider) ShouldPodContainersBeTerminating(uid kubetypes.UID) bool { + _, ok := p.terminating[uid] + return ok +} + +func (p *fakePodStateProvider) ShouldPodRuntimeBeRemoved(uid kubetypes.UID) bool { + _, ok := p.removed[uid] + return ok } func createDswpWithVolumeWithCustomPluginMgr(t *testing.T, pv *v1.PersistentVolume, pvc *v1.PersistentVolumeClaim, - fakeVolumePluginMgr *volume.VolumePluginMgr) (*desiredStateOfWorldPopulator, kubepod.Manager, cache.DesiredStateOfWorld, *containertest.FakeRuntime) { + fakeVolumePluginMgr *volume.VolumePluginMgr) (*desiredStateOfWorldPopulator, kubepod.Manager, cache.DesiredStateOfWorld, *containertest.FakeRuntime, *fakePodStateProvider) { fakeClient := &fake.Clientset{} fakeClient.AddReactor("get", "persistentvolumeclaims", func(action core.Action) (bool, runtime.Object, error) { return true, pvc, nil @@ -1314,8 +1327,7 @@ func createDswpWithVolumeWithCustomPluginMgr(t *testing.T, pv *v1.PersistentVolu fakesDSW := cache.NewDesiredStateOfWorld(fakeVolumePluginMgr) fakeASW := cache.NewActualStateOfWorld("fake", fakeVolumePluginMgr) fakeRuntime := &containertest.FakeRuntime{} - - fakeStatusManager := status.NewManager(fakeClient, fakePodManager, &statustest.FakePodDeletionSafetyProvider{}) + fakeStateProvider := &fakePodStateProvider{} csiTranslator := csitrans.New() dswp := &desiredStateOfWorldPopulator{ @@ -1323,7 +1335,7 @@ func createDswpWithVolumeWithCustomPluginMgr(t *testing.T, pv *v1.PersistentVolu loopSleepDuration: 100 * time.Millisecond, getPodStatusRetryDuration: 2 * time.Second, podManager: fakePodManager, - podStatusProvider: fakeStatusManager, + podStateProvider: fakeStateProvider, desiredStateOfWorld: fakesDSW, actualStateOfWorld: fakeASW, pods: processedPods{ @@ -1334,5 +1346,5 @@ func createDswpWithVolumeWithCustomPluginMgr(t *testing.T, pv *v1.PersistentVolu intreeToCSITranslator: csiTranslator, volumePluginMgr: fakeVolumePluginMgr, } - return dswp, fakePodManager, fakesDSW, fakeRuntime + return dswp, fakePodManager, fakesDSW, fakeRuntime, fakeStateProvider } diff --git a/pkg/kubelet/volumemanager/volume_manager.go b/pkg/kubelet/volumemanager/volume_manager.go index 1245f46679e..22d89583e54 100644 --- a/pkg/kubelet/volumemanager/volume_manager.go +++ b/pkg/kubelet/volumemanager/volume_manager.go @@ -39,7 +39,6 @@ import ( "k8s.io/kubernetes/pkg/kubelet/config" "k8s.io/kubernetes/pkg/kubelet/container" "k8s.io/kubernetes/pkg/kubelet/pod" - "k8s.io/kubernetes/pkg/kubelet/status" "k8s.io/kubernetes/pkg/kubelet/volumemanager/cache" "k8s.io/kubernetes/pkg/kubelet/volumemanager/metrics" "k8s.io/kubernetes/pkg/kubelet/volumemanager/populator" @@ -106,6 +105,13 @@ type VolumeManager interface { // the duration defined in podAttachAndMountTimeout. WaitForAttachAndMount(pod *v1.Pod) error + // WaitForUnmount processes the volumes referenced in the specified + // pod and blocks until they are all unmounted (reflected in the actual + // state of the world). + // An error is returned if all volumes are not unmounted within + // the duration defined in podAttachAndMountTimeout. 
+ WaitForUnmount(pod *v1.Pod) error + // GetMountedVolumesForPod returns a VolumeMap containing the volumes // referenced by the specified pod that are successfully attached and // mounted. The key in the map is the OuterVolumeSpecName (i.e. @@ -149,6 +155,12 @@ type VolumeManager interface { MarkVolumesAsReportedInUse(volumesReportedAsInUse []v1.UniqueVolumeName) } +// podStateProvider can determine if a pod is is going to be terminated +type podStateProvider interface { + ShouldPodContainersBeTerminating(k8stypes.UID) bool + ShouldPodRuntimeBeRemoved(k8stypes.UID) bool +} + // NewVolumeManager returns a new concrete instance implementing the // VolumeManager interface. // @@ -160,7 +172,7 @@ func NewVolumeManager( controllerAttachDetachEnabled bool, nodeName k8stypes.NodeName, podManager pod.Manager, - podStatusProvider status.PodStatusProvider, + podStateProvider podStateProvider, kubeClient clientset.Interface, volumePluginMgr *volume.VolumePluginMgr, kubeContainerRuntime container.Runtime, @@ -195,7 +207,7 @@ func NewVolumeManager( desiredStateOfWorldPopulatorLoopSleepPeriod, desiredStateOfWorldPopulatorGetPodStatusRetryDuration, podManager, - podStatusProvider, + podStateProvider, vm.desiredStateOfWorld, vm.actualStateOfWorld, kubeContainerRuntime, @@ -426,6 +438,42 @@ func (vm *volumeManager) WaitForAttachAndMount(pod *v1.Pod) error { return nil } +func (vm *volumeManager) WaitForUnmount(pod *v1.Pod) error { + if pod == nil { + return nil + } + + klog.V(3).InfoS("Waiting for volumes to unmount for pod", "pod", klog.KObj(pod)) + uniquePodName := util.GetUniquePodName(pod) + + vm.desiredStateOfWorldPopulator.ReprocessPod(uniquePodName) + + err := wait.PollImmediate( + podAttachAndMountRetryInterval, + podAttachAndMountTimeout, + vm.verifyVolumesUnmountedFunc(uniquePodName)) + + if err != nil { + var mountedVolumes []string + for _, v := range vm.actualStateOfWorld.GetMountedVolumesForPod(uniquePodName) { + mountedVolumes = append(mountedVolumes, v.OuterVolumeSpecName) + } + sort.Strings(mountedVolumes) + + if len(mountedVolumes) == 0 { + return nil + } + + return fmt.Errorf( + "mounted volumes=%v: %s", + mountedVolumes, + err) + } + + klog.V(3).InfoS("All volumes are unmounted for pod", "pod", klog.KObj(pod)) + return nil +} + // getUnattachedVolumes returns a list of the volumes that are expected to be attached but // are not currently attached to the node func (vm *volumeManager) getUnattachedVolumes(expectedVolumes []string) []string { @@ -449,6 +497,17 @@ func (vm *volumeManager) verifyVolumesMountedFunc(podName types.UniquePodName, e } } +// verifyVolumesUnmountedFunc returns a method that is true when there are no mounted volumes for this +// pod. +func (vm *volumeManager) verifyVolumesUnmountedFunc(podName types.UniquePodName) wait.ConditionFunc { + return func() (done bool, err error) { + if errs := vm.desiredStateOfWorld.PopPodErrors(podName); len(errs) > 0 { + return true, errors.New(strings.Join(errs, "; ")) + } + return len(vm.actualStateOfWorld.GetMountedVolumesForPod(podName)) == 0, nil + } +} + // getUnmountedVolumes fetches the current list of mounted volumes from // the actual state of the world, and uses it to process the list of // expectedVolumes. It returns a list of unmounted volumes. 
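
Note: WaitForUnmount above reuses the poll-until-condition pattern already used by WaitForAttachAndMount, waiting for the actual state of the world to report zero mounted volumes for the pod. The following is a minimal sketch of that pattern, assuming wait.PollImmediate from k8s.io/apimachinery; mountedVolumeCount is a hypothetical stand-in for actualStateOfWorld.GetMountedVolumesForPod, and the interval/timeout values are illustrative rather than the kubelet's podAttachAndMountRetryInterval/podAttachAndMountTimeout.

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// mountedVolumeCount is a hypothetical stand-in for
// actualStateOfWorld.GetMountedVolumesForPod: it reports how many volumes are
// still mounted for the pod. Here it simply counts down to simulate unmounts.
func mountedVolumeCount() func() int {
	remaining := 3
	return func() int {
		if remaining > 0 {
			remaining--
		}
		return remaining
	}
}

func main() {
	count := mountedVolumeCount()

	// Same shape as verifyVolumesUnmountedFunc: done once nothing is mounted.
	condition := func() (bool, error) {
		n := count()
		fmt.Printf("volumes still mounted: %d\n", n)
		return n == 0, nil
	}

	// Poll with a fixed interval and give up after a timeout, as WaitForUnmount
	// does before building an error that lists the volumes still mounted.
	if err := wait.PollImmediate(100*time.Millisecond, 5*time.Second, condition); err != nil {
		fmt.Println("pod volumes were not unmounted in time:", err)
		return
	}
	fmt.Println("all volumes unmounted; pod teardown can proceed")
}

One design note from the hunk above: even when the poll times out, WaitForUnmount re-checks the mounted set and returns nil if it is empty, so a volume that unmounted at the last instant does not fail the pod's termination flow.
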
diff --git a/pkg/kubelet/volumemanager/volume_manager_fake.go b/pkg/kubelet/volumemanager/volume_manager_fake.go index ad4e87e7ad8..e3e56b4d853 100644 --- a/pkg/kubelet/volumemanager/volume_manager_fake.go +++ b/pkg/kubelet/volumemanager/volume_manager_fake.go @@ -50,6 +50,11 @@ func (f *FakeVolumeManager) WaitForAttachAndMount(pod *v1.Pod) error { return nil } +// WaitForUnmount is not implemented +func (f *FakeVolumeManager) WaitForUnmount(pod *v1.Pod) error { + return nil +} + // GetMountedVolumesForPod is not implemented func (f *FakeVolumeManager) GetMountedVolumesForPod(podName types.UniquePodName) container.VolumeMap { return nil diff --git a/pkg/kubelet/volumemanager/volume_manager_test.go b/pkg/kubelet/volumemanager/volume_manager_test.go index 754c3c5eeed..cde1fe848f9 100644 --- a/pkg/kubelet/volumemanager/volume_manager_test.go +++ b/pkg/kubelet/volumemanager/volume_manager_test.go @@ -28,6 +28,7 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kubetypes "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" clientset "k8s.io/client-go/kubernetes" @@ -40,8 +41,6 @@ import ( kubepod "k8s.io/kubernetes/pkg/kubelet/pod" podtest "k8s.io/kubernetes/pkg/kubelet/pod/testing" "k8s.io/kubernetes/pkg/kubelet/secret" - "k8s.io/kubernetes/pkg/kubelet/status" - statustest "k8s.io/kubernetes/pkg/kubelet/status/testing" "k8s.io/kubernetes/pkg/volume" volumetest "k8s.io/kubernetes/pkg/volume/testing" "k8s.io/kubernetes/pkg/volume/util" @@ -269,19 +268,34 @@ func TestGetExtraSupplementalGroupsForPod(t *testing.T) { } } +type fakePodStateProvider struct { + shouldRemove map[kubetypes.UID]struct{} + terminating map[kubetypes.UID]struct{} +} + +func (p *fakePodStateProvider) ShouldPodRuntimeBeRemoved(uid kubetypes.UID) bool { + _, ok := p.shouldRemove[uid] + return ok +} + +func (p *fakePodStateProvider) ShouldPodContainersBeTerminating(uid kubetypes.UID) bool { + _, ok := p.terminating[uid] + return ok +} + func newTestVolumeManager(t *testing.T, tmpDir string, podManager kubepod.Manager, kubeClient clientset.Interface) VolumeManager { plug := &volumetest.FakeVolumePlugin{PluginName: "fake", Host: nil} fakeRecorder := &record.FakeRecorder{} plugMgr := &volume.VolumePluginMgr{} // TODO (#51147) inject mock prober plugMgr.InitPlugins([]volume.VolumePlugin{plug}, nil /* prober */, volumetest.NewFakeKubeletVolumeHost(t, tmpDir, kubeClient, nil)) - statusManager := status.NewManager(kubeClient, podManager, &statustest.FakePodDeletionSafetyProvider{}) + stateProvider := &fakePodStateProvider{} fakePathHandler := volumetest.NewBlockVolumePathHandler() vm := NewVolumeManager( true, testHostname, podManager, - statusManager, + stateProvider, kubeClient, plugMgr, &containertest.FakeRuntime{}, diff --git a/pkg/volume/util/util.go b/pkg/volume/util/util.go index 5dba2c932ec..702f136e0de 100644 --- a/pkg/volume/util/util.go +++ b/pkg/volume/util/util.go @@ -381,7 +381,10 @@ func GetUniqueVolumeNameFromSpec( // IsPodTerminated checks if pod is terminated func IsPodTerminated(pod *v1.Pod, podStatus v1.PodStatus) bool { - return podStatus.Phase == v1.PodFailed || podStatus.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(podStatus.ContainerStatuses)) + // TODO: the guarantees provided by kubelet status are not sufficient to guarantee it's safe to ignore a deleted pod, + // even if everything is notRunning (kubelet does not guarantee that when pod status is waiting that it isn't trying + // to start a 
container). + return podStatus.Phase == v1.PodFailed || podStatus.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(podStatus.InitContainerStatuses) && notRunning(podStatus.ContainerStatuses) && notRunning(podStatus.EphemeralContainerStatuses)) } // notRunning returns true if every status is terminated or waiting, or the status list diff --git a/test/e2e/apps/statefulset.go b/test/e2e/apps/statefulset.go index 2bbd7474d43..92358da1dce 100644 --- a/test/e2e/apps/statefulset.go +++ b/test/e2e/apps/statefulset.go @@ -756,6 +756,10 @@ var _ = SIGDescribe("StatefulSet", func() { } pod, err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Create(context.TODO(), pod, metav1.CreateOptions{}) framework.ExpectNoError(err) + ginkgo.By("Waiting until pod " + podName + " will start running in namespace " + f.Namespace.Name) + if err := e2epod.WaitForPodNameRunningInNamespace(f.ClientSet, podName, f.Namespace.Name); err != nil { + framework.Failf("Pod %v did not start running: %v", podName, err) + } ginkgo.By("Creating statefulset with conflicting port in namespace " + f.Namespace.Name) ss := e2estatefulset.NewStatefulSet(ssName, f.Namespace.Name, headlessSvcName, 1, nil, nil, labels) @@ -765,11 +769,6 @@ var _ = SIGDescribe("StatefulSet", func() { _, err = f.ClientSet.AppsV1().StatefulSets(f.Namespace.Name).Create(context.TODO(), ss, metav1.CreateOptions{}) framework.ExpectNoError(err) - ginkgo.By("Waiting until pod " + podName + " will start running in namespace " + f.Namespace.Name) - if err := e2epod.WaitForPodNameRunningInNamespace(f.ClientSet, podName, f.Namespace.Name); err != nil { - framework.Failf("Pod %v did not start running: %v", podName, err) - } - var initialStatefulPodUID types.UID ginkgo.By("Waiting until stateful pod " + statefulPodName + " will be recreated and deleted at least once in namespace " + f.Namespace.Name) diff --git a/test/e2e/node/pods.go b/test/e2e/node/pods.go index 70fa4fbe082..345eb2c76d3 100644 --- a/test/e2e/node/pods.go +++ b/test/e2e/node/pods.go @@ -387,6 +387,8 @@ var _ = SIGDescribe("Pods Extended", func() { // pod volume teardown races with container start in CRI, which reports a failure framework.Logf("pod %s on node %s failed with the symptoms of https://github.com/kubernetes/kubernetes/issues/88766", pod.Name, pod.Spec.NodeName) default: + data, _ := json.MarshalIndent(pod.Status, "", " ") + framework.Logf("pod %s on node %s had incorrect final status:\n%s", string(data)) return fmt.Errorf("pod %s on node %s container unexpected exit code %d: start=%s end=%s reason=%s message=%s", pod.Name, pod.Spec.NodeName, t.ExitCode, t.StartedAt, t.FinishedAt, t.Reason, t.Message) } switch {