Merge pull request #41644 from derekwaynecarr/ensure-pod-cgroup-deleted

Automatic merge from submit-queue (batch tested with PRs 41644, 42020, 41753, 42206, 42212)

Ensure pod cgroup is deleted prior to deletion of pod

**What this PR does / why we need it**:
This PR ensures that the kubelet removes the pod cgroup sandbox before it deletes the pod from the API server. We need this so that the kubelet's default behavior does not leak resources.
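In effect, the kubelet's existing deletion gates (deletion timestamp set, volumes cleaned up) gain one more condition: the pod-level cgroup must also be gone. A minimal, self-contained sketch of that ordering, using stand-in types and fields rather than the kubelet's real Pod, volume manager, and pod container manager:

```go
package main

import (
	"fmt"
	"time"
)

// pod is a stand-in for the kubelet's view of a terminated pod; the real code
// consults the volume manager and the pod container manager instead of flags.
type pod struct {
	deletionTimestamp *time.Time
	volumesGone       bool
	cgroupGone        bool
}

// okToDelete mirrors the ordering this PR enforces: only delete from the API
// server once the pod is marked for deletion, its volumes are reclaimed, and
// (when pod-level cgroups are enabled) its cgroup sandbox has been removed.
func okToDelete(p pod, cgroupsPerQOS bool) bool {
	if p.deletionTimestamp == nil {
		return false
	}
	if !p.volumesGone {
		return false
	}
	if cgroupsPerQOS && !p.cgroupGone {
		return false
	}
	return true
}

func main() {
	now := time.Now()
	p := pod{deletionTimestamp: &now, volumesGone: true, cgroupGone: false}
	fmt.Println(okToDelete(p, true)) // false: cgroup sandbox still present
	p.cgroupGone = true
	fmt.Println(okToDelete(p, true)) // true: safe to delete from the API server
}
```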
Commit ddd8b5c1cf, authored by Kubernetes Submit Queue on 2017-03-01 15:30:30 -08:00 and committed by GitHub.


@@ -632,35 +632,10 @@ func (kl *Kubelet) killPod(pod *v1.Pod, runningPod *kubecontainer.Pod, status *k
 		return fmt.Errorf("one of the two arguments must be non-nil: runningPod, status")
 	}
-	// cache the pod cgroup Name for reducing the cpu resource limits of the pod cgroup once the pod is killed
-	pcm := kl.containerManager.NewPodContainerManager()
-	var podCgroup cm.CgroupName
-	reduceCpuLimits := true
-	if pod != nil {
-		podCgroup, _ = pcm.GetPodContainerName(pod)
-	} else {
-		// If the pod is nil then cgroup limit must have already
-		// been decreased earlier
-		reduceCpuLimits = false
-	}
 	// Call the container runtime KillPod method which stops all running containers of the pod
 	if err := kl.containerRuntime.KillPod(pod, p, gracePeriodOverride); err != nil {
 		return err
 	}
-	// At this point the pod might not completely free up cpu and memory resources.
-	// In such a case deleting the pod's cgroup might cause the pod's charges to be transferred
-	// to the parent cgroup. There might be various kinds of pod charges at this point.
-	// For example, any volume used by the pod that was backed by memory will have its
-	// pages charged to the pod cgroup until those volumes are removed by the kubelet.
-	// Hence we only reduce the cpu resource limits of the pod's cgroup
-	// and defer the responsibilty of destroying the pod's cgroup to the
-	// cleanup method and the housekeeping loop.
-	if reduceCpuLimits {
-		if err := pcm.ReduceCPULimits(podCgroup); err != nil {
-			glog.Warningf("Failed to reduce the CPU values to the minimum amount of shares: %v", err)
-		}
-	}
 	if err := kl.containerManager.UpdateQOSCgroups(); err != nil {
 		glog.V(2).Infof("Failed to update QoS cgroups while killing pod: %v", err)
 	}
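The block removed above deferred cgroup destruction and instead clamped the dead pod's CPU via ReduceCPULimits, leaving actual removal to the housekeeping loop. As a rough illustration only: on a cgroup v1 host, clamping comes down to writing the minimum share value into the pod cgroup's cpu.shares file. The path layout and helper below are assumptions for the sketch, not the kubelet's actual PodContainerManager code.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

// minShares is the smallest value the cgroup v1 CPU controller accepts for
// cpu.shares; the kubelet uses a similar floor when throttling a killed pod.
const minShares = "2"

// reduceCPUShares writes the minimum share value into a pod cgroup's cpu.shares
// file. cgroupPath is assumed to look like /sys/fs/cgroup/cpu/kubepods/pod<uid>
// on a cgroup v1 node; this is illustrative, not the real kubelet code path.
func reduceCPUShares(cgroupPath string) error {
	sharesFile := filepath.Join(cgroupPath, "cpu.shares")
	if err := os.WriteFile(sharesFile, []byte(minShares), 0644); err != nil {
		return fmt.Errorf("failed to reduce cpu shares for %s: %v", cgroupPath, err)
	}
	return nil
}

func main() {
	// On a machine without this hypothetical cgroup path, this simply reports an error.
	if err := reduceCPUShares("/sys/fs/cgroup/cpu/kubepods/podexample"); err != nil {
		fmt.Println(err)
	}
}
```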
@@ -718,8 +693,9 @@ func (kl *Kubelet) podIsTerminated(pod *v1.Pod) bool {
 	return false
 }
-// Returns true if all required node-level resources that a pod was consuming have been reclaimed by the kubelet.
-// Reclaiming resources is a prerequisite to deleting a pod from the API server.
+// OkToDeletePod returns true if all required node-level resources that a pod was consuming have
+// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the
+// API server.
 func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
 	if pod.DeletionTimestamp == nil {
 		// We shouldnt delete pods whose DeletionTimestamp is not set
@@ -735,6 +711,13 @@ func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
 		glog.V(3).Infof("Pod %q is terminated, but some volumes have not been cleaned up", format.Pod(pod))
 		return false
 	}
+	if kl.kubeletConfiguration.CgroupsPerQOS {
+		pcm := kl.containerManager.NewPodContainerManager()
+		if pcm.Exists(pod) {
+			glog.V(3).Infof("Pod %q is terminated, but pod cgroup sandbox has not been cleaned up", format.Pod(pod))
+			return false
+		}
+	}
 	return true
 }
@@ -862,9 +845,9 @@ func (kl *Kubelet) HandlePodCleanups() error {
 		glog.Errorf("Failed cleaning up bandwidth limits: %v", err)
 	}
-	// Remove any cgroups in the hierarchy for pods that should no longer exist
+	// Remove any cgroups in the hierarchy for pods that are no longer running.
 	if kl.cgroupsPerQOS {
-		kl.cleanupOrphanedPodCgroups(cgroupPods, allPods, runningPods)
+		kl.cleanupOrphanedPodCgroups(cgroupPods, runningPods)
 	}
 	kl.backOff.GC()
@@ -1519,31 +1502,34 @@ func (kl *Kubelet) GetPortForward(podName, podNamespace string, podUID types.UID
 	}
 }
-// cleanupOrphanedPodCgroups removes the Cgroups of pods that should not be
-// running and whose volumes have been cleaned up.
-func (kl *Kubelet) cleanupOrphanedPodCgroups(
-	cgroupPods map[types.UID]cm.CgroupName,
-	pods []*v1.Pod, runningPods []*kubecontainer.Pod) {
-	// Add all running and existing terminated pods to a set allPods
-	allPods := sets.NewString()
-	for _, pod := range pods {
-		allPods.Insert(string(pod.UID))
-	}
+// cleanupOrphanedPodCgroups removes cgroups that should no longer exist.
+// it reconciles the cached state of cgroupPods with the specified list of runningPods
+func (kl *Kubelet) cleanupOrphanedPodCgroups(cgroupPods map[types.UID]cm.CgroupName, runningPods []*kubecontainer.Pod) {
+	// Add all running pods to the set that we want to preserve
+	podSet := sets.NewString()
 	for _, pod := range runningPods {
-		allPods.Insert(string(pod.ID))
+		podSet.Insert(string(pod.ID))
 	}
 	pcm := kl.containerManager.NewPodContainerManager()
 	// Iterate over all the found pods to verify if they should be running
 	for uid, val := range cgroupPods {
-		if allPods.Has(string(uid)) {
+		// if the pod is in the running set, its not a candidate for cleanup
+		if podSet.Has(string(uid)) {
 			continue
 		}
-		// If volumes have not been unmounted/detached, do not delete the cgroup in case so the charge does not go to the parent.
-		if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist {
-			glog.V(3).Infof("Orphaned pod %q found, but volumes are not cleaned up, Skipping cgroups deletion.", uid)
+		// If volumes have not been unmounted/detached, do not delete the cgroup
+		// so any memory backed volumes don't have their charges propagated to the
+		// parent croup. If the volumes still exist, reduce the cpu shares for any
+		// process in the cgroup to the minimum value while we wait. if the kubelet
+		// is configured to keep terminated volumes, we will delete the cgroup and not block.
+		if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist && !kl.kubeletConfiguration.KeepTerminatedPodVolumes {
+			glog.V(3).Infof("Orphaned pod %q found, but volumes not yet removed. Reducing cpu to minimum", uid)
+			if err := pcm.ReduceCPULimits(val); err != nil {
+				glog.Warningf("Failed to reduce cpu time for pod %q pending volume cleanup due to %v", uid, err)
+			}
 			continue
 		}
 		glog.V(3).Infof("Orphaned pod %q found, removing pod cgroups", uid)