Mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-07-23 11:50:44 +00:00
Merge pull request #41644 from derekwaynecarr/ensure-pod-cgroup-deleted
Automatic merge from submit-queue (batch tested with PRs 41644, 42020, 41753, 42206, 42212)

Ensure pod cgroup is deleted prior to deletion of pod

**What this PR does / why we need it**: This PR ensures that the kubelet removes the pod cgroup sandbox prior to deletion of a pod from the apiserver. We need this to ensure that the default behavior in the kubelet is to not leak resources.
Commit ddd8b5c1cf
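Before the diff below, here is a minimal, illustrative Go sketch of the gate this change introduces; the `pod` struct and `okToDelete` helper are hypothetical stand-ins rather than the kubelet's real types, but the shape of the check mirrors the new `OkToDeletePod` logic: a terminated pod is only eligible for deletion from the API server once its volumes are gone and, when pod-level cgroups (`CgroupsPerQOS`) are enabled, its pod cgroup sandbox no longer exists.

```go
package main

import "fmt"

// pod is a simplified, hypothetical stand-in for the kubelet's view of a pod.
type pod struct {
	name              string
	deletionRequested bool // DeletionTimestamp is set
	volumesExist      bool // volumes not yet unmounted/detached
	podCgroupExists   bool // pod-level cgroup sandbox still present on the node
}

// okToDelete mirrors the shape of the check this PR adds: deletion from the
// API server is blocked until node-level resources, including the pod cgroup
// when pod-level cgroups are enabled, have been reclaimed.
func okToDelete(p pod, cgroupsPerQOS bool) bool {
	if !p.deletionRequested {
		return false
	}
	if p.volumesExist {
		return false
	}
	if cgroupsPerQOS && p.podCgroupExists {
		return false
	}
	return true
}

func main() {
	p := pod{name: "nginx", deletionRequested: true, podCgroupExists: true}
	fmt.Println(okToDelete(p, true)) // false: cgroup sandbox not yet cleaned up

	p.podCgroupExists = false
	fmt.Println(okToDelete(p, true)) // true: safe to delete from the API server
}
```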
@@ -632,35 +632,10 @@ func (kl *Kubelet) killPod(pod *v1.Pod, runningPod *kubecontainer.Pod, status *k
 		return fmt.Errorf("one of the two arguments must be non-nil: runningPod, status")
 	}
 
-	// cache the pod cgroup Name for reducing the cpu resource limits of the pod cgroup once the pod is killed
-	pcm := kl.containerManager.NewPodContainerManager()
-	var podCgroup cm.CgroupName
-	reduceCpuLimits := true
-	if pod != nil {
-		podCgroup, _ = pcm.GetPodContainerName(pod)
-	} else {
-		// If the pod is nil then cgroup limit must have already
-		// been decreased earlier
-		reduceCpuLimits = false
-	}
-
 	// Call the container runtime KillPod method which stops all running containers of the pod
 	if err := kl.containerRuntime.KillPod(pod, p, gracePeriodOverride); err != nil {
 		return err
 	}
-	// At this point the pod might not completely free up cpu and memory resources.
-	// In such a case deleting the pod's cgroup might cause the pod's charges to be transferred
-	// to the parent cgroup. There might be various kinds of pod charges at this point.
-	// For example, any volume used by the pod that was backed by memory will have its
-	// pages charged to the pod cgroup until those volumes are removed by the kubelet.
-	// Hence we only reduce the cpu resource limits of the pod's cgroup
-	// and defer the responsibilty of destroying the pod's cgroup to the
-	// cleanup method and the housekeeping loop.
-	if reduceCpuLimits {
-		if err := pcm.ReduceCPULimits(podCgroup); err != nil {
-			glog.Warningf("Failed to reduce the CPU values to the minimum amount of shares: %v", err)
-		}
-	}
 	if err := kl.containerManager.UpdateQOSCgroups(); err != nil {
 		glog.V(2).Infof("Failed to update QoS cgroups while killing pod: %v", err)
 	}
@@ -718,8 +693,9 @@ func (kl *Kubelet) podIsTerminated(pod *v1.Pod) bool {
 	return false
 }
 
-// Returns true if all required node-level resources that a pod was consuming have been reclaimed by the kubelet.
-// Reclaiming resources is a prerequisite to deleting a pod from the API server.
+// OkToDeletePod returns true if all required node-level resources that a pod was consuming have
+// been reclaimed by the kubelet. Reclaiming resources is a prerequisite to deleting a pod from the
+// API server.
 func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
 	if pod.DeletionTimestamp == nil {
 		// We shouldnt delete pods whose DeletionTimestamp is not set
@@ -735,6 +711,13 @@ func (kl *Kubelet) OkToDeletePod(pod *v1.Pod) bool {
 		glog.V(3).Infof("Pod %q is terminated, but some volumes have not been cleaned up", format.Pod(pod))
 		return false
 	}
+	if kl.kubeletConfiguration.CgroupsPerQOS {
+		pcm := kl.containerManager.NewPodContainerManager()
+		if pcm.Exists(pod) {
+			glog.V(3).Infof("Pod %q is terminated, but pod cgroup sandbox has not been cleaned up", format.Pod(pod))
+			return false
+		}
+	}
 	return true
 }
 
@@ -862,9 +845,9 @@ func (kl *Kubelet) HandlePodCleanups() error {
 		glog.Errorf("Failed cleaning up bandwidth limits: %v", err)
 	}
 
-	// Remove any cgroups in the hierarchy for pods that should no longer exist
+	// Remove any cgroups in the hierarchy for pods that are no longer running.
 	if kl.cgroupsPerQOS {
-		kl.cleanupOrphanedPodCgroups(cgroupPods, allPods, runningPods)
+		kl.cleanupOrphanedPodCgroups(cgroupPods, runningPods)
 	}
 
 	kl.backOff.GC()
@@ -1519,31 +1502,34 @@ func (kl *Kubelet) GetPortForward(podName, podNamespace string, podUID types.UID
 	}
 }
 
-// cleanupOrphanedPodCgroups removes the Cgroups of pods that should not be
-// running and whose volumes have been cleaned up.
-func (kl *Kubelet) cleanupOrphanedPodCgroups(
-	cgroupPods map[types.UID]cm.CgroupName,
-	pods []*v1.Pod, runningPods []*kubecontainer.Pod) {
-	// Add all running and existing terminated pods to a set allPods
-	allPods := sets.NewString()
-	for _, pod := range pods {
-		allPods.Insert(string(pod.UID))
-	}
+// cleanupOrphanedPodCgroups removes cgroups that should no longer exist.
+// it reconciles the cached state of cgroupPods with the specified list of runningPods
+func (kl *Kubelet) cleanupOrphanedPodCgroups(cgroupPods map[types.UID]cm.CgroupName, runningPods []*kubecontainer.Pod) {
+	// Add all running pods to the set that we want to preserve
+	podSet := sets.NewString()
 	for _, pod := range runningPods {
-		allPods.Insert(string(pod.ID))
+		podSet.Insert(string(pod.ID))
 	}
 
 	pcm := kl.containerManager.NewPodContainerManager()
 
 	// Iterate over all the found pods to verify if they should be running
 	for uid, val := range cgroupPods {
-		if allPods.Has(string(uid)) {
+		// if the pod is in the running set, its not a candidate for cleanup
+		if podSet.Has(string(uid)) {
 			continue
 		}
 
-		// If volumes have not been unmounted/detached, do not delete the cgroup in case so the charge does not go to the parent.
-		if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist {
-			glog.V(3).Infof("Orphaned pod %q found, but volumes are not cleaned up, Skipping cgroups deletion.", uid)
+		// If volumes have not been unmounted/detached, do not delete the cgroup
+		// so any memory backed volumes don't have their charges propagated to the
+		// parent croup. If the volumes still exist, reduce the cpu shares for any
+		// process in the cgroup to the minimum value while we wait. if the kubelet
+		// is configured to keep terminated volumes, we will delete the cgroup and not block.
+		if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist && !kl.kubeletConfiguration.KeepTerminatedPodVolumes {
+			glog.V(3).Infof("Orphaned pod %q found, but volumes not yet removed. Reducing cpu to minimum", uid)
+			if err := pcm.ReduceCPULimits(val); err != nil {
+				glog.Warningf("Failed to reduce cpu time for pod %q pending volume cleanup due to %v", uid, err)
+			}
 			continue
 		}
 		glog.V(3).Infof("Orphaned pod %q found, removing pod cgroups", uid)