Merge pull request #45896 from dashpole/disk_pressure_reclaim

Automatic merge from submit-queue

Delete all dead containers and sandboxes when under disk pressure.

This PR modifies the eviction manager to register dead container and sandbox garbage collection as a resource reclaim function for disk.  It also modifies the container GC logic so that containers and sandboxes belonging to pods that are terminated, but not yet deleted, can be removed.

It still does not delete containers younger than minGcAge.  This should prevent nodes from entering a permanently bad state when the entire disk is occupied by pods that are terminated (in the Failed or Succeeded state) but not deleted.

There are two improvements we should consider making in the future:

- Track the disk space and inodes reclaimed by deleting containers.  We currently do not track this, which prevents us from determining whether deleting containers resolved the disk pressure, so we may still evict a pod even when deleting dead containers freed enough space.  (A rough sketch of this idea follows the list.)
- Once we can track reclaimed disk space and inodes, we should consider deleting only as many containers as needed to relieve disk pressure.  This should help avoid deleting a massive number of containers all at once and overwhelming the runtime.
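A rough sketch of the tracking idea in the first bullet, assuming a Linux node and reusing the new `DeleteAllUnusedContainers` entry point; the `statfsFreeBytes` and `reclaimDeadContainers` helpers are illustrative names, not code from this PR:

```go
package sketch

import "syscall"

// statfsFreeBytes returns the available bytes on the filesystem containing
// path (Linux-specific; wraps statfs(2)).
func statfsFreeBytes(path string) (int64, error) {
	var st syscall.Statfs_t
	if err := syscall.Statfs(path, &st); err != nil {
		return 0, err
	}
	return int64(st.Bavail) * int64(st.Bsize), nil
}

// containerDeleter is the subset of ContainerGC this sketch needs.
type containerDeleter interface {
	DeleteAllUnusedContainers() error
}

// reclaimDeadContainers deletes unused containers and estimates the disk
// space freed by sampling free space before and after the call.
func reclaimDeadContainers(gc containerDeleter, rootDir string) (int64, error) {
	before, err := statfsFreeBytes(rootDir)
	if err != nil {
		return 0, err
	}
	if err := gc.DeleteAllUnusedContainers(); err != nil {
		return 0, err
	}
	after, err := statfsFreeBytes(rootDir)
	if err != nil {
		return 0, err
	}
	freed := after - before
	if freed < 0 {
		// Other writers may have consumed space concurrently.
		freed = 0
	}
	return freed, nil
}
```

Sampling free space this way is only an approximation, since other writers can consume or release space concurrently, but it would be enough for the manager to decide whether pod eviction is still necessary.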

/assign @vishh 
cc @derekwaynecarr 

```release-note
Disk Pressure triggers the deletion of terminated containers on the node.
```
Kubernetes Submit Queue 2017-06-03 23:43:46 -07:00 committed by GitHub
commit 3fdf6c3d14
16 changed files with 193 additions and 105 deletions

View File

@ -39,7 +39,15 @@ type ContainerGCPolicy struct {
// Implementation is thread-compatible.
type ContainerGC interface {
// Garbage collect containers.
GarbageCollect(allSourcesReady bool) error
GarbageCollect() error
// Deletes all unused containers, including containers belonging to pods that are terminated but not deleted
DeleteAllUnusedContainers() error
}
// SourcesReadyProvider knows how to determine if configuration sources are ready
type SourcesReadyProvider interface {
// AllReady returns true if the currently configured sources have all been seen.
AllReady() bool
}
// TODO(vmarmol): Preferentially remove pod infra containers.
@ -49,20 +57,28 @@ type realContainerGC struct {
// Policy for garbage collection.
policy ContainerGCPolicy
// sourcesReadyProvider provides the readiness of kubelet configuration sources.
sourcesReadyProvider SourcesReadyProvider
}
// New ContainerGC instance with the specified policy.
func NewContainerGC(runtime Runtime, policy ContainerGCPolicy) (ContainerGC, error) {
func NewContainerGC(runtime Runtime, policy ContainerGCPolicy, sourcesReadyProvider SourcesReadyProvider) (ContainerGC, error) {
if policy.MinAge < 0 {
return nil, fmt.Errorf("invalid minimum garbage collection age: %v", policy.MinAge)
}
return &realContainerGC{
runtime: runtime,
policy: policy,
runtime: runtime,
policy: policy,
sourcesReadyProvider: sourcesReadyProvider,
}, nil
}
func (cgc *realContainerGC) GarbageCollect(allSourcesReady bool) error {
return cgc.runtime.GarbageCollect(cgc.policy, allSourcesReady)
func (cgc *realContainerGC) GarbageCollect() error {
return cgc.runtime.GarbageCollect(cgc.policy, cgc.sourcesReadyProvider.AllReady(), false)
}
func (cgc *realContainerGC) DeleteAllUnusedContainers() error {
return cgc.runtime.GarbageCollect(cgc.policy, cgc.sourcesReadyProvider.AllReady(), true)
}
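For context, a minimal sketch of how the split interface is intended to be driven, assuming the kubelet keeps its periodic GC loop and the eviction manager calls the aggressive path only under disk pressure; `runPeriodicGC` and `reclaimDiskByDeletingContainers` are illustrative names, not part of the diff:

```go
package sketch

import (
	"log"
	"time"
)

// ContainerGC mirrors the interface above: routine GC plus an aggressive
// "delete everything unused" path for disk-pressure reclaim.
type ContainerGC interface {
	GarbageCollect() error
	DeleteAllUnusedContainers() error
}

// runPeriodicGC mirrors the kubelet's background loop: it only removes
// containers allowed by the GC policy and belonging to deleted pods.
func runPeriodicGC(gc ContainerGC, period time.Duration, stop <-chan struct{}) {
	ticker := time.NewTicker(period)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			if err := gc.GarbageCollect(); err != nil {
				log.Printf("container garbage collection failed: %v", err)
			}
		case <-stop:
			return
		}
	}
}

// reclaimDiskByDeletingContainers mirrors the eviction manager's reclaim path:
// it also removes containers of pods that are terminated but not deleted.
func reclaimDiskByDeletingContainers(gc ContainerGC) error {
	return gc.DeleteAllUnusedContainers()
}
```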

View File

@ -82,8 +82,10 @@ type Runtime interface {
// complete list of pods from all available sources (e.g., apiserver, http,
// file). In this case, garbage collector should refrain itself from aggressive
// behavior such as removing all containers of unrecognized pods (yet).
// If evictNonDeletedPods is set to true, containers and sandboxes belonging to pods
// that are terminated, but not deleted will be evicted. Otherwise, only deleted pods will be GC'd.
// TODO: Revisit this method and make it cleaner.
GarbageCollect(gcPolicy ContainerGCPolicy, allSourcesReady bool) error
GarbageCollect(gcPolicy ContainerGCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error
// Syncs the running pod into the desired pod.
SyncPod(pod *v1.Pod, apiPodStatus v1.PodStatus, podStatus *PodStatus, pullSecrets []v1.Secret, backOff *flowcontrol.Backoff) PodSyncResult
// KillPod kills all the containers of a pod. Pod may be nil, running pod must not be.

View File

@ -431,7 +431,7 @@ func (f *FakeRuntime) GetPodContainerID(pod *Pod) (ContainerID, error) {
return ContainerID{}, f.Err
}
func (f *FakeRuntime) GarbageCollect(gcPolicy ContainerGCPolicy, ready bool) error {
func (f *FakeRuntime) GarbageCollect(gcPolicy ContainerGCPolicy, ready bool, evictNonDeletedPods bool) error {
f.Lock()
defer f.Unlock()

View File

@ -140,8 +140,8 @@ func (r *Mock) GetPodContainerID(pod *Pod) (ContainerID, error) {
return ContainerID{}, args.Error(0)
}
func (r *Mock) GarbageCollect(gcPolicy ContainerGCPolicy, ready bool) error {
args := r.Called(gcPolicy, ready)
func (r *Mock) GarbageCollect(gcPolicy ContainerGCPolicy, ready bool, evictNonDeletedPods bool) error {
args := r.Called(gcPolicy, ready, evictNonDeletedPods)
return args.Error(0)
}

View File

@ -58,6 +58,8 @@ type managerImpl struct {
killPodFunc KillPodFunc
// the interface that knows how to do image gc
imageGC ImageGC
// the interface that knows how to do container gc
containerGC ContainerGC
// protects access to internal state
sync.RWMutex
// node conditions are the set of conditions present
@ -95,6 +97,7 @@ func NewManager(
config Config,
killPodFunc KillPodFunc,
imageGC ImageGC,
containerGC ContainerGC,
recorder record.EventRecorder,
nodeRef *clientv1.ObjectReference,
clock clock.Clock) (Manager, lifecycle.PodAdmitHandler) {
@ -102,6 +105,7 @@ func NewManager(
clock: clock,
killPodFunc: killPodFunc,
imageGC: imageGC,
containerGC: containerGC,
config: config,
recorder: recorder,
summaryProvider: summaryProvider,
@ -223,8 +227,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
}
m.dedicatedImageFs = &hasImageFs
m.resourceToRankFunc = buildResourceToRankFunc(hasImageFs)
m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasImageFs)
m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs)
}
activePods := podFunc()

View File

@ -77,17 +77,24 @@ func (m *mockNodeProvider) GetNode() (*v1.Node, error) {
return &m.node, nil
}
// mockImageGC is used to simulate invoking image garbage collection.
type mockImageGC struct {
err error
freed int64
invoked bool
// mockDiskGC is used to simulate invoking image and container garbage collection.
type mockDiskGC struct {
err error
imageBytesFreed int64
imageGCInvoked bool
containerGCInvoked bool
}
// DeleteUnusedImages returns the mocked values.
func (m *mockImageGC) DeleteUnusedImages() (int64, error) {
m.invoked = true
return m.freed, m.err
func (m *mockDiskGC) DeleteUnusedImages() (int64, error) {
m.imageGCInvoked = true
return m.imageBytesFreed, m.err
}
// DeleteAllUnusedContainers returns the mocked value
func (m *mockDiskGC) DeleteAllUnusedContainers() error {
m.containerGCInvoked = true
return m.err
}
func makePodWithMemoryStats(name string, requests v1.ResourceList, limits v1.ResourceList, memoryWorkingSet string) (*v1.Pod, statsapi.PodStats) {
@ -194,7 +201,7 @@ func TestMemoryPressure(t *testing.T) {
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
imageGC := &mockDiskGC{imageBytesFreed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@ -412,7 +419,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
diskGC := &mockDiskGC{imageBytesFreed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@ -440,7 +447,8 @@ func TestDiskPressureNodeFs(t *testing.T) {
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
@ -610,7 +618,7 @@ func TestMinReclaim(t *testing.T) {
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
diskGC := &mockDiskGC{imageBytesFreed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@ -633,7 +641,8 @@ func TestMinReclaim(t *testing.T) {
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
@ -750,7 +759,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGcFree := resource.MustParse("700Mi")
imageGC := &mockImageGC{freed: imageGcFree.Value(), err: nil}
diskGC := &mockDiskGC{imageBytesFreed: imageGcFree.Value(), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@ -773,7 +782,8 @@ func TestNodeReclaimFuncs(t *testing.T) {
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
@ -801,7 +811,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
}
// verify image gc was invoked
if !imageGC.invoked {
if !diskGC.imageGCInvoked || !diskGC.containerGCInvoked {
t.Errorf("Manager should have invoked image gc")
}
@ -811,7 +821,8 @@ func TestNodeReclaimFuncs(t *testing.T) {
}
// reset state
imageGC.invoked = false
diskGC.imageGCInvoked = false
diskGC.containerGCInvoked = false
// remove disk pressure
fakeClock.Step(20 * time.Minute)
@ -833,8 +844,8 @@ func TestNodeReclaimFuncs(t *testing.T) {
t.Errorf("Manager should report disk pressure")
}
// ensure image gc was invoked
if !imageGC.invoked {
// ensure disk gc was invoked
if !diskGC.imageGCInvoked || !diskGC.containerGCInvoked {
t.Errorf("Manager should have invoked image gc")
}
@ -850,8 +861,9 @@ func TestNodeReclaimFuncs(t *testing.T) {
// reduce disk pressure
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
imageGC.invoked = false // reset state
podKiller.pod = nil // reset state
diskGC.imageGCInvoked = false // reset state
diskGC.containerGCInvoked = false // reset state
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have disk pressure (because transition period not yet met)
@ -860,7 +872,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
}
// no image gc should have occurred
if imageGC.invoked {
if diskGC.imageGCInvoked || diskGC.containerGCInvoked {
t.Errorf("Manager chose to perform image gc when it was not neeed")
}
@ -872,8 +884,9 @@ func TestNodeReclaimFuncs(t *testing.T) {
// move the clock past transition period to ensure that we stop reporting pressure
fakeClock.Step(5 * time.Minute)
summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
imageGC.invoked = false // reset state
podKiller.pod = nil // reset state
diskGC.imageGCInvoked = false // reset state
diskGC.containerGCInvoked = false // reset state
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have disk pressure (because transition period met)
@ -882,7 +895,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
}
// no image gc should have occurred
if imageGC.invoked {
if diskGC.imageGCInvoked || diskGC.containerGCInvoked {
t.Errorf("Manager chose to perform image gc when it was not neeed")
}
@ -943,7 +956,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
diskGC := &mockDiskGC{imageBytesFreed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@ -971,7 +984,8 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
@ -1144,7 +1158,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
diskGC := &mockDiskGC{imageBytesFreed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{
Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: "",
}
@ -1174,7 +1188,8 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
@ -1276,7 +1291,7 @@ func TestAllocatableMemoryPressure(t *testing.T) {
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
diskGC := &mockDiskGC{imageBytesFreed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@ -1296,7 +1311,8 @@ func TestAllocatableMemoryPressure(t *testing.T) {
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,

View File

@ -1019,32 +1019,34 @@ func PodIsEvicted(podStatus v1.PodStatus) bool {
}
// buildResourceToNodeReclaimFuncs returns reclaim functions associated with resources.
func buildResourceToNodeReclaimFuncs(imageGC ImageGC, withImageFs bool) map[v1.ResourceName]nodeReclaimFuncs {
func buildResourceToNodeReclaimFuncs(imageGC ImageGC, containerGC ContainerGC, withImageFs bool) map[v1.ResourceName]nodeReclaimFuncs {
resourceToReclaimFunc := map[v1.ResourceName]nodeReclaimFuncs{}
// usage of an imagefs is optional
if withImageFs {
// with an imagefs, nodefs pressure should just delete logs
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs()}
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs()}
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{}
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{}
// with an imagefs, imagefs pressure should delete unused images
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteImages(imageGC, true)}
resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{deleteImages(imageGC, false)}
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteTerminatedContainers(containerGC), deleteImages(imageGC, true)}
resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{deleteTerminatedContainers(containerGC), deleteImages(imageGC, false)}
} else {
// without an imagefs, nodefs pressure should delete logs, and unused images
// since imagefs and nodefs share a common device, they share common reclaim functions
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, true)}
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, false)}
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, true)}
resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, false)}
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteTerminatedContainers(containerGC), deleteImages(imageGC, true)}
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteTerminatedContainers(containerGC), deleteImages(imageGC, false)}
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteTerminatedContainers(containerGC), deleteImages(imageGC, true)}
resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{deleteTerminatedContainers(containerGC), deleteImages(imageGC, false)}
}
return resourceToReclaimFunc
}
// deleteLogs will delete logs to free up disk pressure.
func deleteLogs() nodeReclaimFunc {
// deleteTerminatedContainers will delete terminated containers to free up disk pressure.
func deleteTerminatedContainers(containerGC ContainerGC) nodeReclaimFunc {
return func() (*resource.Quantity, error) {
// TODO: not yet supported.
return resource.NewQuantity(int64(0), resource.BinarySI), nil
glog.Infof("eviction manager: attempting to delete unused containers")
err := containerGC.DeleteAllUnusedContainers()
// Calculating bytes freed is not yet supported.
return resource.NewQuantity(int64(0), resource.BinarySI), err
}
}
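To make the wiring above concrete, here is a simplified, hypothetical sketch of how a manager could walk the `nodeReclaimFuncs` registered for a starved resource before falling back to evicting pods; `runReclaimFuncs` and the `pressureRelieved` callback are assumptions for illustration, not the eviction manager's actual code:

```go
package sketch

// nodeReclaimFunc frees node-level resources and reports the bytes it freed.
// As noted in the PR description, freed bytes are not yet tracked, so the
// container-GC reclaim function currently reports zero.
type nodeReclaimFunc func() (freedBytes int64, err error)

// runReclaimFuncs runs the reclaim functions in order (terminated containers
// first, then unused images, matching buildResourceToNodeReclaimFuncs) and
// reports whether pressure was relieved without evicting any pod.
func runReclaimFuncs(funcs []nodeReclaimFunc, pressureRelieved func() bool) (bool, error) {
	for _, reclaim := range funcs {
		if _, err := reclaim(); err != nil {
			return false, err
		}
		if pressureRelieved() {
			return true, nil
		}
	}
	return false, nil
}
```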

View File

@ -81,6 +81,13 @@ type ImageGC interface {
DeleteUnusedImages() (int64, error)
}
// ContainerGC is responsible for performing garbage collection of unused containers.
type ContainerGC interface {
// DeleteAllUnusedContainers deletes all unused containers, even those that belong to pods that are terminated, but not deleted.
// It returns an error if it is unsuccessful.
DeleteAllUnusedContainers() error
}
// KillPodFunc kills a pod.
// The pod status is updated, and then it is killed with the specified grace period.
// This function must block until either the pod is killed or an error is encountered.

View File

@ -671,7 +671,7 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
klet.updatePodCIDR(kubeCfg.PodCIDR)
// setup containerGC
containerGC, err := kubecontainer.NewContainerGC(klet.containerRuntime, containerGCPolicy)
containerGC, err := kubecontainer.NewContainerGC(klet.containerRuntime, containerGCPolicy, klet.sourcesReady)
if err != nil {
return nil, err
}
@ -761,7 +761,7 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
klet.setNodeStatusFuncs = klet.defaultNodeStatusFuncs()
// setup eviction manager
evictionManager, evictionAdmitHandler := eviction.NewManager(klet.resourceAnalyzer, evictionConfig, killPodNow(klet.podWorkers, kubeDeps.Recorder), klet.imageManager, kubeDeps.Recorder, nodeRef, klet.clock)
evictionManager, evictionAdmitHandler := eviction.NewManager(klet.resourceAnalyzer, evictionConfig, killPodNow(klet.podWorkers, kubeDeps.Recorder), klet.imageManager, klet.containerGC, kubeDeps.Recorder, nodeRef, klet.clock)
klet.evictionManager = evictionManager
klet.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler)
@ -1184,7 +1184,7 @@ func (kl *Kubelet) setupDataDirs() error {
func (kl *Kubelet) StartGarbageCollection() {
loggedContainerGCFailure := false
go wait.Until(func() {
if err := kl.containerGC.GarbageCollect(kl.sourcesReady.AllReady()); err != nil {
if err := kl.containerGC.GarbageCollect(); err != nil {
glog.Errorf("Container garbage collection failed: %v", err)
kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ContainerGCFailed, err.Error())
loggedContainerGCFailure = true

View File

@ -270,7 +270,7 @@ func newTestKubeletWithImageList(
Namespace: "",
}
// setup eviction manager
evictionManager, evictionAdmitHandler := eviction.NewManager(kubelet.resourceAnalyzer, eviction.Config{}, killPodNow(kubelet.podWorkers, fakeRecorder), kubelet.imageManager, fakeRecorder, nodeRef, kubelet.clock)
evictionManager, evictionAdmitHandler := eviction.NewManager(kubelet.resourceAnalyzer, eviction.Config{}, killPodNow(kubelet.podWorkers, fakeRecorder), kubelet.imageManager, kubelet.containerGC, fakeRecorder, nodeRef, kubelet.clock)
kubelet.evictionManager = evictionManager
kubelet.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler)

View File

@ -209,7 +209,7 @@ func (cgc *containerGC) evictableContainers(minAge time.Duration) (containersByE
}
// evict all containers that are evictable
func (cgc *containerGC) evictContainers(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool) error {
func (cgc *containerGC) evictContainers(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
// Separate containers by evict units.
evictUnits, err := cgc.evictableContainers(gcPolicy.MinAge)
if err != nil {
@ -219,7 +219,7 @@ func (cgc *containerGC) evictContainers(gcPolicy kubecontainer.ContainerGCPolicy
// Remove deleted pod containers if all sources are ready.
if allSourcesReady {
for key, unit := range evictUnits {
if cgc.isPodDeleted(key.uid) {
if cgc.isPodDeleted(key.uid) || evictNonDeletedPods {
cgc.removeOldestN(unit, len(unit)) // Remove all.
delete(evictUnits, key)
}
@ -261,7 +261,7 @@ func (cgc *containerGC) evictContainers(gcPolicy kubecontainer.ContainerGCPolicy
// 2. contains no containers.
// 3. belong to a non-existent (i.e., already removed) pod, or is not the
// most recently created sandbox for the pod.
func (cgc *containerGC) evictSandboxes() error {
func (cgc *containerGC) evictSandboxes(evictNonDeletedPods bool) error {
containers, err := cgc.manager.getKubeletContainers(true)
if err != nil {
return err
@ -307,7 +307,7 @@ func (cgc *containerGC) evictSandboxes() error {
}
for podUID, sandboxes := range sandboxesByPod {
if cgc.isPodDeleted(podUID) {
if cgc.isPodDeleted(podUID) || evictNonDeletedPods {
// Remove all evictable sandboxes if the pod has been removed.
// Note that the latest dead sandbox is also removed if there is
// already an active one.
@ -367,14 +367,14 @@ func (cgc *containerGC) evictPodLogsDirectories(allSourcesReady bool) error {
// * removes oldest dead containers by enforcing gcPolicy.MaxContainers.
// * gets evictable sandboxes which are not ready and contains no containers.
// * removes evictable sandboxes.
func (cgc *containerGC) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool) error {
func (cgc *containerGC) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
// Remove evictable containers
if err := cgc.evictContainers(gcPolicy, allSourcesReady); err != nil {
if err := cgc.evictContainers(gcPolicy, allSourcesReady, evictNonDeletedPods); err != nil {
return err
}
// Remove sandboxes with zero containers
if err := cgc.evictSandboxes(); err != nil {
if err := cgc.evictSandboxes(evictNonDeletedPods); err != nil {
return err
}
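A condensed restatement of the per-pod rule in `evictContainers` above (`evictSandboxes` applies the same `isPodDeleted || evictNonDeletedPods` test without the sources-ready gate); `shouldRemoveAllForPod` and its `podDeleted` argument are illustrative stand-ins for `cgc.isPodDeleted(key.uid)`:

```go
package sketch

// shouldRemoveAllForPod condenses the per-pod eligibility rule: once all
// configuration sources are ready, containers of deleted pods are always
// removable; on the eviction path (evictNonDeletedPods == true), pods that
// are terminated but not yet deleted become removable as well.
func shouldRemoveAllForPod(allSourcesReady, evictNonDeletedPods, podDeleted bool) bool {
	if !allSourcesReady {
		// Without the complete pod list the collector cannot tell which pods
		// are merely unknown yet, so it refrains from aggressive removal.
		return false
	}
	return podDeleted || evictNonDeletedPods
}
```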

View File

@ -62,27 +62,30 @@ func TestSandboxGC(t *testing.T) {
}
for c, test := range []struct {
description string // description of the test case
sandboxes []sandboxTemplate // templates of sandboxes
containers []containerTemplate // templates of containers
minAge time.Duration // sandboxMinGCAge
remain []int // template indexes of remaining sandboxes
description string // description of the test case
sandboxes []sandboxTemplate // templates of sandboxes
containers []containerTemplate // templates of containers
minAge time.Duration // sandboxMinGCAge
remain []int // template indexes of remaining sandboxes
evictNonDeletedPods bool
}{
{
description: "notready sandboxes without containers for deleted pods should be garbage collected.",
sandboxes: []sandboxTemplate{
makeGCSandbox(pods[2], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, false, 0),
},
containers: []containerTemplate{},
remain: []int{},
containers: []containerTemplate{},
remain: []int{},
evictNonDeletedPods: false,
},
{
description: "ready sandboxes without containers for deleted pods should not be garbage collected.",
sandboxes: []sandboxTemplate{
makeGCSandbox(pods[2], 0, runtimeapi.PodSandboxState_SANDBOX_READY, false, 0),
},
containers: []containerTemplate{},
remain: []int{0},
containers: []containerTemplate{},
remain: []int{0},
evictNonDeletedPods: false,
},
{
description: "sandboxes for existing pods should not be garbage collected.",
@ -90,8 +93,19 @@ func TestSandboxGC(t *testing.T) {
makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_READY, true, 0),
makeGCSandbox(pods[1], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, 0),
},
containers: []containerTemplate{},
remain: []int{0, 1},
containers: []containerTemplate{},
remain: []int{0, 1},
evictNonDeletedPods: false,
},
{
description: "non-running sandboxes for existing pods should be garbage collected if evictNonDeletedPods is set.",
sandboxes: []sandboxTemplate{
makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_READY, true, 0),
makeGCSandbox(pods[1], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, 0),
},
containers: []containerTemplate{},
remain: []int{0},
evictNonDeletedPods: true,
},
{
description: "sandbox with containers should not be garbage collected.",
@ -101,7 +115,8 @@ func TestSandboxGC(t *testing.T) {
containers: []containerTemplate{
{pod: pods[0], container: &pods[0].Spec.Containers[0], state: runtimeapi.ContainerState_CONTAINER_EXITED},
},
remain: []int{0},
remain: []int{0},
evictNonDeletedPods: false,
},
{
description: "multiple sandboxes should be handled properly.",
@ -120,7 +135,8 @@ func TestSandboxGC(t *testing.T) {
containers: []containerTemplate{
{pod: pods[1], container: &pods[1].Spec.Containers[0], sandboxAttempt: 1, state: runtimeapi.ContainerState_CONTAINER_EXITED},
},
remain: []int{0, 2},
remain: []int{0, 2},
evictNonDeletedPods: false,
},
} {
t.Logf("TestCase #%d: %+v", c, test)
@ -129,7 +145,7 @@ func TestSandboxGC(t *testing.T) {
fakeRuntime.SetFakeSandboxes(fakeSandboxes)
fakeRuntime.SetFakeContainers(fakeContainers)
err := m.containerGC.evictSandboxes()
err := m.containerGC.evictSandboxes(test.evictNonDeletedPods)
assert.NoError(t, err)
realRemain, err := fakeRuntime.ListPodSandbox(nil)
assert.NoError(t, err)
@ -165,18 +181,20 @@ func TestContainerGC(t *testing.T) {
defaultGCPolicy := kubecontainer.ContainerGCPolicy{MinAge: time.Hour, MaxPerPodContainer: 2, MaxContainers: 6}
for c, test := range []struct {
description string // description of the test case
containers []containerTemplate // templates of containers
policy *kubecontainer.ContainerGCPolicy // container gc policy
remain []int // template indexes of remaining containers
description string // description of the test case
containers []containerTemplate // templates of containers
policy *kubecontainer.ContainerGCPolicy // container gc policy
remain []int // template indexes of remaining containers
evictNonDeletedPods bool
}{
{
description: "all containers should be removed when max container limit is 0",
containers: []containerTemplate{
makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
policy: &kubecontainer.ContainerGCPolicy{MinAge: time.Minute, MaxPerPodContainer: 1, MaxContainers: 0},
remain: []int{},
policy: &kubecontainer.ContainerGCPolicy{MinAge: time.Minute, MaxPerPodContainer: 1, MaxContainers: 0},
remain: []int{},
evictNonDeletedPods: false,
},
{
description: "max containers should be complied when no max per pod container limit is set",
@ -187,8 +205,9 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
policy: &kubecontainer.ContainerGCPolicy{MinAge: time.Minute, MaxPerPodContainer: -1, MaxContainers: 4},
remain: []int{0, 1, 2, 3},
policy: &kubecontainer.ContainerGCPolicy{MinAge: time.Minute, MaxPerPodContainer: -1, MaxContainers: 4},
remain: []int{0, 1, 2, 3},
evictNonDeletedPods: false,
},
{
description: "no containers should be removed if both max container and per pod container limits are not set",
@ -197,8 +216,9 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
policy: &kubecontainer.ContainerGCPolicy{MinAge: time.Minute, MaxPerPodContainer: -1, MaxContainers: -1},
remain: []int{0, 1, 2},
policy: &kubecontainer.ContainerGCPolicy{MinAge: time.Minute, MaxPerPodContainer: -1, MaxContainers: -1},
remain: []int{0, 1, 2},
evictNonDeletedPods: false,
},
{
description: "recently started containers should not be removed",
@ -207,7 +227,8 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo", "bar", 1, time.Now().UnixNano(), runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo", "bar", 0, time.Now().UnixNano(), runtimeapi.ContainerState_CONTAINER_EXITED),
},
remain: []int{0, 1, 2},
remain: []int{0, 1, 2},
evictNonDeletedPods: false,
},
{
description: "oldest containers should be removed when per pod container limit exceeded",
@ -216,7 +237,8 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
remain: []int{0, 1},
remain: []int{0, 1},
evictNonDeletedPods: false,
},
{
description: "running containers should not be removed",
@ -225,7 +247,8 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_RUNNING),
},
remain: []int{0, 1, 2},
remain: []int{0, 1, 2},
evictNonDeletedPods: false,
},
{
description: "no containers should be removed when limits are not exceeded",
@ -233,7 +256,8 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
remain: []int{0, 1},
remain: []int{0, 1},
evictNonDeletedPods: false,
},
{
description: "max container count should apply per (UID, container) pair",
@ -248,7 +272,8 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo2", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo2", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
remain: []int{0, 1, 3, 4, 6, 7},
remain: []int{0, 1, 3, 4, 6, 7},
evictNonDeletedPods: false,
},
{
description: "max limit should apply and try to keep from every pod",
@ -264,7 +289,8 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo4", "bar4", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo4", "bar4", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
remain: []int{0, 2, 4, 6, 8},
remain: []int{0, 2, 4, 6, 8},
evictNonDeletedPods: false,
},
{
description: "oldest pods should be removed if limit exceeded",
@ -280,7 +306,21 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo6", "bar6", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo7", "bar7", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
},
remain: []int{0, 2, 4, 6, 8, 9},
remain: []int{0, 2, 4, 6, 8, 9},
evictNonDeletedPods: false,
},
{
description: "all non-running containers should be removed when evictNonDeletedPods is set",
containers: []containerTemplate{
makeGCContainer("foo", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo1", "bar1", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo1", "bar1", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo2", "bar2", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo3", "bar3", 0, 0, runtimeapi.ContainerState_CONTAINER_RUNNING),
},
remain: []int{5},
evictNonDeletedPods: true,
},
{
description: "containers for deleted pods should be removed",
@ -292,7 +332,8 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("deleted", "bar1", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("deleted", "bar1", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
remain: []int{0, 1, 2},
remain: []int{0, 1, 2},
evictNonDeletedPods: false,
},
} {
t.Logf("TestCase #%d: %+v", c, test)
@ -302,7 +343,7 @@ func TestContainerGC(t *testing.T) {
if test.policy == nil {
test.policy = &defaultGCPolicy
}
err := m.containerGC.evictContainers(*test.policy, true)
err := m.containerGC.evictContainers(*test.policy, true, test.evictNonDeletedPods)
assert.NoError(t, err)
realRemain, err := fakeRuntime.ListContainers(nil)
assert.NoError(t, err)

View File

@ -877,8 +877,8 @@ func (m *kubeGenericRuntimeManager) GetNetNS(_ kubecontainer.ContainerID) (strin
}
// GarbageCollect removes dead containers using the specified container gc policy.
func (m *kubeGenericRuntimeManager) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool) error {
return m.containerGC.GarbageCollect(gcPolicy, allSourcesReady)
func (m *kubeGenericRuntimeManager) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
return m.containerGC.GarbageCollect(gcPolicy, allSourcesReady, evictNonDeletedPods)
}
// GetPodContainerID gets pod sandbox ID

View File

@ -1955,7 +1955,7 @@ func (r *Runtime) getPodSystemdServiceFiles() ([]os.FileInfo, error) {
// - If the number of containers exceeds gcPolicy.MaxContainers,
// then containers whose ages are older than gcPolicy.minAge will
// be removed.
func (r *Runtime) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool) error {
func (r *Runtime) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool, _ bool) error {
var errlist []error
var totalInactiveContainers int
var inactivePods []*rktapi.Pod

View File

@ -1831,7 +1831,8 @@ func TestGarbageCollect(t *testing.T) {
}
allSourcesReady := true
err := rkt.GarbageCollect(tt.gcPolicy, allSourcesReady)
evictNonDeletedPods := false
err := rkt.GarbageCollect(tt.gcPolicy, allSourcesReady, evictNonDeletedPods)
assert.NoError(t, err, testCaseHint)
sort.Sort(sortedStringList(tt.expectedCommands))

View File

@ -126,7 +126,7 @@ func TestRunOnce(t *testing.T) {
fakeKillPodFunc := func(pod *v1.Pod, podStatus v1.PodStatus, gracePeriodOverride *int64) error {
return nil
}
evictionManager, evictionAdmitHandler := eviction.NewManager(kb.resourceAnalyzer, eviction.Config{}, fakeKillPodFunc, nil, kb.recorder, nodeRef, kb.clock)
evictionManager, evictionAdmitHandler := eviction.NewManager(kb.resourceAnalyzer, eviction.Config{}, fakeKillPodFunc, nil, nil, kb.recorder, nodeRef, kb.clock)
kb.evictionManager = evictionManager
kb.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler)