trigger aggressive container garbage collection when under disk pressure

David Ashpole 2017-05-22 11:00:22 -07:00
parent 07f85565a2
commit 889afa5e2d
16 changed files with 193 additions and 105 deletions

View File

@ -39,7 +39,15 @@ type ContainerGCPolicy struct {
// Implementation is thread-compatible.
type ContainerGC interface {
// Garbage collect containers.
GarbageCollect(allSourcesReady bool) error
GarbageCollect() error
// Deletes all unused containers, including containers belonging to pods that are terminated but not deleted
DeleteAllUnusedContainers() error
}
// SourcesReadyProvider knows how to determine if configuration sources are ready
type SourcesReadyProvider interface {
// AllReady returns true if the currently configured sources have all been seen.
AllReady() bool
}
// TODO(vmarmol): Preferentially remove pod infra containers.
@ -49,20 +57,28 @@ type realContainerGC struct {
// Policy for garbage collection.
policy ContainerGCPolicy
// sourcesReadyProvider provides the readiness of kubelet configuration sources.
sourcesReadyProvider SourcesReadyProvider
}
// New ContainerGC instance with the specified policy.
func NewContainerGC(runtime Runtime, policy ContainerGCPolicy) (ContainerGC, error) {
func NewContainerGC(runtime Runtime, policy ContainerGCPolicy, sourcesReadyProvider SourcesReadyProvider) (ContainerGC, error) {
if policy.MinAge < 0 {
return nil, fmt.Errorf("invalid minimum garbage collection age: %v", policy.MinAge)
}
return &realContainerGC{
runtime: runtime,
policy: policy,
runtime: runtime,
policy: policy,
sourcesReadyProvider: sourcesReadyProvider,
}, nil
}
func (cgc *realContainerGC) GarbageCollect(allSourcesReady bool) error {
return cgc.runtime.GarbageCollect(cgc.policy, allSourcesReady)
func (cgc *realContainerGC) GarbageCollect() error {
return cgc.runtime.GarbageCollect(cgc.policy, cgc.sourcesReadyProvider.AllReady(), false)
}
func (cgc *realContainerGC) DeleteAllUnusedContainers() error {
return cgc.runtime.GarbageCollect(cgc.policy, cgc.sourcesReadyProvider.AllReady(), true)
}
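The reworked interface separates the routine collection pass from the aggressive pass used under disk pressure: GarbageCollect() now reads source readiness from the injected provider, while DeleteAllUnusedContainers() additionally evicts containers of terminated-but-not-deleted pods. A minimal usage sketch, assuming it sits alongside the kubecontainer types above with "time" imported; the alwaysReady provider and exampleContainerGC helper are illustrative, not part of this change:

type alwaysReady struct{}

// AllReady reports that every configuration source has been seen.
func (alwaysReady) AllReady() bool { return true }

func exampleContainerGC(runtime Runtime) error {
	policy := ContainerGCPolicy{MinAge: time.Minute, MaxPerPodContainer: 1, MaxContainers: -1}
	gc, err := NewContainerGC(runtime, policy, alwaysReady{})
	if err != nil {
		return err
	}
	// Periodic path: only containers of deleted pods are eligible.
	if err := gc.GarbageCollect(); err != nil {
		return err
	}
	// Disk-pressure path: also evict containers of terminated but not deleted pods.
	return gc.DeleteAllUnusedContainers()
}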

View File

@ -82,8 +82,10 @@ type Runtime interface {
// complete list of pods from all available sources (e.g., apiserver, http,
// file). In this case, the garbage collector should refrain from aggressive
// behavior such as removing all containers of unrecognized pods (yet).
// If evictNonDeletedPods is set to true, containers and sandboxes belonging to pods
// that are terminated, but not deleted will be evicted. Otherwise, only deleted pods will be GC'd.
// TODO: Revisit this method and make it cleaner.
GarbageCollect(gcPolicy ContainerGCPolicy, allSourcesReady bool) error
GarbageCollect(gcPolicy ContainerGCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error
// Syncs the running pod into the desired pod.
SyncPod(pod *v1.Pod, apiPodStatus v1.PodStatus, podStatus *PodStatus, pullSecrets []v1.Secret, backOff *flowcontrol.Backoff) PodSyncResult
// KillPod kills all the containers of a pod. Pod may be nil, running pod must not be.

View File

@ -431,7 +431,7 @@ func (f *FakeRuntime) GetPodContainerID(pod *Pod) (ContainerID, error) {
return ContainerID{}, f.Err
}
func (f *FakeRuntime) GarbageCollect(gcPolicy ContainerGCPolicy, ready bool) error {
func (f *FakeRuntime) GarbageCollect(gcPolicy ContainerGCPolicy, ready bool, evictNonDeletedPods bool) error {
f.Lock()
defer f.Unlock()

View File

@ -140,8 +140,8 @@ func (r *Mock) GetPodContainerID(pod *Pod) (ContainerID, error) {
return ContainerID{}, args.Error(0)
}
func (r *Mock) GarbageCollect(gcPolicy ContainerGCPolicy, ready bool) error {
args := r.Called(gcPolicy, ready)
func (r *Mock) GarbageCollect(gcPolicy ContainerGCPolicy, ready bool, evictNonDeletedPods bool) error {
args := r.Called(gcPolicy, ready, evictNonDeletedPods)
return args.Error(0)
}

View File

@ -58,6 +58,8 @@ type managerImpl struct {
killPodFunc KillPodFunc
// the interface that knows how to do image gc
imageGC ImageGC
// the interface that knows how to do container gc
containerGC ContainerGC
// protects access to internal state
sync.RWMutex
// node conditions are the set of conditions present
@ -95,6 +97,7 @@ func NewManager(
config Config,
killPodFunc KillPodFunc,
imageGC ImageGC,
containerGC ContainerGC,
recorder record.EventRecorder,
nodeRef *clientv1.ObjectReference,
clock clock.Clock) (Manager, lifecycle.PodAdmitHandler) {
@ -102,6 +105,7 @@ func NewManager(
clock: clock,
killPodFunc: killPodFunc,
imageGC: imageGC,
containerGC: containerGC,
config: config,
recorder: recorder,
summaryProvider: summaryProvider,
@ -223,8 +227,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
}
m.dedicatedImageFs = &hasImageFs
m.resourceToRankFunc = buildResourceToRankFunc(hasImageFs)
m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasImageFs)
m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs)
}
activePods := podFunc()

View File

@ -77,17 +77,24 @@ func (m *mockNodeProvider) GetNode() (*v1.Node, error) {
return &m.node, nil
}
// mockImageGC is used to simulate invoking image garbage collection.
type mockImageGC struct {
err error
freed int64
invoked bool
// mockDiskGC is used to simulate invoking image and container garbage collection.
type mockDiskGC struct {
err error
imageBytesFreed int64
imageGCInvoked bool
containerGCInvoked bool
}
// DeleteUnusedImages returns the mocked values.
func (m *mockImageGC) DeleteUnusedImages() (int64, error) {
m.invoked = true
return m.freed, m.err
func (m *mockDiskGC) DeleteUnusedImages() (int64, error) {
m.imageGCInvoked = true
return m.imageBytesFreed, m.err
}
// DeleteAllUnusedContainers returns the mocked value.
func (m *mockDiskGC) DeleteAllUnusedContainers() error {
m.containerGCInvoked = true
return m.err
}
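Because the single mock now implements both DeleteUnusedImages and DeleteAllUnusedContainers, one value can back the manager's imageGC and containerGC fields at the same time, which is what the test fixtures below rely on. An illustrative compile-time check (not part of the diff):

var (
	_ ImageGC     = &mockDiskGC{}
	_ ContainerGC = &mockDiskGC{}
)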
func makePodWithMemoryStats(name string, requests v1.ResourceList, limits v1.ResourceList, memoryWorkingSet string) (*v1.Pod, statsapi.PodStats) {
@ -194,7 +201,7 @@ func TestMemoryPressure(t *testing.T) {
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
imageGC := &mockDiskGC{imageBytesFreed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@ -412,7 +419,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
diskGC := &mockDiskGC{imageBytesFreed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@ -440,7 +447,8 @@ func TestDiskPressureNodeFs(t *testing.T) {
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
@ -610,7 +618,7 @@ func TestMinReclaim(t *testing.T) {
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
diskGC := &mockDiskGC{imageBytesFreed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@ -633,7 +641,8 @@ func TestMinReclaim(t *testing.T) {
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
@ -750,7 +759,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGcFree := resource.MustParse("700Mi")
imageGC := &mockImageGC{freed: imageGcFree.Value(), err: nil}
diskGC := &mockDiskGC{imageBytesFreed: imageGcFree.Value(), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@ -773,7 +782,8 @@ func TestNodeReclaimFuncs(t *testing.T) {
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
@ -801,7 +811,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
}
// verify image gc was invoked
if !imageGC.invoked {
if !diskGC.imageGCInvoked || !diskGC.containerGCInvoked {
t.Errorf("Manager should have invoked image gc")
}
@ -811,7 +821,8 @@ func TestNodeReclaimFuncs(t *testing.T) {
}
// reset state
imageGC.invoked = false
diskGC.imageGCInvoked = false
diskGC.containerGCInvoked = false
// remove disk pressure
fakeClock.Step(20 * time.Minute)
@ -833,8 +844,8 @@ func TestNodeReclaimFuncs(t *testing.T) {
t.Errorf("Manager should report disk pressure")
}
// ensure image gc was invoked
if !imageGC.invoked {
// ensure disk gc was invoked
if !diskGC.imageGCInvoked || !diskGC.containerGCInvoked {
t.Errorf("Manager should have invoked image gc")
}
@ -850,8 +861,9 @@ func TestNodeReclaimFuncs(t *testing.T) {
// reduce disk pressure
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
imageGC.invoked = false // reset state
podKiller.pod = nil // reset state
diskGC.imageGCInvoked = false // reset state
diskGC.containerGCInvoked = false // reset state
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have disk pressure (because transition period not yet met)
@ -860,7 +872,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
}
// no image gc should have occurred
if imageGC.invoked {
if diskGC.imageGCInvoked || diskGC.containerGCInvoked {
t.Errorf("Manager chose to perform image gc when it was not neeed")
}
@ -872,8 +884,9 @@ func TestNodeReclaimFuncs(t *testing.T) {
// move the clock past transition period to ensure that we stop reporting pressure
fakeClock.Step(5 * time.Minute)
summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
imageGC.invoked = false // reset state
podKiller.pod = nil // reset state
diskGC.imageGCInvoked = false // reset state
diskGC.containerGCInvoked = false // reset state
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have disk pressure (because transition period met)
@ -882,7 +895,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
}
// no image gc should have occurred
if imageGC.invoked {
if diskGC.imageGCInvoked || diskGC.containerGCInvoked {
t.Errorf("Manager chose to perform image gc when it was not neeed")
}
@ -943,7 +956,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
diskGC := &mockDiskGC{imageBytesFreed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@ -971,7 +984,8 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
@ -1144,7 +1158,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
diskGC := &mockDiskGC{imageBytesFreed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{
Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: "",
}
@ -1174,7 +1188,8 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
@ -1276,7 +1291,7 @@ func TestAllocatableMemoryPressure(t *testing.T) {
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
diskGC := &mockDiskGC{imageBytesFreed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@ -1296,7 +1311,8 @@ func TestAllocatableMemoryPressure(t *testing.T) {
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,

View File

@ -1019,32 +1019,34 @@ func PodIsEvicted(podStatus v1.PodStatus) bool {
}
// buildResourceToNodeReclaimFuncs returns reclaim functions associated with resources.
func buildResourceToNodeReclaimFuncs(imageGC ImageGC, withImageFs bool) map[v1.ResourceName]nodeReclaimFuncs {
func buildResourceToNodeReclaimFuncs(imageGC ImageGC, containerGC ContainerGC, withImageFs bool) map[v1.ResourceName]nodeReclaimFuncs {
resourceToReclaimFunc := map[v1.ResourceName]nodeReclaimFuncs{}
// usage of an imagefs is optional
if withImageFs {
// with an imagefs, nodefs pressure should just delete logs
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs()}
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs()}
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{}
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{}
// with an imagefs, imagefs pressure should delete unused images
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteImages(imageGC, true)}
resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{deleteImages(imageGC, false)}
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteTerminatedContainers(containerGC), deleteImages(imageGC, true)}
resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{deleteTerminatedContainers(containerGC), deleteImages(imageGC, false)}
} else {
// without an imagefs, nodefs pressure should delete logs, and unused images
// since imagefs and nodefs share a common device, they share common reclaim functions
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, true)}
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, false)}
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, true)}
resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, false)}
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteTerminatedContainers(containerGC), deleteImages(imageGC, true)}
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteTerminatedContainers(containerGC), deleteImages(imageGC, false)}
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteTerminatedContainers(containerGC), deleteImages(imageGC, true)}
resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{deleteTerminatedContainers(containerGC), deleteImages(imageGC, false)}
}
return resourceToReclaimFunc
}
// deleteLogs will delete logs to free up disk pressure.
func deleteLogs() nodeReclaimFunc {
// deleteTerminatedContainers will delete all terminated containers to relieve disk pressure.
func deleteTerminatedContainers(containerGC ContainerGC) nodeReclaimFunc {
return func() (*resource.Quantity, error) {
// TODO: not yet supported.
return resource.NewQuantity(int64(0), resource.BinarySI), nil
glog.Infof("eviction manager: attempting to delete unused containers")
err := containerGC.DeleteAllUnusedContainers()
// Calculating bytes freed is not yet supported.
return resource.NewQuantity(int64(0), resource.BinarySI), err
}
}
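The ordering of the reclaim functions matters: deleteTerminatedContainers runs before deleteImages, so dead containers are removed first and the following image pass can presumably reclaim the images those containers were still referencing. A rough sketch of how a caller might walk such a list, stopping at the first error; runNodeReclaimFuncs is illustrative and not the manager's actual method:

func runNodeReclaimFuncs(funcs nodeReclaimFuncs) (*resource.Quantity, error) {
	reclaimed := resource.NewQuantity(int64(0), resource.BinarySI)
	for _, reclaim := range funcs {
		freed, err := reclaim()
		if err != nil {
			return reclaimed, err
		}
		reclaimed.Add(*freed)
	}
	return reclaimed, nil
}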

View File

@ -81,6 +81,13 @@ type ImageGC interface {
DeleteUnusedImages() (int64, error)
}
// ContainerGC is responsible for performing garbage collection of unused containers.
type ContainerGC interface {
// DeleteAllUnusedContainers deletes all unused containers, even those that belong to pods that are terminated, but not deleted.
// It returns an error if it is unsuccessful.
DeleteAllUnusedContainers() error
}
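This mirrors the kubecontainer change above: the runtime-level ContainerGC already exposes DeleteAllUnusedContainers, so it satisfies this narrower eviction-side interface directly, which is why kubelet.go can pass klet.containerGC straight into eviction.NewManager. An illustrative compile-time check, assuming the usual kubecontainer import alias (not part of the diff):

var _ ContainerGC = (kubecontainer.ContainerGC)(nil)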
// KillPodFunc kills a pod.
// The pod status is updated, and then it is killed with the specified grace period.
// This function must block until either the pod is killed or an error is encountered.

View File

@ -671,7 +671,7 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
klet.updatePodCIDR(kubeCfg.PodCIDR)
// setup containerGC
containerGC, err := kubecontainer.NewContainerGC(klet.containerRuntime, containerGCPolicy)
containerGC, err := kubecontainer.NewContainerGC(klet.containerRuntime, containerGCPolicy, klet.sourcesReady)
if err != nil {
return nil, err
}
@ -761,7 +761,7 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
klet.setNodeStatusFuncs = klet.defaultNodeStatusFuncs()
// setup eviction manager
evictionManager, evictionAdmitHandler := eviction.NewManager(klet.resourceAnalyzer, evictionConfig, killPodNow(klet.podWorkers, kubeDeps.Recorder), klet.imageManager, kubeDeps.Recorder, nodeRef, klet.clock)
evictionManager, evictionAdmitHandler := eviction.NewManager(klet.resourceAnalyzer, evictionConfig, killPodNow(klet.podWorkers, kubeDeps.Recorder), klet.imageManager, klet.containerGC, kubeDeps.Recorder, nodeRef, klet.clock)
klet.evictionManager = evictionManager
klet.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler)
@ -1184,7 +1184,7 @@ func (kl *Kubelet) setupDataDirs() error {
func (kl *Kubelet) StartGarbageCollection() {
loggedContainerGCFailure := false
go wait.Until(func() {
if err := kl.containerGC.GarbageCollect(kl.sourcesReady.AllReady()); err != nil {
if err := kl.containerGC.GarbageCollect(); err != nil {
glog.Errorf("Container garbage collection failed: %v", err)
kl.recorder.Eventf(kl.nodeRef, v1.EventTypeWarning, events.ContainerGCFailed, err.Error())
loggedContainerGCFailure = true
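With source readiness captured at construction time, the GC loop no longer threads the flag through each call. A condensed sketch of the surrounding loop shape, assuming the existing ContainerGCPeriod constant and omitting the success branch (paraphrased, not quoted from the file):

go wait.Until(func() {
	if err := kl.containerGC.GarbageCollect(); err != nil {
		glog.Errorf("Container garbage collection failed: %v", err)
	}
}, ContainerGCPeriod, wait.NeverStop)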

View File

@ -270,7 +270,7 @@ func newTestKubeletWithImageList(
Namespace: "",
}
// setup eviction manager
evictionManager, evictionAdmitHandler := eviction.NewManager(kubelet.resourceAnalyzer, eviction.Config{}, killPodNow(kubelet.podWorkers, fakeRecorder), kubelet.imageManager, fakeRecorder, nodeRef, kubelet.clock)
evictionManager, evictionAdmitHandler := eviction.NewManager(kubelet.resourceAnalyzer, eviction.Config{}, killPodNow(kubelet.podWorkers, fakeRecorder), kubelet.imageManager, kubelet.containerGC, fakeRecorder, nodeRef, kubelet.clock)
kubelet.evictionManager = evictionManager
kubelet.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler)

View File

@ -209,7 +209,7 @@ func (cgc *containerGC) evictableContainers(minAge time.Duration) (containersByE
}
// evict all containers that are evictable
func (cgc *containerGC) evictContainers(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool) error {
func (cgc *containerGC) evictContainers(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
// Separate containers by evict units.
evictUnits, err := cgc.evictableContainers(gcPolicy.MinAge)
if err != nil {
@ -219,7 +219,7 @@ func (cgc *containerGC) evictContainers(gcPolicy kubecontainer.ContainerGCPolicy
// Remove deleted pod containers if all sources are ready.
if allSourcesReady {
for key, unit := range evictUnits {
if cgc.isPodDeleted(key.uid) {
if cgc.isPodDeleted(key.uid) || evictNonDeletedPods {
cgc.removeOldestN(unit, len(unit)) // Remove all.
delete(evictUnits, key)
}
@ -261,7 +261,7 @@ func (cgc *containerGC) evictContainers(gcPolicy kubecontainer.ContainerGCPolicy
// 2. contains no containers.
// 3. belongs to a non-existent (i.e., already removed) pod, or is not the
// most recently created sandbox for the pod.
func (cgc *containerGC) evictSandboxes() error {
func (cgc *containerGC) evictSandboxes(evictNonDeletedPods bool) error {
containers, err := cgc.manager.getKubeletContainers(true)
if err != nil {
return err
@ -307,7 +307,7 @@ func (cgc *containerGC) evictSandboxes() error {
}
for podUID, sandboxes := range sandboxesByPod {
if cgc.isPodDeleted(podUID) {
if cgc.isPodDeleted(podUID) || evictNonDeletedPods {
// Remove all evictable sandboxes if the pod has been removed.
// Note that the latest dead sandbox is also removed if there is
// already an active one.
@ -367,14 +367,14 @@ func (cgc *containerGC) evictPodLogsDirectories(allSourcesReady bool) error {
// * removes oldest dead containers by enforcing gcPolicy.MaxContainers.
// * gets evictable sandboxes which are not ready and contain no containers.
// * removes evictable sandboxes.
func (cgc *containerGC) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool) error {
func (cgc *containerGC) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
// Remove evictable containers
if err := cgc.evictContainers(gcPolicy, allSourcesReady); err != nil {
if err := cgc.evictContainers(gcPolicy, allSourcesReady, evictNonDeletedPods); err != nil {
return err
}
// Remove sandboxes with zero containers
if err := cgc.evictSandboxes(); err != nil {
if err := cgc.evictSandboxes(evictNonDeletedPods); err != nil {
return err
}

View File

@ -62,27 +62,30 @@ func TestSandboxGC(t *testing.T) {
}
for c, test := range []struct {
description string // description of the test case
sandboxes []sandboxTemplate // templates of sandboxes
containers []containerTemplate // templates of containers
minAge time.Duration // sandboxMinGCAge
remain []int // template indexes of remaining sandboxes
description string // description of the test case
sandboxes []sandboxTemplate // templates of sandboxes
containers []containerTemplate // templates of containers
minAge time.Duration // sandboxMinGCAge
remain []int // template indexes of remaining sandboxes
evictNonDeletedPods bool
}{
{
description: "notready sandboxes without containers for deleted pods should be garbage collected.",
sandboxes: []sandboxTemplate{
makeGCSandbox(pods[2], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, false, 0),
},
containers: []containerTemplate{},
remain: []int{},
containers: []containerTemplate{},
remain: []int{},
evictNonDeletedPods: false,
},
{
description: "ready sandboxes without containers for deleted pods should not be garbage collected.",
sandboxes: []sandboxTemplate{
makeGCSandbox(pods[2], 0, runtimeapi.PodSandboxState_SANDBOX_READY, false, 0),
},
containers: []containerTemplate{},
remain: []int{0},
containers: []containerTemplate{},
remain: []int{0},
evictNonDeletedPods: false,
},
{
description: "sandboxes for existing pods should not be garbage collected.",
@ -90,8 +93,19 @@ func TestSandboxGC(t *testing.T) {
makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_READY, true, 0),
makeGCSandbox(pods[1], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, 0),
},
containers: []containerTemplate{},
remain: []int{0, 1},
containers: []containerTemplate{},
remain: []int{0, 1},
evictNonDeletedPods: false,
},
{
description: "non-running sandboxes for existing pods should be garbage collected if evictNonDeletedPods is set.",
sandboxes: []sandboxTemplate{
makeGCSandbox(pods[0], 0, runtimeapi.PodSandboxState_SANDBOX_READY, true, 0),
makeGCSandbox(pods[1], 0, runtimeapi.PodSandboxState_SANDBOX_NOTREADY, true, 0),
},
containers: []containerTemplate{},
remain: []int{0},
evictNonDeletedPods: true,
},
{
description: "sandbox with containers should not be garbage collected.",
@ -101,7 +115,8 @@ func TestSandboxGC(t *testing.T) {
containers: []containerTemplate{
{pod: pods[0], container: &pods[0].Spec.Containers[0], state: runtimeapi.ContainerState_CONTAINER_EXITED},
},
remain: []int{0},
remain: []int{0},
evictNonDeletedPods: false,
},
{
description: "multiple sandboxes should be handled properly.",
@ -120,7 +135,8 @@ func TestSandboxGC(t *testing.T) {
containers: []containerTemplate{
{pod: pods[1], container: &pods[1].Spec.Containers[0], sandboxAttempt: 1, state: runtimeapi.ContainerState_CONTAINER_EXITED},
},
remain: []int{0, 2},
remain: []int{0, 2},
evictNonDeletedPods: false,
},
} {
t.Logf("TestCase #%d: %+v", c, test)
@ -129,7 +145,7 @@ func TestSandboxGC(t *testing.T) {
fakeRuntime.SetFakeSandboxes(fakeSandboxes)
fakeRuntime.SetFakeContainers(fakeContainers)
err := m.containerGC.evictSandboxes()
err := m.containerGC.evictSandboxes(test.evictNonDeletedPods)
assert.NoError(t, err)
realRemain, err := fakeRuntime.ListPodSandbox(nil)
assert.NoError(t, err)
@ -165,18 +181,20 @@ func TestContainerGC(t *testing.T) {
defaultGCPolicy := kubecontainer.ContainerGCPolicy{MinAge: time.Hour, MaxPerPodContainer: 2, MaxContainers: 6}
for c, test := range []struct {
description string // description of the test case
containers []containerTemplate // templates of containers
policy *kubecontainer.ContainerGCPolicy // container gc policy
remain []int // template indexes of remaining containers
description string // description of the test case
containers []containerTemplate // templates of containers
policy *kubecontainer.ContainerGCPolicy // container gc policy
remain []int // template indexes of remaining containers
evictNonDeletedPods bool
}{
{
description: "all containers should be removed when max container limit is 0",
containers: []containerTemplate{
makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
policy: &kubecontainer.ContainerGCPolicy{MinAge: time.Minute, MaxPerPodContainer: 1, MaxContainers: 0},
remain: []int{},
policy: &kubecontainer.ContainerGCPolicy{MinAge: time.Minute, MaxPerPodContainer: 1, MaxContainers: 0},
remain: []int{},
evictNonDeletedPods: false,
},
{
description: "max containers should be complied when no max per pod container limit is set",
@ -187,8 +205,9 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
policy: &kubecontainer.ContainerGCPolicy{MinAge: time.Minute, MaxPerPodContainer: -1, MaxContainers: 4},
remain: []int{0, 1, 2, 3},
policy: &kubecontainer.ContainerGCPolicy{MinAge: time.Minute, MaxPerPodContainer: -1, MaxContainers: 4},
remain: []int{0, 1, 2, 3},
evictNonDeletedPods: false,
},
{
description: "no containers should be removed if both max container and per pod container limits are not set",
@ -197,8 +216,9 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
policy: &kubecontainer.ContainerGCPolicy{MinAge: time.Minute, MaxPerPodContainer: -1, MaxContainers: -1},
remain: []int{0, 1, 2},
policy: &kubecontainer.ContainerGCPolicy{MinAge: time.Minute, MaxPerPodContainer: -1, MaxContainers: -1},
remain: []int{0, 1, 2},
evictNonDeletedPods: false,
},
{
description: "recently started containers should not be removed",
@ -207,7 +227,8 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo", "bar", 1, time.Now().UnixNano(), runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo", "bar", 0, time.Now().UnixNano(), runtimeapi.ContainerState_CONTAINER_EXITED),
},
remain: []int{0, 1, 2},
remain: []int{0, 1, 2},
evictNonDeletedPods: false,
},
{
description: "oldest containers should be removed when per pod container limit exceeded",
@ -216,7 +237,8 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
remain: []int{0, 1},
remain: []int{0, 1},
evictNonDeletedPods: false,
},
{
description: "running containers should not be removed",
@ -225,7 +247,8 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_RUNNING),
},
remain: []int{0, 1, 2},
remain: []int{0, 1, 2},
evictNonDeletedPods: false,
},
{
description: "no containers should be removed when limits are not exceeded",
@ -233,7 +256,8 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
remain: []int{0, 1},
remain: []int{0, 1},
evictNonDeletedPods: false,
},
{
description: "max container count should apply per (UID, container) pair",
@ -248,7 +272,8 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo2", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo2", "bar", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
remain: []int{0, 1, 3, 4, 6, 7},
remain: []int{0, 1, 3, 4, 6, 7},
evictNonDeletedPods: false,
},
{
description: "max limit should apply and try to keep from every pod",
@ -264,7 +289,8 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo4", "bar4", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo4", "bar4", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
remain: []int{0, 2, 4, 6, 8},
remain: []int{0, 2, 4, 6, 8},
evictNonDeletedPods: false,
},
{
description: "oldest pods should be removed if limit exceeded",
@ -280,7 +306,21 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("foo6", "bar6", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo7", "bar7", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
},
remain: []int{0, 2, 4, 6, 8, 9},
remain: []int{0, 2, 4, 6, 8, 9},
evictNonDeletedPods: false,
},
{
description: "all non-running containers should be removed when evictNonDeletedPods is set",
containers: []containerTemplate{
makeGCContainer("foo", "bar", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo", "bar", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo1", "bar1", 2, 2, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo1", "bar1", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo2", "bar2", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("foo3", "bar3", 0, 0, runtimeapi.ContainerState_CONTAINER_RUNNING),
},
remain: []int{5},
evictNonDeletedPods: true,
},
{
description: "containers for deleted pods should be removed",
@ -292,7 +332,8 @@ func TestContainerGC(t *testing.T) {
makeGCContainer("deleted", "bar1", 1, 1, runtimeapi.ContainerState_CONTAINER_EXITED),
makeGCContainer("deleted", "bar1", 0, 0, runtimeapi.ContainerState_CONTAINER_EXITED),
},
remain: []int{0, 1, 2},
remain: []int{0, 1, 2},
evictNonDeletedPods: false,
},
} {
t.Logf("TestCase #%d: %+v", c, test)
@ -302,7 +343,7 @@ func TestContainerGC(t *testing.T) {
if test.policy == nil {
test.policy = &defaultGCPolicy
}
err := m.containerGC.evictContainers(*test.policy, true)
err := m.containerGC.evictContainers(*test.policy, true, test.evictNonDeletedPods)
assert.NoError(t, err)
realRemain, err := fakeRuntime.ListContainers(nil)
assert.NoError(t, err)

View File

@ -877,8 +877,8 @@ func (m *kubeGenericRuntimeManager) GetNetNS(_ kubecontainer.ContainerID) (strin
}
// GarbageCollect removes dead containers using the specified container gc policy.
func (m *kubeGenericRuntimeManager) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool) error {
return m.containerGC.GarbageCollect(gcPolicy, allSourcesReady)
func (m *kubeGenericRuntimeManager) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
return m.containerGC.GarbageCollect(gcPolicy, allSourcesReady, evictNonDeletedPods)
}
// GetPodContainerID gets pod sandbox ID

View File

@ -1955,7 +1955,7 @@ func (r *Runtime) getPodSystemdServiceFiles() ([]os.FileInfo, error) {
// - If the number of containers exceeds gcPolicy.MaxContainers,
// then containers older than gcPolicy.MinAge will
// be removed.
func (r *Runtime) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool) error {
func (r *Runtime) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool, _ bool) error {
var errlist []error
var totalInactiveContainers int
var inactivePods []*rktapi.Pod

View File

@ -1831,7 +1831,8 @@ func TestGarbageCollect(t *testing.T) {
}
allSourcesReady := true
err := rkt.GarbageCollect(tt.gcPolicy, allSourcesReady)
evictNonDeletedPods := false
err := rkt.GarbageCollect(tt.gcPolicy, allSourcesReady, evictNonDeletedPods)
assert.NoError(t, err, testCaseHint)
sort.Sort(sortedStringList(tt.expectedCommands))

View File

@ -126,7 +126,7 @@ func TestRunOnce(t *testing.T) {
fakeKillPodFunc := func(pod *v1.Pod, podStatus v1.PodStatus, gracePeriodOverride *int64) error {
return nil
}
evictionManager, evictionAdmitHandler := eviction.NewManager(kb.resourceAnalyzer, eviction.Config{}, fakeKillPodFunc, nil, kb.recorder, nodeRef, kb.clock)
evictionManager, evictionAdmitHandler := eviction.NewManager(kb.resourceAnalyzer, eviction.Config{}, fakeKillPodFunc, nil, nil, kb.recorder, nodeRef, kb.clock)
kb.evictionManager = evictionManager
kb.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler)