From a8ea93636487d0bb60cac2613078134ccce19176 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Wed, 15 Nov 2023 15:15:54 -0500 Subject: [PATCH] image gc: don't start until max age has passed since kubelet started Signed-off-by: Peter Hunt --- pkg/kubelet/images/image_gc_manager.go | 14 ++++++++++---- pkg/kubelet/images/image_gc_manager_test.go | 19 +++++++++++-------- pkg/kubelet/kubelet.go | 3 ++- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/pkg/kubelet/images/image_gc_manager.go b/pkg/kubelet/images/image_gc_manager.go index 7e58b012a30..dca79bddca0 100644 --- a/pkg/kubelet/images/image_gc_manager.go +++ b/pkg/kubelet/images/image_gc_manager.go @@ -62,7 +62,7 @@ type StatsProvider interface { type ImageGCManager interface { // Applies the garbage collection policy. Errors include being unable to free // enough space as per the garbage collection policy. - GarbageCollect(ctx context.Context) error + GarbageCollect(ctx context.Context, beganGC time.Time) error // Start async garbage collection of images. Start() @@ -301,7 +301,7 @@ func (im *realImageGCManager) detectImages(ctx context.Context, detectTime time. return imagesInUse, nil } -func (im *realImageGCManager) GarbageCollect(ctx context.Context) error { +func (im *realImageGCManager) GarbageCollect(ctx context.Context, beganGC time.Time) error { ctx, otelSpan := im.tracer.Start(ctx, "Images/GarbageCollect") defer otelSpan.End() @@ -311,7 +311,7 @@ func (im *realImageGCManager) GarbageCollect(ctx context.Context) error { return err } - images, err = im.freeOldImages(ctx, images, freeTime) + images, err = im.freeOldImages(ctx, images, freeTime, beganGC) if err != nil { return err } @@ -362,10 +362,16 @@ func (im *realImageGCManager) GarbageCollect(ctx context.Context) error { return nil } -func (im *realImageGCManager) freeOldImages(ctx context.Context, images []evictionInfo, freeTime time.Time) ([]evictionInfo, error) { +func (im *realImageGCManager) freeOldImages(ctx context.Context, images []evictionInfo, freeTime, beganGC time.Time) ([]evictionInfo, error) { if im.policy.MaxAge == 0 { return images, nil } + + // Wait until the MaxAge has passed since the Kubelet has started, + // or else we risk prematurely garbage collecting images. + if freeTime.Sub(beganGC) <= im.policy.MaxAge { + return images, nil + } var deletionErrors []error remainingImages := make([]evictionInfo, 0) for _, image := range images { diff --git a/pkg/kubelet/images/image_gc_manager_test.go b/pkg/kubelet/images/image_gc_manager_test.go index 4dc5816ade2..cf4f0f54afc 100644 --- a/pkg/kubelet/images/image_gc_manager_test.go +++ b/pkg/kubelet/images/image_gc_manager_test.go @@ -659,7 +659,7 @@ func TestGarbageCollectBelowLowThreshold(t *testing.T) { } mockStatsProvider.EXPECT().ImageFsStats(gomock.Any()).Return(imageStats, imageStats, nil) - assert.NoError(t, manager.GarbageCollect(ctx)) + assert.NoError(t, manager.GarbageCollect(ctx, time.Now())) } func TestGarbageCollectCadvisorFailure(t *testing.T) { @@ -674,7 +674,7 @@ func TestGarbageCollectCadvisorFailure(t *testing.T) { manager, _ := newRealImageGCManager(policy, mockStatsProvider) mockStatsProvider.EXPECT().ImageFsStats(gomock.Any()).Return(&statsapi.FsStats{}, &statsapi.FsStats{}, fmt.Errorf("error")) - assert.NotNil(t, manager.GarbageCollect(ctx)) + assert.NotNil(t, manager.GarbageCollect(ctx, time.Now())) } func TestGarbageCollectBelowSuccess(t *testing.T) { @@ -699,7 +699,7 @@ func TestGarbageCollectBelowSuccess(t *testing.T) { makeImage(0, 450), } - assert.NoError(t, manager.GarbageCollect(ctx)) + assert.NoError(t, manager.GarbageCollect(ctx, time.Now())) } func TestGarbageCollectNotEnoughFreed(t *testing.T) { @@ -723,7 +723,7 @@ func TestGarbageCollectNotEnoughFreed(t *testing.T) { makeImage(0, 50), } - assert.NotNil(t, manager.GarbageCollect(ctx)) + assert.NotNil(t, manager.GarbageCollect(ctx, time.Now())) } func TestGarbageCollectImageNotOldEnough(t *testing.T) { @@ -824,14 +824,15 @@ func TestGarbageCollectImageTooOld(t *testing.T) { // First GC round should not GC remaining image, as it was used too recently. assert := assert.New(t) - images, err = manager.freeOldImages(ctx, images, fakeClock.Now()) + oldStartTime := fakeClock.Now() + images, err = manager.freeOldImages(ctx, images, oldStartTime, oldStartTime) require.NoError(t, err) assert.Len(images, 1) assert.Len(fakeRuntime.ImageList, 2) // move clock by a millisecond past maxAge duration, then 1 image will be garbage collected fakeClock.Step(policy.MaxAge + 1) - images, err = manager.freeOldImages(ctx, images, fakeClock.Now()) + images, err = manager.freeOldImages(ctx, images, fakeClock.Now(), oldStartTime) require.NoError(t, err) assert.Len(images, 0) assert.Len(fakeRuntime.ImageList, 1) @@ -879,8 +880,10 @@ func TestGarbageCollectImageMaxAgeDisabled(t *testing.T) { require.Equal(t, len(images), 1) assert.Len(fakeRuntime.ImageList, 2) + oldStartTime := fakeClock.Now() + // First GC round should not GC remaining image, as it was used too recently. - images, err = manager.freeOldImages(ctx, images, fakeClock.Now()) + images, err = manager.freeOldImages(ctx, images, oldStartTime, oldStartTime) require.NoError(t, err) assert.Len(images, 1) assert.Len(fakeRuntime.ImageList, 2) @@ -888,7 +891,7 @@ func TestGarbageCollectImageMaxAgeDisabled(t *testing.T) { // Move clock by a lot, and the images should continue to not be garbage colleced // See https://stackoverflow.com/questions/25065055/what-is-the-maximum-time-time-in-go fakeClock.SetTime(time.Unix(1<<63-62135596801, 999999999)) - images, err = manager.freeOldImages(ctx, images, fakeClock.Now()) + images, err = manager.freeOldImages(ctx, images, fakeClock.Now(), oldStartTime) require.NoError(t, err) assert.Len(images, 1) assert.Len(fakeRuntime.ImageList, 2) diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index dbf8f554c07..ae5b5890a24 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1440,9 +1440,10 @@ func (kl *Kubelet) StartGarbageCollection() { } prevImageGCFailed := false + beganGC := time.Now() go wait.Until(func() { ctx := context.Background() - if err := kl.imageManager.GarbageCollect(ctx); err != nil { + if err := kl.imageManager.GarbageCollect(ctx, beganGC); err != nil { if prevImageGCFailed { klog.ErrorS(err, "Image garbage collection failed multiple times in a row") // Only create an event for repeated failures