From be7856e3402405fd18cd040b896f59d9ef7180cb Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Mon, 19 Feb 2024 14:30:47 -0500 Subject: [PATCH 1/5] e2e_node: factor out waitForKubeletToStart Signed-off-by: Peter Hunt --- test/e2e_node/util.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/e2e_node/util.go b/test/e2e_node/util.go index 680f0f3889f..2e9d07071d2 100644 --- a/test/e2e_node/util.go +++ b/test/e2e_node/util.go @@ -233,7 +233,10 @@ func updateKubeletConfig(ctx context.Context, f *framework.Framework, kubeletCon ginkgo.By("Starting the kubelet") startKubelet() + waitForKubeletToStart(ctx, f) +} +func waitForKubeletToStart(ctx context.Context, f *framework.Framework) { // wait until the kubelet health check will succeed gomega.Eventually(ctx, func() bool { return kubeletHealthCheck(kubeletHealthCheckURL) From 6cd78bc5fc50c634d104b010f379bcefbec8bb86 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Mon, 30 Oct 2023 11:38:06 -0400 Subject: [PATCH 2/5] node e2e: add image max gc test Signed-off-by: Peter Hunt --- test/e2e_node/image_gc_test.go | 78 ++++++++++++++++++++++++++++++++++ test/e2e_node/util.go | 1 + 2 files changed, 79 insertions(+) create mode 100644 test/e2e_node/image_gc_test.go diff --git a/test/e2e_node/image_gc_test.go b/test/e2e_node/image_gc_test.go new file mode 100644 index 00000000000..753cec63afd --- /dev/null +++ b/test/e2e_node/image_gc_test.go @@ -0,0 +1,78 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2enode + +import ( + "context" + "time" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + internalapi "k8s.io/cri-api/pkg/apis" + runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" + kubefeatures "k8s.io/kubernetes/pkg/features" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" + "k8s.io/kubernetes/test/e2e/framework" + e2epod "k8s.io/kubernetes/test/e2e/framework/pod" + admissionapi "k8s.io/pod-security-admission/api" + + "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" +) + +// var _ = SIGDescribe("ImageGarbageCollect [Serial][NodeFeature:GarbageCollect]", func() { +var _ = SIGDescribe("ImageGarbageCollect", func() { + f := framework.NewDefaultFramework("image-garbage-collect-test") + f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged + var is internalapi.ImageManagerService + ginkgo.BeforeEach(func() { + var err error + _, is, err = getCRIClient() + framework.ExpectNoError(err) + }) + ginkgo.AfterEach(func() { + // framework.ExpectNoError(PrePullAllImages()) + }) + ginkgo.Context("when ImageMaximumGCAge is set", func() { + tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { + initialConfig.ImageMaximumGCAge = metav1.Duration{Duration: time.Duration(time.Second * 30)} + initialConfig.ImageMinimumGCAge = metav1.Duration{Duration: time.Duration(time.Second * 1)} + if initialConfig.FeatureGates == nil { + initialConfig.FeatureGates = make(map[string]bool) + } + initialConfig.FeatureGates[string(kubefeatures.ImageMaximumGCAge)] = true + }) + ginkgo.It("should GC unused images", func(ctx context.Context) { + pod := innocentPod() + e2epod.NewPodClient(f).CreateBatch(ctx, []*v1.Pod{pod}) + + _, err := is.PullImage(context.Background(), &runtimeapi.ImageSpec{Image: agnhostImage}, nil, nil) + framework.ExpectNoError(err) + + allImages, err := is.ListImages(context.Background(), &runtimeapi.ImageFilter{}) + framework.ExpectNoError(err) + + e2epod.NewPodClient(f).DeleteSync(ctx, pod.ObjectMeta.Name, metav1.DeleteOptions{}, e2epod.DefaultPodDeletionTimeout) + + time.Sleep(time.Minute * 2) + + // All images but one is unused, so expect them to be garbage collected + gcdImageList, err := is.ListImages(context.Background(), &runtimeapi.ImageFilter{}) + gomega.Expect(len(allImages)).To(gomega.BeNumerically(">", len(gcdImageList))) + }) + }) +}) diff --git a/test/e2e_node/util.go b/test/e2e_node/util.go index 2e9d07071d2..27fdfba32f0 100644 --- a/test/e2e_node/util.go +++ b/test/e2e_node/util.go @@ -74,6 +74,7 @@ import ( var startServices = flag.Bool("start-services", true, "If true, start local node services") var stopServices = flag.Bool("stop-services", true, "If true, stop local node services after running tests") var busyboxImage = imageutils.GetE2EImage(imageutils.BusyBox) +var agnhostImage = imageutils.GetE2EImage(imageutils.Agnhost) const ( // Kubelet internal cgroup name for node allocatable cgroup. From a8ea93636487d0bb60cac2613078134ccce19176 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Wed, 15 Nov 2023 15:15:54 -0500 Subject: [PATCH 3/5] image gc: don't start until max age has passed since kubelet started Signed-off-by: Peter Hunt --- pkg/kubelet/images/image_gc_manager.go | 14 ++++++++++---- pkg/kubelet/images/image_gc_manager_test.go | 19 +++++++++++-------- pkg/kubelet/kubelet.go | 3 ++- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/pkg/kubelet/images/image_gc_manager.go b/pkg/kubelet/images/image_gc_manager.go index 7e58b012a30..dca79bddca0 100644 --- a/pkg/kubelet/images/image_gc_manager.go +++ b/pkg/kubelet/images/image_gc_manager.go @@ -62,7 +62,7 @@ type StatsProvider interface { type ImageGCManager interface { // Applies the garbage collection policy. Errors include being unable to free // enough space as per the garbage collection policy. - GarbageCollect(ctx context.Context) error + GarbageCollect(ctx context.Context, beganGC time.Time) error // Start async garbage collection of images. Start() @@ -301,7 +301,7 @@ func (im *realImageGCManager) detectImages(ctx context.Context, detectTime time. return imagesInUse, nil } -func (im *realImageGCManager) GarbageCollect(ctx context.Context) error { +func (im *realImageGCManager) GarbageCollect(ctx context.Context, beganGC time.Time) error { ctx, otelSpan := im.tracer.Start(ctx, "Images/GarbageCollect") defer otelSpan.End() @@ -311,7 +311,7 @@ func (im *realImageGCManager) GarbageCollect(ctx context.Context) error { return err } - images, err = im.freeOldImages(ctx, images, freeTime) + images, err = im.freeOldImages(ctx, images, freeTime, beganGC) if err != nil { return err } @@ -362,10 +362,16 @@ func (im *realImageGCManager) GarbageCollect(ctx context.Context) error { return nil } -func (im *realImageGCManager) freeOldImages(ctx context.Context, images []evictionInfo, freeTime time.Time) ([]evictionInfo, error) { +func (im *realImageGCManager) freeOldImages(ctx context.Context, images []evictionInfo, freeTime, beganGC time.Time) ([]evictionInfo, error) { if im.policy.MaxAge == 0 { return images, nil } + + // Wait until the MaxAge has passed since the Kubelet has started, + // or else we risk prematurely garbage collecting images. + if freeTime.Sub(beganGC) <= im.policy.MaxAge { + return images, nil + } var deletionErrors []error remainingImages := make([]evictionInfo, 0) for _, image := range images { diff --git a/pkg/kubelet/images/image_gc_manager_test.go b/pkg/kubelet/images/image_gc_manager_test.go index 4dc5816ade2..cf4f0f54afc 100644 --- a/pkg/kubelet/images/image_gc_manager_test.go +++ b/pkg/kubelet/images/image_gc_manager_test.go @@ -659,7 +659,7 @@ func TestGarbageCollectBelowLowThreshold(t *testing.T) { } mockStatsProvider.EXPECT().ImageFsStats(gomock.Any()).Return(imageStats, imageStats, nil) - assert.NoError(t, manager.GarbageCollect(ctx)) + assert.NoError(t, manager.GarbageCollect(ctx, time.Now())) } func TestGarbageCollectCadvisorFailure(t *testing.T) { @@ -674,7 +674,7 @@ func TestGarbageCollectCadvisorFailure(t *testing.T) { manager, _ := newRealImageGCManager(policy, mockStatsProvider) mockStatsProvider.EXPECT().ImageFsStats(gomock.Any()).Return(&statsapi.FsStats{}, &statsapi.FsStats{}, fmt.Errorf("error")) - assert.NotNil(t, manager.GarbageCollect(ctx)) + assert.NotNil(t, manager.GarbageCollect(ctx, time.Now())) } func TestGarbageCollectBelowSuccess(t *testing.T) { @@ -699,7 +699,7 @@ func TestGarbageCollectBelowSuccess(t *testing.T) { makeImage(0, 450), } - assert.NoError(t, manager.GarbageCollect(ctx)) + assert.NoError(t, manager.GarbageCollect(ctx, time.Now())) } func TestGarbageCollectNotEnoughFreed(t *testing.T) { @@ -723,7 +723,7 @@ func TestGarbageCollectNotEnoughFreed(t *testing.T) { makeImage(0, 50), } - assert.NotNil(t, manager.GarbageCollect(ctx)) + assert.NotNil(t, manager.GarbageCollect(ctx, time.Now())) } func TestGarbageCollectImageNotOldEnough(t *testing.T) { @@ -824,14 +824,15 @@ func TestGarbageCollectImageTooOld(t *testing.T) { // First GC round should not GC remaining image, as it was used too recently. assert := assert.New(t) - images, err = manager.freeOldImages(ctx, images, fakeClock.Now()) + oldStartTime := fakeClock.Now() + images, err = manager.freeOldImages(ctx, images, oldStartTime, oldStartTime) require.NoError(t, err) assert.Len(images, 1) assert.Len(fakeRuntime.ImageList, 2) // move clock by a millisecond past maxAge duration, then 1 image will be garbage collected fakeClock.Step(policy.MaxAge + 1) - images, err = manager.freeOldImages(ctx, images, fakeClock.Now()) + images, err = manager.freeOldImages(ctx, images, fakeClock.Now(), oldStartTime) require.NoError(t, err) assert.Len(images, 0) assert.Len(fakeRuntime.ImageList, 1) @@ -879,8 +880,10 @@ func TestGarbageCollectImageMaxAgeDisabled(t *testing.T) { require.Equal(t, len(images), 1) assert.Len(fakeRuntime.ImageList, 2) + oldStartTime := fakeClock.Now() + // First GC round should not GC remaining image, as it was used too recently. - images, err = manager.freeOldImages(ctx, images, fakeClock.Now()) + images, err = manager.freeOldImages(ctx, images, oldStartTime, oldStartTime) require.NoError(t, err) assert.Len(images, 1) assert.Len(fakeRuntime.ImageList, 2) @@ -888,7 +891,7 @@ func TestGarbageCollectImageMaxAgeDisabled(t *testing.T) { // Move clock by a lot, and the images should continue to not be garbage colleced // See https://stackoverflow.com/questions/25065055/what-is-the-maximum-time-time-in-go fakeClock.SetTime(time.Unix(1<<63-62135596801, 999999999)) - images, err = manager.freeOldImages(ctx, images, fakeClock.Now()) + images, err = manager.freeOldImages(ctx, images, fakeClock.Now(), oldStartTime) require.NoError(t, err) assert.Len(images, 1) assert.Len(fakeRuntime.ImageList, 2) diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index dbf8f554c07..ae5b5890a24 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1440,9 +1440,10 @@ func (kl *Kubelet) StartGarbageCollection() { } prevImageGCFailed := false + beganGC := time.Now() go wait.Until(func() { ctx := context.Background() - if err := kl.imageManager.GarbageCollect(ctx); err != nil { + if err := kl.imageManager.GarbageCollect(ctx, beganGC); err != nil { if prevImageGCFailed { klog.ErrorS(err, "Image garbage collection failed multiple times in a row") // Only create an event for repeated failures From 8dddf6d314066b5f79c34b4c6c0bd41e53a69236 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Wed, 15 Nov 2023 15:16:10 -0500 Subject: [PATCH 4/5] e2e_node: add test for max age after kubelet start Signed-off-by: Peter Hunt --- test/e2e_node/image_gc_test.go | 55 +++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 8 deletions(-) diff --git a/test/e2e_node/image_gc_test.go b/test/e2e_node/image_gc_test.go index 753cec63afd..118c078285e 100644 --- a/test/e2e_node/image_gc_test.go +++ b/test/e2e_node/image_gc_test.go @@ -34,8 +34,14 @@ import ( "github.com/onsi/gomega" ) -// var _ = SIGDescribe("ImageGarbageCollect [Serial][NodeFeature:GarbageCollect]", func() { -var _ = SIGDescribe("ImageGarbageCollect", func() { +const ( + // Kubelet GC's on a frequency of once every 5 minutes + // Add a little leeway to give it time. + checkGCUntil time.Duration = 6 * time.Minute + checkGCFreq time.Duration = 30 * time.Second +) + +var _ = SIGDescribe("ImageGarbageCollect", framework.WithSerial(), framework.WithNodeFeature("GarbageCollect"), func() { f := framework.NewDefaultFramework("image-garbage-collect-test") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged var is internalapi.ImageManagerService @@ -45,11 +51,11 @@ var _ = SIGDescribe("ImageGarbageCollect", func() { framework.ExpectNoError(err) }) ginkgo.AfterEach(func() { - // framework.ExpectNoError(PrePullAllImages()) + framework.ExpectNoError(PrePullAllImages()) }) ginkgo.Context("when ImageMaximumGCAge is set", func() { tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { - initialConfig.ImageMaximumGCAge = metav1.Duration{Duration: time.Duration(time.Second * 30)} + initialConfig.ImageMaximumGCAge = metav1.Duration{Duration: time.Duration(time.Minute * 1)} initialConfig.ImageMinimumGCAge = metav1.Duration{Duration: time.Duration(time.Second * 1)} if initialConfig.FeatureGates == nil { initialConfig.FeatureGates = make(map[string]bool) @@ -68,11 +74,44 @@ var _ = SIGDescribe("ImageGarbageCollect", func() { e2epod.NewPodClient(f).DeleteSync(ctx, pod.ObjectMeta.Name, metav1.DeleteOptions{}, e2epod.DefaultPodDeletionTimeout) - time.Sleep(time.Minute * 2) + // Even though the image gc max timing is less, we are bound by the kubelet's + // ImageGCPeriod, which is hardcoded to 5 minutes. + gomega.Eventually(ctx, func() int { + gcdImageList, err := is.ListImages(context.Background(), &runtimeapi.ImageFilter{}) + framework.ExpectNoError(err) + return len(gcdImageList) + }, checkGCUntil, checkGCFreq).Should(gomega.BeNumerically("<", len(allImages))) + }) + ginkgo.It("should not GC unused images prematurely", func(ctx context.Context) { + pod := innocentPod() + e2epod.NewPodClient(f).CreateBatch(ctx, []*v1.Pod{pod}) - // All images but one is unused, so expect them to be garbage collected - gcdImageList, err := is.ListImages(context.Background(), &runtimeapi.ImageFilter{}) - gomega.Expect(len(allImages)).To(gomega.BeNumerically(">", len(gcdImageList))) + _, err := is.PullImage(context.Background(), &runtimeapi.ImageSpec{Image: agnhostImage}, nil, nil) + framework.ExpectNoError(err) + + allImages, err := is.ListImages(context.Background(), &runtimeapi.ImageFilter{}) + framework.ExpectNoError(err) + + e2epod.NewPodClient(f).DeleteSync(ctx, pod.ObjectMeta.Name, metav1.DeleteOptions{}, e2epod.DefaultPodDeletionTimeout) + + restartKubelet(true) + waitForKubeletToStart(ctx, f) + + // Wait until the maxAge of the image after the kubelet is restarted to ensure it doesn't + // GC too early. + gomega.Consistently(ctx, func() int { + gcdImageList, err := is.ListImages(context.Background(), &runtimeapi.ImageFilter{}) + framework.ExpectNoError(err) + return len(gcdImageList) + }, 50*time.Second, 10*time.Second).Should(gomega.Equal(len(allImages))) + + // Even though the image gc max timing is less, we are bound by the kubelet's + // ImageGCPeriod, which is hardcoded to 5 minutes. + gomega.Eventually(ctx, func() int { + gcdImageList, err := is.ListImages(context.Background(), &runtimeapi.ImageFilter{}) + framework.ExpectNoError(err) + return len(gcdImageList) + }, checkGCUntil, checkGCFreq).Should(gomega.BeNumerically("<", len(allImages))) }) }) }) From ba8fcb5ef6c65e133c0629c3bb6989ff727ead33 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Fri, 16 Feb 2024 12:56:03 -0500 Subject: [PATCH 5/5] kubelet: don't disable gc if max age is specified Signed-off-by: Peter Hunt --- pkg/kubelet/kubelet.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index ae5b5890a24..46034ff756e 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1433,9 +1433,11 @@ func (kl *Kubelet) StartGarbageCollection() { } }, ContainerGCPeriod, wait.NeverStop) - // when the high threshold is set to 100, stub the image GC manager - if kl.kubeletConfiguration.ImageGCHighThresholdPercent == 100 { - klog.V(2).InfoS("ImageGCHighThresholdPercent is set 100, Disable image GC") + // when the high threshold is set to 100, and the max age is 0 (or the max age feature is disabled) + // stub the image GC manager + if kl.kubeletConfiguration.ImageGCHighThresholdPercent == 100 && + (!utilfeature.DefaultFeatureGate.Enabled(features.ImageMaximumGCAge) || kl.kubeletConfiguration.ImageMaximumGCAge.Duration == 0) { + klog.V(2).InfoS("ImageGCHighThresholdPercent is set 100 and ImageMaximumGCAge is 0, Disable image GC") return }