From f913b7afe88352d4589d78bf4ed38db45a9cbc68 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 5 Nov 2024 01:59:38 +0000 Subject: [PATCH 01/18] Adding imagepull backoff test Signed-off-by: Laura Lorenz --- test/e2e_node/criproxy_test.go | 77 ++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/test/e2e_node/criproxy_test.go b/test/e2e_node/criproxy_test.go index 949a2379cbc..0e32e9f94f2 100644 --- a/test/e2e_node/criproxy_test.go +++ b/test/e2e_node/criproxy_test.go @@ -82,6 +82,41 @@ var _ = SIGDescribe(feature.CriProxy, framework.WithSerial(), func() { isExpectedErrMsg := strings.Contains(eventMsg, expectedErr.Error()) gomega.Expect(isExpectedErrMsg).To(gomega.BeTrueBecause("we injected an exception into the PullImage interface of the cri proxy")) }) + ginkgo.It("Image pull retry backs off on error.", func(ctx context.Context) { + expectedErr := fmt.Errorf("PullImage failed") + err := addCRIProxyInjector(func(apiName string) error { + if apiName == criproxy.PullImage { + return expectedErr + } + return nil + }) + framework.ExpectNoError(err) + + pod := e2epod.NewPodClient(f).Create(ctx, newPullImageAlwaysPod()) + podErr := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "ImagePullBackOff", 1*time.Minute, func(pod *v1.Pod) (bool, error) { + if len(pod.Status.ContainerStatuses) > 0 && pod.Status.Reason == images.ErrImagePullBackOff.Error() { + return true, nil + } + return false, nil + }) + gomega.Expect(podErr).To(gomega.HaveOccurred()) + + eventMsg, err := getFailedToPullImageMsg(ctx, f, pod.Name) + framework.ExpectNoError(err) + isExpectedErrMsg := strings.Contains(eventMsg, expectedErr.Error()) + gomega.Expect(isExpectedErrMsg).To(gomega.BeTrueBecause("we injected an exception into the PullImage interface of the cri proxy")) + + // remove error so after backoff we will succeed + resetCRIProxyInjector() + + podErr = e2epod.WaitForPodRunningInNamespace(ctx, f.ClientSet, pod) + framework.ExpectNoError(podErr) + + durations, err := getImageBackOffDurations(ctx, f, pod.Name) + framework.ExpectNoError(err) + gomega.Expect(durations[0]).Should(gomega.BeNumerically("~", time.Duration(10*time.Second), time.Duration(2*time.Second))) + + }) }) ginkgo.Context("Inject a pull image timeout exception into the CriProxy", func() { @@ -133,6 +168,48 @@ func getFailedToPullImageMsg(ctx context.Context, f *framework.Framework, podNam return "", fmt.Errorf("failed to find FailedToPullImage event for pod: %s", podName) } +func getImageBackOffDurations(ctx context.Context, f *framework.Framework, podName string) ([]time.Duration, error) { + events, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, err + } + + var backoffs []time.Duration + + type BackOffRecord struct { + podName string + initialEventTime time.Time + backoffEventTimes []time.Time + duration time.Duration + } + + records := make(map[int]*BackOffRecord) + var backoffCount int + var pullTime time.Time + var r *BackOffRecord + for _, event := range events.Items { + if event.InvolvedObject.Name == podName { + switch event.Reason { + case kubeletevents.PullingImage: + if !pullTime.IsZero() { + if event.FirstTimestamp.Time.After(pullTime) { + r = records[backoffCount] + r.duration = r.initialEventTime.Sub(r.backoffEventTimes[len(r.backoffEventTimes)-1]) + backoffs = append(backoffs, r.duration) + backoffCount++ + } + } + pullTime = event.FirstTimestamp.Time + records[backoffCount].initialEventTime = pullTime + case 
kubeletevents.BackOffPullImage: + current := records[backoffCount].backoffEventTimes + current = append(current, event.FirstTimestamp.Time) + } + } + } + return backoffs, nil +} + func getPodImagePullDuration(ctx context.Context, f *framework.Framework, podName string) (time.Duration, error) { events, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{}) if err != nil { From 6337a28a682126c4e8b243c40face3821341c787 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 5 Nov 2024 02:36:30 +0000 Subject: [PATCH 02/18] Organize into its own context Signed-off-by: Laura Lorenz --- test/e2e_node/criproxy_test.go | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/test/e2e_node/criproxy_test.go b/test/e2e_node/criproxy_test.go index 0e32e9f94f2..21bd79bc2f8 100644 --- a/test/e2e_node/criproxy_test.go +++ b/test/e2e_node/criproxy_test.go @@ -82,6 +82,20 @@ var _ = SIGDescribe(feature.CriProxy, framework.WithSerial(), func() { isExpectedErrMsg := strings.Contains(eventMsg, expectedErr.Error()) gomega.Expect(isExpectedErrMsg).To(gomega.BeTrueBecause("we injected an exception into the PullImage interface of the cri proxy")) }) + }) + + ginkgo.Context("Inject a pull image error exception, then reset it, into the CriProxy", func() { + ginkgo.BeforeEach(func() { + if err := resetCRIProxyInjector(); err != nil { + ginkgo.Skip("Skip the test since the CRI Proxy is undefined.") + } + }) + + ginkgo.AfterEach(func() { + err := resetCRIProxyInjector() + framework.ExpectNoError(err) + }) + ginkgo.It("Image pull retry backs off on error.", func(ctx context.Context) { expectedErr := fmt.Errorf("PullImage failed") err := addCRIProxyInjector(func(apiName string) error { @@ -107,7 +121,7 @@ var _ = SIGDescribe(feature.CriProxy, framework.WithSerial(), func() { gomega.Expect(isExpectedErrMsg).To(gomega.BeTrueBecause("we injected an exception into the PullImage interface of the cri proxy")) // remove error so after backoff we will succeed - resetCRIProxyInjector() + _ = resetCRIProxyInjector() podErr = e2epod.WaitForPodRunningInNamespace(ctx, f.ClientSet, pod) framework.ExpectNoError(podErr) @@ -189,11 +203,13 @@ func getImageBackOffDurations(ctx context.Context, f *framework.Framework, podNa var r *BackOffRecord for _, event := range events.Items { if event.InvolvedObject.Name == podName { + switch event.Reason { case kubeletevents.PullingImage: if !pullTime.IsZero() { if event.FirstTimestamp.Time.After(pullTime) { r = records[backoffCount] + r.podName = podName r.duration = r.initialEventTime.Sub(r.backoffEventTimes[len(r.backoffEventTimes)-1]) backoffs = append(backoffs, r.duration) backoffCount++ From 6ef05dbd012c31cb2d74f046e5b5ecb3110211f1 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 5 Nov 2024 06:09:06 +0000 Subject: [PATCH 03/18] The idea of how this test should work Signed-off-by: Laura Lorenz --- test/e2e_node/criproxy_test.go | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/test/e2e_node/criproxy_test.go b/test/e2e_node/criproxy_test.go index 21bd79bc2f8..7de25fa3553 100644 --- a/test/e2e_node/criproxy_test.go +++ b/test/e2e_node/criproxy_test.go @@ -97,6 +97,7 @@ var _ = SIGDescribe(feature.CriProxy, framework.WithSerial(), func() { }) ginkgo.It("Image pull retry backs off on error.", func(ctx context.Context) { + // inject PullImage failed to trigger backoff expectedErr := fmt.Errorf("PullImage failed") err := addCRIProxyInjector(func(apiName string) error { if apiName == criproxy.PullImage { 
@@ -120,15 +121,22 @@ var _ = SIGDescribe(feature.CriProxy, framework.WithSerial(), func() { isExpectedErrMsg := strings.Contains(eventMsg, expectedErr.Error()) gomega.Expect(isExpectedErrMsg).To(gomega.BeTrueBecause("we injected an exception into the PullImage interface of the cri proxy")) - // remove error so after backoff we will succeed + // Wait for ~40s worth of backoffs to occur so we can confirm the backoff growth. + podErr = e2epod.WaitForPodContainerStarted(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, 40*time.Second) + gomega.Expect(podErr).To(gomega.HaveOccurred(), "Expected container not to start from repeadedly backing off image pulls") + + // Remove error so after next backoff we will succeed. _ = resetCRIProxyInjector() podErr = e2epod.WaitForPodRunningInNamespace(ctx, f.ClientSet, pod) framework.ExpectNoError(podErr) + // Parse observed backoffs (TODO: don't use Events, but logs) and compare to expectations durations, err := getImageBackOffDurations(ctx, f, pod.Name) framework.ExpectNoError(err) + gomega.Expect(durations).Error().ShouldNot(gomega.BeNil(), "Should have observed backoffs in Pod event log") gomega.Expect(durations[0]).Should(gomega.BeNumerically("~", time.Duration(10*time.Second), time.Duration(2*time.Second))) + // TODO: and check the next set of durations are 2x, etc }) }) @@ -191,25 +199,26 @@ func getImageBackOffDurations(ctx context.Context, f *framework.Framework, podNa var backoffs []time.Duration type BackOffRecord struct { - podName string initialEventTime time.Time backoffEventTimes []time.Time duration time.Duration } records := make(map[int]*BackOffRecord) + records[0] = &BackOffRecord{} var backoffCount int var pullTime time.Time var r *BackOffRecord + // I'm doing this here for events but really it needs to be off kubelet logs or some kind of synchronous watch + // Because the events normalize to the latest + // But this is the idea for _, event := range events.Items { if event.InvolvedObject.Name == podName { - switch event.Reason { case kubeletevents.PullingImage: if !pullTime.IsZero() { if event.FirstTimestamp.Time.After(pullTime) { r = records[backoffCount] - r.podName = podName r.duration = r.initialEventTime.Sub(r.backoffEventTimes[len(r.backoffEventTimes)-1]) backoffs = append(backoffs, r.duration) backoffCount++ From 2479d91f2a443850c29c02e0cf64faed6c620cc2 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Mon, 11 Nov 2024 22:34:15 +0000 Subject: [PATCH 04/18] Fix test to count pull tries Signed-off-by: Laura Lorenz --- test/e2e_node/criproxy_test.go | 97 ++++++++++++++-------------------- 1 file changed, 41 insertions(+), 56 deletions(-) diff --git a/test/e2e_node/criproxy_test.go b/test/e2e_node/criproxy_test.go index 7de25fa3553..7438c0ca150 100644 --- a/test/e2e_node/criproxy_test.go +++ b/test/e2e_node/criproxy_test.go @@ -84,22 +84,22 @@ var _ = SIGDescribe(feature.CriProxy, framework.WithSerial(), func() { }) }) - ginkgo.Context("Inject a pull image error exception, then reset it, into the CriProxy", func() { + ginkgo.Context("Image pull backoff", func() { ginkgo.BeforeEach(func() { - if err := resetCRIProxyInjector(); err != nil { + if err := resetCRIProxyInjector(e2eCriProxy); err != nil { ginkgo.Skip("Skip the test since the CRI Proxy is undefined.") } }) ginkgo.AfterEach(func() { - err := resetCRIProxyInjector() + err := resetCRIProxyInjector(e2eCriProxy) framework.ExpectNoError(err) }) ginkgo.It("Image pull retry backs off on error.", func(ctx context.Context) { // inject PullImage failed to trigger backoff expectedErr := 
fmt.Errorf("PullImage failed") - err := addCRIProxyInjector(func(apiName string) error { + err := addCRIProxyInjector(e2eCriProxy, func(apiName string) error { if apiName == criproxy.PullImage { return expectedErr } @@ -107,7 +107,8 @@ var _ = SIGDescribe(feature.CriProxy, framework.WithSerial(), func() { }) framework.ExpectNoError(err) - pod := e2epod.NewPodClient(f).Create(ctx, newPullImageAlwaysPod()) + pod := e2epod.NewPodClient(f).Create(ctx, newImageBackoffPod()) + framework.Logf("pod name: %s", pod.Name) podErr := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "ImagePullBackOff", 1*time.Minute, func(pod *v1.Pod) (bool, error) { if len(pod.Status.ContainerStatuses) > 0 && pod.Status.Reason == images.ErrImagePullBackOff.Error() { return true, nil @@ -121,22 +122,16 @@ var _ = SIGDescribe(feature.CriProxy, framework.WithSerial(), func() { isExpectedErrMsg := strings.Contains(eventMsg, expectedErr.Error()) gomega.Expect(isExpectedErrMsg).To(gomega.BeTrueBecause("we injected an exception into the PullImage interface of the cri proxy")) - // Wait for ~40s worth of backoffs to occur so we can confirm the backoff growth. - podErr = e2epod.WaitForPodContainerStarted(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, 40*time.Second) + // Wait for ~60s worth of backoffs to occur so we can confirm the backoff growth. + podErr = e2epod.WaitForPodContainerStarted(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, 1*time.Minute) gomega.Expect(podErr).To(gomega.HaveOccurred(), "Expected container not to start from repeadedly backing off image pulls") - // Remove error so after next backoff we will succeed. - _ = resetCRIProxyInjector() - - podErr = e2epod.WaitForPodRunningInNamespace(ctx, f.ClientSet, pod) - framework.ExpectNoError(podErr) - - // Parse observed backoffs (TODO: don't use Events, but logs) and compare to expectations - durations, err := getImageBackOffDurations(ctx, f, pod.Name) + e, err := getImageBackoffs(ctx, f, pod.Name) framework.ExpectNoError(err) - gomega.Expect(durations).Error().ShouldNot(gomega.BeNil(), "Should have observed backoffs in Pod event log") - gomega.Expect(durations[0]).Should(gomega.BeNumerically("~", time.Duration(10*time.Second), time.Duration(2*time.Second))) - // TODO: and check the next set of durations are 2x, etc + // 3 would take 10s best case + gomega.Expect(e.Count).Should(gomega.BeNumerically(">", 3)) + // 6 would take 150s best case + gomega.Expect(e.Count).Should(gomega.BeNumerically("<=", 6)) }) }) @@ -190,49 +185,20 @@ func getFailedToPullImageMsg(ctx context.Context, f *framework.Framework, podNam return "", fmt.Errorf("failed to find FailedToPullImage event for pod: %s", podName) } -func getImageBackOffDurations(ctx context.Context, f *framework.Framework, podName string) ([]time.Duration, error) { - events, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{}) +func getImageBackoffs(ctx context.Context, f *framework.Framework, podName string) (v1.Event, error) { + + event := v1.Event{} + e, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{}) if err != nil { - return nil, err + return event, err } - var backoffs []time.Duration - - type BackOffRecord struct { - initialEventTime time.Time - backoffEventTimes []time.Time - duration time.Duration - } - - records := make(map[int]*BackOffRecord) - records[0] = &BackOffRecord{} - var backoffCount int - var pullTime time.Time - var r *BackOffRecord - // I'm doing this here for events but really it needs to be off 
kubelet logs or some kind of synchronous watch - // Because the events normalize to the latest - // But this is the idea - for _, event := range events.Items { - if event.InvolvedObject.Name == podName { - switch event.Reason { - case kubeletevents.PullingImage: - if !pullTime.IsZero() { - if event.FirstTimestamp.Time.After(pullTime) { - r = records[backoffCount] - r.duration = r.initialEventTime.Sub(r.backoffEventTimes[len(r.backoffEventTimes)-1]) - backoffs = append(backoffs, r.duration) - backoffCount++ - } - } - pullTime = event.FirstTimestamp.Time - records[backoffCount].initialEventTime = pullTime - case kubeletevents.BackOffPullImage: - current := records[backoffCount].backoffEventTimes - current = append(current, event.FirstTimestamp.Time) - } + for _, event := range e.Items { + if event.InvolvedObject.Name == podName && event.Reason == kubeletevents.PullingImage { + return event, nil } } - return backoffs, nil + return event, nil } func getPodImagePullDuration(ctx context.Context, f *framework.Framework, podName string) (time.Duration, error) { @@ -260,6 +226,25 @@ func getPodImagePullDuration(ctx context.Context, f *framework.Framework, podNam return endTime.Sub(startTime), nil } +func newImageBackoffPod() *v1.Pod { + podName := "image-backoff" + string(uuid.NewUUID()) + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Image: imageutils.GetPauseImageName(), + Name: podName, + ImagePullPolicy: v1.PullAlways, + }, + }, + }, + } + return pod +} + func newPullImageAlwaysPod() *v1.Pod { podName := "cri-proxy-test-" + string(uuid.NewUUID()) pod := &v1.Pod{ From fc4ac5efebd97124b7c608e0fe61ee9eb13c3f74 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 12 Nov 2024 01:26:48 +0000 Subject: [PATCH 05/18] Move image pull backoff test to be with other image pull tests Signed-off-by: Laura Lorenz --- test/e2e_node/criproxy_test.go | 71 -------------------------------- test/e2e_node/image_pull_test.go | 55 +++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 71 deletions(-) diff --git a/test/e2e_node/criproxy_test.go b/test/e2e_node/criproxy_test.go index 7438c0ca150..93f87a93f76 100644 --- a/test/e2e_node/criproxy_test.go +++ b/test/e2e_node/criproxy_test.go @@ -96,44 +96,6 @@ var _ = SIGDescribe(feature.CriProxy, framework.WithSerial(), func() { framework.ExpectNoError(err) }) - ginkgo.It("Image pull retry backs off on error.", func(ctx context.Context) { - // inject PullImage failed to trigger backoff - expectedErr := fmt.Errorf("PullImage failed") - err := addCRIProxyInjector(e2eCriProxy, func(apiName string) error { - if apiName == criproxy.PullImage { - return expectedErr - } - return nil - }) - framework.ExpectNoError(err) - - pod := e2epod.NewPodClient(f).Create(ctx, newImageBackoffPod()) - framework.Logf("pod name: %s", pod.Name) - podErr := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "ImagePullBackOff", 1*time.Minute, func(pod *v1.Pod) (bool, error) { - if len(pod.Status.ContainerStatuses) > 0 && pod.Status.Reason == images.ErrImagePullBackOff.Error() { - return true, nil - } - return false, nil - }) - gomega.Expect(podErr).To(gomega.HaveOccurred()) - - eventMsg, err := getFailedToPullImageMsg(ctx, f, pod.Name) - framework.ExpectNoError(err) - isExpectedErrMsg := strings.Contains(eventMsg, expectedErr.Error()) - gomega.Expect(isExpectedErrMsg).To(gomega.BeTrueBecause("we injected an exception into the PullImage interface of the cri proxy")) - - // Wait for ~60s worth 
of backoffs to occur so we can confirm the backoff growth. - podErr = e2epod.WaitForPodContainerStarted(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, 1*time.Minute) - gomega.Expect(podErr).To(gomega.HaveOccurred(), "Expected container not to start from repeadedly backing off image pulls") - - e, err := getImageBackoffs(ctx, f, pod.Name) - framework.ExpectNoError(err) - // 3 would take 10s best case - gomega.Expect(e.Count).Should(gomega.BeNumerically(">", 3)) - // 6 would take 150s best case - gomega.Expect(e.Count).Should(gomega.BeNumerically("<=", 6)) - - }) }) ginkgo.Context("Inject a pull image timeout exception into the CriProxy", func() { @@ -185,21 +147,7 @@ func getFailedToPullImageMsg(ctx context.Context, f *framework.Framework, podNam return "", fmt.Errorf("failed to find FailedToPullImage event for pod: %s", podName) } -func getImageBackoffs(ctx context.Context, f *framework.Framework, podName string) (v1.Event, error) { - event := v1.Event{} - e, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{}) - if err != nil { - return event, err - } - - for _, event := range e.Items { - if event.InvolvedObject.Name == podName && event.Reason == kubeletevents.PullingImage { - return event, nil - } - } - return event, nil -} func getPodImagePullDuration(ctx context.Context, f *framework.Framework, podName string) (time.Duration, error) { events, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{}) @@ -226,25 +174,6 @@ func getPodImagePullDuration(ctx context.Context, f *framework.Framework, podNam return endTime.Sub(startTime), nil } -func newImageBackoffPod() *v1.Pod { - podName := "image-backoff" + string(uuid.NewUUID()) - pod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: podName, - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Image: imageutils.GetPauseImageName(), - Name: podName, - ImagePullPolicy: v1.PullAlways, - }, - }, - }, - } - return pod -} - func newPullImageAlwaysPod() *v1.Pod { podName := "cri-proxy-test-" + string(uuid.NewUUID()) pod := &v1.Pod{ diff --git a/test/e2e_node/image_pull_test.go b/test/e2e_node/image_pull_test.go index 21e4a113a88..ecaa67e733d 100644 --- a/test/e2e_node/image_pull_test.go +++ b/test/e2e_node/image_pull_test.go @@ -34,6 +34,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" kubeletevents "k8s.io/kubernetes/pkg/kubelet/events" + "k8s.io/kubernetes/pkg/kubelet/images" "k8s.io/kubernetes/test/e2e/feature" "k8s.io/kubernetes/test/e2e/framework" e2epod "k8s.io/kubernetes/test/e2e/framework/pod" @@ -230,6 +231,45 @@ var _ = SIGDescribe("Pull Image", feature.CriProxy, framework.WithSerial(), func }) }) + + ginkgo.It("Image pull retry backs off on error.", func(ctx context.Context) { + // inject PullImage failed to trigger backoff + expectedErr := fmt.Errorf("PullImage failed") + err := addCRIProxyInjector(e2eCriProxy, func(apiName string) error { + if apiName == criproxy.PullImage { + return expectedErr + } + return nil + }) + framework.ExpectNoError(err) + + pod := e2epod.NewPodClient(f).Create(ctx, newPullImageAlwaysPod()) + podErr := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "ImagePullBackOff", 1*time.Minute, func(pod *v1.Pod) (bool, error) { + if len(pod.Status.ContainerStatuses) > 0 && pod.Status.Reason == images.ErrImagePullBackOff.Error() { + return true, nil + } + return false, nil + }) + gomega.Expect(podErr).To(gomega.HaveOccurred()) + + eventMsg, err := 
getFailedToPullImageMsg(ctx, f, pod.Name) + framework.ExpectNoError(err) + isExpectedErrMsg := strings.Contains(eventMsg, expectedErr.Error()) + gomega.Expect(isExpectedErrMsg).To(gomega.BeTrueBecause("we injected an exception into the PullImage interface of the cri proxy")) + + // Wait for ~60s worth of backoffs to occur so we can confirm the backoff growth. + podErr = e2epod.WaitForPodContainerStarted(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, 1*time.Minute) + gomega.Expect(podErr).To(gomega.HaveOccurred(), "Expected container not to start from repeadedly backing off image pulls") + + e, err := getImagePullAttempts(ctx, f, pod.Name) + framework.ExpectNoError(err) + // 3 would take 10s best case + gomega.Expect(e.Count).Should(gomega.BeNumerically(">", 3)) + // 6 would take 150s best case + gomega.Expect(e.Count).Should(gomega.BeNumerically("<=", 6)) + + }) + }) func getPodImagePullDurations(ctx context.Context, f *framework.Framework, testpods []*v1.Pod) (map[string]*pulledStruct, map[string]metav1.Time, map[string]metav1.Time, error) { @@ -343,3 +383,18 @@ func getDurationsFromPulledEventMsg(msg string) (*pulledStruct, error) { pulledIncludeWaitingDuration: pulledIncludeWaitingDuration, }, nil } + +func getImagePullAttempts(ctx context.Context, f *framework.Framework, podName string) (v1.Event, error) { + event := v1.Event{} + e, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{}) + if err != nil { + return event, err + } + + for _, event := range e.Items { + if event.InvolvedObject.Name == podName && event.Reason == kubeletevents.PullingImage { + return event, nil + } + } + return event, nil +} From 15bae1eadf74c4d12adcb50c8eea48e2a429a36d Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 12 Nov 2024 04:30:46 +0000 Subject: [PATCH 06/18] Add container restart test too Signed-off-by: Laura Lorenz --- test/e2e/framework/pod/wait.go | 15 +++ test/e2e_node/container_restart_test.go | 120 ++++++++++++++++++++++++ test/utils/image/manifest.go | 5 + 3 files changed, 140 insertions(+) create mode 100644 test/e2e_node/container_restart_test.go diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index 537d59c3224..40cf4db53b4 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -868,3 +868,18 @@ func WaitForContainerTerminated(ctx context.Context, c clientset.Interface, name return false, nil }) } + +// WaitForContainerRestartedNTimes waits for the given Pod container to have restarted N times +func WaitForContainerRestartedNTimes(ctx context.Context, c clientset.Interface, namespace, podName, containerName string, timeout time.Duration, target int) error { + conditionDesc := fmt.Sprintf("container %s restarted %d times", containerName, target) + return WaitForPodCondition(ctx, c, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) { + for _, statuses := range [][]v1.ContainerStatus{pod.Status.ContainerStatuses, pod.Status.InitContainerStatuses, pod.Status.EphemeralContainerStatuses} { + for _, cs := range statuses { + if cs.Name == containerName { + return cs.RestartCount >= int32(target), nil + } + } + } + return false, nil + }) +} diff --git a/test/e2e_node/container_restart_test.go b/test/e2e_node/container_restart_test.go new file mode 100644 index 00000000000..631d29e3bf5 --- /dev/null +++ b/test/e2e_node/container_restart_test.go @@ -0,0 +1,120 @@ +package e2enode + +import ( + "context" + "time" + + imageutils "k8s.io/kubernetes/test/utils/image" + + 
"github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/uuid" + kubeletevents "k8s.io/kubernetes/pkg/kubelet/events" + "k8s.io/kubernetes/test/e2e/feature" + "k8s.io/kubernetes/test/e2e/framework" + e2epod "k8s.io/kubernetes/test/e2e/framework/pod" + admissionapi "k8s.io/pod-security-admission/api" +) + +var _ = SIGDescribe("Container Restart", feature.CriProxy, framework.WithSerial(), func() { + f := framework.NewDefaultFramework("container-restart") + f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged + + ginkgo.Context("Container restart backs off", func() { + + ginkgo.BeforeEach(func() { + if err := resetCRIProxyInjector(e2eCriProxy); err != nil { + ginkgo.Skip("Skip the test since the CRI Proxy is undefined.") + } + }) + + ginkgo.AfterEach(func() { + err := resetCRIProxyInjector(e2eCriProxy) + framework.ExpectNoError(err) + }) + + ginkgo.It("Container restart backs off.", func(ctx context.Context) { + // 3 would take 10s best case, 6 would take 150s best case + doTest(ctx, f, 5, time.Duration(80*time.Second), time.Duration(10*time.Second)) + }) + }) + + ginkgo.Context("Alternate container restart backs off as expected", func() { + + tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { + initialConfig.CrashLoopBackOff.MaxContainerRestartPeriod = &metav1.Duration{Duration: time.Duration(30 * time.Second)} + initialConfig.FeatureGates = map[string]bool{"KubeletCrashLoopBackOffMax": true} + }) + + ginkgo.BeforeEach(func() { + if err := resetCRIProxyInjector(e2eCriProxy); err != nil { + ginkgo.Skip("Skip the test since the CRI Proxy is undefined.") + } + }) + + ginkgo.AfterEach(func() { + err := resetCRIProxyInjector(e2eCriProxy) + framework.ExpectNoError(err) + }) + + ginkgo.It("Alternate restart backs off.", func(ctx context.Context) { + doTest(ctx, f, 7, time.Duration(120*time.Second), time.Duration(10*time.Second)) + }) + }) +}) + +func doTest(ctx context.Context, f *framework.Framework, maxRestarts int, target time.Duration, threshold time.Duration) { + + pod := e2epod.NewPodClient(f).Create(ctx, newFailAlwaysPod()) + podErr := e2epod.WaitForPodContainerToFail(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, "CrashLoopBackOff", 1*time.Minute) + gomega.Expect(podErr).To(gomega.HaveOccurred()) + + // Wait for 120s worth of backoffs to occur so we can confirm the backoff growth. 
+ podErr = e2epod.WaitForContainerRestartedNTimes(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "restart", 150*time.Second, maxRestarts) + gomega.Expect(podErr).ShouldNot(gomega.HaveOccurred(), "Expected container to repeatedly back off container failures") + + d, err := getContainerRetryDuration(ctx, f, pod.Name) + framework.ExpectNoError(err) + + gomega.Expect(d).Should(gomega.BeNumerically("~", target, threshold)) +} + +func getContainerRetryDuration(ctx context.Context, f *framework.Framework, podName string) (time.Duration, error) { + + var d time.Duration + e, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{}) + if err != nil { + return d, err + } + + for _, event := range e.Items { + if event.InvolvedObject.Name == podName && event.Reason == kubeletevents.StartedContainer { + return event.LastTimestamp.Time.Sub(event.FirstTimestamp.Time), nil + } + } + return d, nil +} + +func newFailAlwaysPod() *v1.Pod { + podName := "container-restart" + string(uuid.NewUUID()) + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "restart", + Image: imageutils.GetBusyBoxImageName(), + ImagePullPolicy: v1.PullAlways, + Command: []string{"exit 1"}, + }, + }, + }, + } + return pod +} diff --git a/test/utils/image/manifest.go b/test/utils/image/manifest.go index 2bc34b09292..a027e7892d8 100644 --- a/test/utils/image/manifest.go +++ b/test/utils/image/manifest.go @@ -355,6 +355,11 @@ func GetPauseImageName() string { return GetE2EImage(Pause) } +// GetBusyBoxImageName returns the busybox image name with proper version +func GetBusyBoxImageName() string { + return GetE2EImage(BusyBox) +} + // ReplaceRegistryInImageURL replaces the registry in the image URL with a custom one based // on the configured registries. 
func ReplaceRegistryInImageURL(imageURL string) (string, error) { From bad037b505efd2a56ac41f7135438d7c6ec37d2f Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 12 Nov 2024 04:48:10 +0000 Subject: [PATCH 07/18] Formatting Signed-off-by: Laura Lorenz --- test/e2e_node/criproxy_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/e2e_node/criproxy_test.go b/test/e2e_node/criproxy_test.go index 93f87a93f76..63339f321f5 100644 --- a/test/e2e_node/criproxy_test.go +++ b/test/e2e_node/criproxy_test.go @@ -147,8 +147,6 @@ func getFailedToPullImageMsg(ctx context.Context, f *framework.Framework, podNam return "", fmt.Errorf("failed to find FailedToPullImage event for pod: %s", podName) } - - func getPodImagePullDuration(ctx context.Context, f *framework.Framework, podName string) (time.Duration, error) { events, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{}) if err != nil { From f032068ef75ae34fdc0bc406b91b05df19c22365 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 12 Nov 2024 07:04:32 +0000 Subject: [PATCH 08/18] Focus on restart numbers instead of timing Signed-off-by: Laura Lorenz --- test/e2e/framework/pod/wait.go | 2 +- test/e2e_node/container_restart_test.go | 45 +++++++++++++------------ 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index 40cf4db53b4..6e69f8ec487 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -871,7 +871,7 @@ func WaitForContainerTerminated(ctx context.Context, c clientset.Interface, name // WaitForContainerRestartedNTimes waits for the given Pod container to have restarted N times func WaitForContainerRestartedNTimes(ctx context.Context, c clientset.Interface, namespace, podName, containerName string, timeout time.Duration, target int) error { - conditionDesc := fmt.Sprintf("container %s restarted %d times", containerName, target) + conditionDesc := fmt.Sprintf("container %s restarted at least %d times", containerName, target) return WaitForPodCondition(ctx, c, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) { for _, statuses := range [][]v1.ContainerStatus{pod.Status.ContainerStatuses, pod.Status.InitContainerStatuses, pod.Status.EphemeralContainerStatuses} { for _, cs := range statuses { diff --git a/test/e2e_node/container_restart_test.go b/test/e2e_node/container_restart_test.go index 631d29e3bf5..503b3b2d0e2 100644 --- a/test/e2e_node/container_restart_test.go +++ b/test/e2e_node/container_restart_test.go @@ -13,13 +13,14 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/uuid" - kubeletevents "k8s.io/kubernetes/pkg/kubelet/events" "k8s.io/kubernetes/test/e2e/feature" "k8s.io/kubernetes/test/e2e/framework" e2epod "k8s.io/kubernetes/test/e2e/framework/pod" admissionapi "k8s.io/pod-security-admission/api" ) +const containerName = "restarts" + var _ = SIGDescribe("Container Restart", feature.CriProxy, framework.WithSerial(), func() { f := framework.NewDefaultFramework("container-restart") f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged @@ -38,8 +39,8 @@ var _ = SIGDescribe("Container Restart", feature.CriProxy, framework.WithSerial( }) ginkgo.It("Container restart backs off.", func(ctx context.Context) { - // 3 would take 10s best case, 6 would take 150s best case - doTest(ctx, f, 5, time.Duration(80*time.Second), time.Duration(10*time.Second)) + // 0s, 0s, 10s, 30s, 70s, 150s, 310s + 
doTest(ctx, f, 5, containerName, 7) }) }) @@ -62,41 +63,42 @@ var _ = SIGDescribe("Container Restart", feature.CriProxy, framework.WithSerial( }) ginkgo.It("Alternate restart backs off.", func(ctx context.Context) { - doTest(ctx, f, 7, time.Duration(120*time.Second), time.Duration(10*time.Second)) + // 0s, 0s, 10s, 30s, 60s, 90s, 120s, 150, 180, 210) + doTest(ctx, f, 7, containerName, 10) }) }) }) -func doTest(ctx context.Context, f *framework.Framework, maxRestarts int, target time.Duration, threshold time.Duration) { +func doTest(ctx context.Context, f *framework.Framework, targetRestarts int, containerName string, maxRestarts int) { pod := e2epod.NewPodClient(f).Create(ctx, newFailAlwaysPod()) podErr := e2epod.WaitForPodContainerToFail(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, "CrashLoopBackOff", 1*time.Minute) gomega.Expect(podErr).To(gomega.HaveOccurred()) - // Wait for 120s worth of backoffs to occur so we can confirm the backoff growth. - podErr = e2epod.WaitForContainerRestartedNTimes(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "restart", 150*time.Second, maxRestarts) + // Wait for 150s worth of backoffs to occur so we can confirm the backoff growth. + podErr = e2epod.WaitForContainerRestartedNTimes(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "restart", 150*time.Second, targetRestarts) gomega.Expect(podErr).ShouldNot(gomega.HaveOccurred(), "Expected container to repeatedly back off container failures") - d, err := getContainerRetryDuration(ctx, f, pod.Name) + r, err := extractObservedBackoff(ctx, f, pod.Name, containerName) framework.ExpectNoError(err) - gomega.Expect(d).Should(gomega.BeNumerically("~", target, threshold)) + gomega.Expect(r).Should(gomega.BeNumerically("<=", maxRestarts)) } -func getContainerRetryDuration(ctx context.Context, f *framework.Framework, podName string) (time.Duration, error) { - - var d time.Duration - e, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{}) +func extractObservedBackoff(ctx context.Context, f *framework.Framework, podName string, containerName string) (int32, error) { + var r int32 + pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, podName, metav1.GetOptions{}) if err != nil { - return d, err + return r, err } - - for _, event := range e.Items { - if event.InvolvedObject.Name == podName && event.Reason == kubeletevents.StartedContainer { - return event.LastTimestamp.Time.Sub(event.FirstTimestamp.Time), nil + for _, statuses := range [][]v1.ContainerStatus{pod.Status.ContainerStatuses, pod.Status.InitContainerStatuses, pod.Status.EphemeralContainerStatuses} { + for _, cs := range statuses { + if cs.Name == containerName { + return r, nil + } } } - return d, nil + return r, nil } func newFailAlwaysPod() *v1.Pod { @@ -108,10 +110,9 @@ func newFailAlwaysPod() *v1.Pod { Spec: v1.PodSpec{ Containers: []v1.Container{ { - Name: "restart", + Name: containerName, Image: imageutils.GetBusyBoxImageName(), - ImagePullPolicy: v1.PullAlways, - Command: []string{"exit 1"}, + ImagePullPolicy: v1.PullIfNotPresent, }, }, }, From e6059d7386479770ab5588dd33cc168b5c76e2e2 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 12 Nov 2024 19:48:38 +0000 Subject: [PATCH 09/18] Fix typecheck and verify Signed-off-by: Laura Lorenz --- test/e2e_node/container_restart_test.go | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/e2e_node/container_restart_test.go b/test/e2e_node/container_restart_test.go index 503b3b2d0e2..17445ae7ada 100644 --- 
a/test/e2e_node/container_restart_test.go +++ b/test/e2e_node/container_restart_test.go @@ -1,3 +1,22 @@ +//go:build linux +// +build linux + +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package e2enode import ( From 2732d57e33fd8c45a6f4856fce52f7c8ed8073c9 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 12 Nov 2024 18:04:08 +0000 Subject: [PATCH 10/18] Missed refactor of container name here Signed-off-by: Laura Lorenz --- test/e2e_node/container_restart_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/e2e_node/container_restart_test.go b/test/e2e_node/container_restart_test.go index 17445ae7ada..97c908ef250 100644 --- a/test/e2e_node/container_restart_test.go +++ b/test/e2e_node/container_restart_test.go @@ -94,8 +94,8 @@ func doTest(ctx context.Context, f *framework.Framework, targetRestarts int, con podErr := e2epod.WaitForPodContainerToFail(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, "CrashLoopBackOff", 1*time.Minute) gomega.Expect(podErr).To(gomega.HaveOccurred()) - // Wait for 150s worth of backoffs to occur so we can confirm the backoff growth. - podErr = e2epod.WaitForContainerRestartedNTimes(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "restart", 150*time.Second, targetRestarts) + // Wait for 210s worth of backoffs to occur so we can confirm the backoff growth. 
+ podErr = e2epod.WaitForContainerRestartedNTimes(ctx, f.ClientSet, f.Namespace.Name, pod.Name, containerName, 210*time.Second, targetRestarts) gomega.Expect(podErr).ShouldNot(gomega.HaveOccurred(), "Expected container to repeatedly back off container failures") r, err := extractObservedBackoff(ctx, f, pod.Name, containerName) From 1da8ca816eb8f7303b10508cdcbc4e0553285fd5 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 12 Nov 2024 20:00:11 +0000 Subject: [PATCH 11/18] Extract restart number properly Signed-off-by: Laura Lorenz --- test/e2e_node/container_restart_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e_node/container_restart_test.go b/test/e2e_node/container_restart_test.go index 97c908ef250..b7b5eacc4d1 100644 --- a/test/e2e_node/container_restart_test.go +++ b/test/e2e_node/container_restart_test.go @@ -113,7 +113,7 @@ func extractObservedBackoff(ctx context.Context, f *framework.Framework, podName for _, statuses := range [][]v1.ContainerStatus{pod.Status.ContainerStatuses, pod.Status.InitContainerStatuses, pod.Status.EphemeralContainerStatuses} { for _, cs := range statuses { if cs.Name == containerName { - return r, nil + return cs.RestartCount, nil } } } From d293c5088fa110516cd2aec295f13ca1209e15e3 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 12 Nov 2024 21:12:20 +0000 Subject: [PATCH 12/18] Fix spelling Signed-off-by: Laura Lorenz --- test/e2e_node/image_pull_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e_node/image_pull_test.go b/test/e2e_node/image_pull_test.go index ecaa67e733d..d759a883143 100644 --- a/test/e2e_node/image_pull_test.go +++ b/test/e2e_node/image_pull_test.go @@ -259,7 +259,7 @@ var _ = SIGDescribe("Pull Image", feature.CriProxy, framework.WithSerial(), func // Wait for ~60s worth of backoffs to occur so we can confirm the backoff growth. 
podErr = e2epod.WaitForPodContainerStarted(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, 1*time.Minute) - gomega.Expect(podErr).To(gomega.HaveOccurred(), "Expected container not to start from repeadedly backing off image pulls") + gomega.Expect(podErr).To(gomega.HaveOccurred(), "Expected container not to start from repeatedly backing off image pulls") e, err := getImagePullAttempts(ctx, f, pod.Name) framework.ExpectNoError(err) From e03d0f60ef27a777de278bb8969acf6c134b68f3 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 12 Nov 2024 21:48:28 +0000 Subject: [PATCH 13/18] Orient tests to run faster, but tolerate infra slowdowns up to 5 minutes Signed-off-by: Laura Lorenz --- test/e2e_node/container_restart_test.go | 14 ++++++++------ test/e2e_node/image_pull_test.go | 9 ++++----- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/test/e2e_node/container_restart_test.go b/test/e2e_node/container_restart_test.go index b7b5eacc4d1..5a2d530f3f2 100644 --- a/test/e2e_node/container_restart_test.go +++ b/test/e2e_node/container_restart_test.go @@ -27,6 +27,7 @@ import ( "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" + "github.com/pkg/errors" kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" v1 "k8s.io/api/core/v1" @@ -59,7 +60,7 @@ var _ = SIGDescribe("Container Restart", feature.CriProxy, framework.WithSerial( ginkgo.It("Container restart backs off.", func(ctx context.Context) { // 0s, 0s, 10s, 30s, 70s, 150s, 310s - doTest(ctx, f, 5, containerName, 7) + doTest(ctx, f, 3, containerName, 7) }) }) @@ -82,8 +83,8 @@ var _ = SIGDescribe("Container Restart", feature.CriProxy, framework.WithSerial( }) ginkgo.It("Alternate restart backs off.", func(ctx context.Context) { - // 0s, 0s, 10s, 30s, 60s, 90s, 120s, 150, 180, 210) - doTest(ctx, f, 7, containerName, 10) + // 0s, 0s, 10s, 30s, 60s, 90s, 120s, 150s, 180s, 210s, 240s, 270s, 300s + doTest(ctx, f, 3, containerName, 13) }) }) }) @@ -94,8 +95,9 @@ func doTest(ctx context.Context, f *framework.Framework, targetRestarts int, con podErr := e2epod.WaitForPodContainerToFail(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, "CrashLoopBackOff", 1*time.Minute) gomega.Expect(podErr).To(gomega.HaveOccurred()) - // Wait for 210s worth of backoffs to occur so we can confirm the backoff growth. - podErr = e2epod.WaitForContainerRestartedNTimes(ctx, f.ClientSet, f.Namespace.Name, pod.Name, containerName, 210*time.Second, targetRestarts) + // Hard wait 30 seconds for targetRestarts in the best case; longer timeout later will handle if infra was slow. 
+ time.Sleep(30 * time.Second) + podErr = e2epod.WaitForContainerRestartedNTimes(ctx, f.ClientSet, f.Namespace.Name, pod.Name, containerName, 5*time.Minute, targetRestarts) gomega.Expect(podErr).ShouldNot(gomega.HaveOccurred(), "Expected container to repeatedly back off container failures") r, err := extractObservedBackoff(ctx, f, pod.Name, containerName) @@ -117,7 +119,7 @@ func extractObservedBackoff(ctx context.Context, f *framework.Framework, podName } } } - return r, nil + return r, errors.Errorf("Could not find container status for container %s in pod %s", containerName, podName) } func newFailAlwaysPod() *v1.Pod { diff --git a/test/e2e_node/image_pull_test.go b/test/e2e_node/image_pull_test.go index d759a883143..fce685e5264 100644 --- a/test/e2e_node/image_pull_test.go +++ b/test/e2e_node/image_pull_test.go @@ -257,16 +257,15 @@ var _ = SIGDescribe("Pull Image", feature.CriProxy, framework.WithSerial(), func isExpectedErrMsg := strings.Contains(eventMsg, expectedErr.Error()) gomega.Expect(isExpectedErrMsg).To(gomega.BeTrueBecause("we injected an exception into the PullImage interface of the cri proxy")) - // Wait for ~60s worth of backoffs to occur so we can confirm the backoff growth. - podErr = e2epod.WaitForPodContainerStarted(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, 1*time.Minute) + podErr = e2epod.WaitForPodContainerStarted(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, 30*time.Second) gomega.Expect(podErr).To(gomega.HaveOccurred(), "Expected container not to start from repeatedly backing off image pulls") e, err := getImagePullAttempts(ctx, f, pod.Name) framework.ExpectNoError(err) - // 3 would take 10s best case + // 3 would take 10s best case. gomega.Expect(e.Count).Should(gomega.BeNumerically(">", 3)) - // 6 would take 150s best case - gomega.Expect(e.Count).Should(gomega.BeNumerically("<=", 6)) + // 7 would take 310s best case, if the infra went slow. + gomega.Expect(e.Count).Should(gomega.BeNumerically("<=", 7)) }) From 285d433dea79e6838cdd0648c36a2a82fd398179 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 12 Nov 2024 22:48:17 +0000 Subject: [PATCH 14/18] Clearer image pull test and utils Signed-off-by: Laura Lorenz --- .../k8s.io/kubectl/pkg/util/podutils/podutils.go | 9 ++++++--- test/e2e/framework/pod/wait.go | 16 +++++----------- test/e2e_node/container_restart_test.go | 2 +- test/e2e_node/image_pull_test.go | 4 ++-- 4 files changed, 14 insertions(+), 17 deletions(-) diff --git a/staging/src/k8s.io/kubectl/pkg/util/podutils/podutils.go b/staging/src/k8s.io/kubectl/pkg/util/podutils/podutils.go index dfefdef4567..9e457f406d3 100644 --- a/staging/src/k8s.io/kubectl/pkg/util/podutils/podutils.go +++ b/staging/src/k8s.io/kubectl/pkg/util/podutils/podutils.go @@ -191,7 +191,10 @@ func podReadyTime(pod *corev1.Pod) *metav1.Time { return &metav1.Time{} } -func maxContainerRestarts(pod *corev1.Pod) (regularRestarts, sidecarRestarts int) { +// MaxContainerRestarts iterates through all the normal containers and sidecar +// containers in a Pod object and reports the highest restart count observed per +// category. +func MaxContainerRestarts(pod *corev1.Pod) (regularRestarts, sidecarRestarts int) { for _, c := range pod.Status.ContainerStatuses { regularRestarts = max(regularRestarts, int(c.RestartCount)) } @@ -214,8 +217,8 @@ func maxContainerRestarts(pod *corev1.Pod) (regularRestarts, sidecarRestarts int // false: pj has a higher container restart count. // nil: Both have the same container restart count. 
func compareMaxContainerRestarts(pi *corev1.Pod, pj *corev1.Pod) *bool { - regularRestartsI, sidecarRestartsI := maxContainerRestarts(pi) - regularRestartsJ, sidecarRestartsJ := maxContainerRestarts(pj) + regularRestartsI, sidecarRestartsI := MaxContainerRestarts(pi) + regularRestartsJ, sidecarRestartsJ := MaxContainerRestarts(pj) if regularRestartsI != regularRestartsJ { res := regularRestartsI > regularRestartsJ return &res diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index 6e69f8ec487..7a416c475c7 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -869,17 +869,11 @@ func WaitForContainerTerminated(ctx context.Context, c clientset.Interface, name }) } -// WaitForContainerRestartedNTimes waits for the given Pod container to have restarted N times -func WaitForContainerRestartedNTimes(ctx context.Context, c clientset.Interface, namespace, podName, containerName string, timeout time.Duration, target int) error { - conditionDesc := fmt.Sprintf("container %s restarted at least %d times", containerName, target) +// WaitForContainerRestartedNTimes waits for a container in the Pod to have restarted N times +func WaitForContainerRestartedNTimes(ctx context.Context, c clientset.Interface, namespace string, podName string, timeout time.Duration, target int) error { + conditionDesc := fmt.Sprintf("A container in pod %s restarted at least %d times", podName, target) return WaitForPodCondition(ctx, c, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) { - for _, statuses := range [][]v1.ContainerStatus{pod.Status.ContainerStatuses, pod.Status.InitContainerStatuses, pod.Status.EphemeralContainerStatuses} { - for _, cs := range statuses { - if cs.Name == containerName { - return cs.RestartCount >= int32(target), nil - } - } - } - return false, nil + r, _ := podutils.MaxContainerRestarts(pod) + return r >= target, nil }) } diff --git a/test/e2e_node/container_restart_test.go b/test/e2e_node/container_restart_test.go index 5a2d530f3f2..5fdbae30a11 100644 --- a/test/e2e_node/container_restart_test.go +++ b/test/e2e_node/container_restart_test.go @@ -97,7 +97,7 @@ func doTest(ctx context.Context, f *framework.Framework, targetRestarts int, con // Hard wait 30 seconds for targetRestarts in the best case; longer timeout later will handle if infra was slow. 
time.Sleep(30 * time.Second) - podErr = e2epod.WaitForContainerRestartedNTimes(ctx, f.ClientSet, f.Namespace.Name, pod.Name, containerName, 5*time.Minute, targetRestarts) + podErr = e2epod.WaitForContainerRestartedNTimes(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 5*time.Minute, targetRestarts) gomega.Expect(podErr).ShouldNot(gomega.HaveOccurred(), "Expected container to repeatedly back off container failures") r, err := extractObservedBackoff(ctx, f, pod.Name, containerName) diff --git a/test/e2e_node/image_pull_test.go b/test/e2e_node/image_pull_test.go index fce685e5264..28c393f2a96 100644 --- a/test/e2e_node/image_pull_test.go +++ b/test/e2e_node/image_pull_test.go @@ -257,8 +257,8 @@ var _ = SIGDescribe("Pull Image", feature.CriProxy, framework.WithSerial(), func isExpectedErrMsg := strings.Contains(eventMsg, expectedErr.Error()) gomega.Expect(isExpectedErrMsg).To(gomega.BeTrueBecause("we injected an exception into the PullImage interface of the cri proxy")) - podErr = e2epod.WaitForPodContainerStarted(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, 30*time.Second) - gomega.Expect(podErr).To(gomega.HaveOccurred(), "Expected container not to start from repeatedly backing off image pulls") + // Hard wait 30 seconds for image pulls to repeatedly back off. + time.Sleep(30 * time.Second) e, err := getImagePullAttempts(ctx, f, pod.Name) framework.ExpectNoError(err) From 8e7b2af712a79db423b7bfdf2a7bf35f75542c13 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 12 Nov 2024 23:26:04 +0000 Subject: [PATCH 15/18] Use a better util Signed-off-by: Laura Lorenz --- .../src/k8s.io/kubectl/pkg/util/podutils/podutils.go | 9 +++------ test/e2e/framework/pod/wait.go | 12 ++++++++---- test/e2e_node/container_restart_test.go | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/staging/src/k8s.io/kubectl/pkg/util/podutils/podutils.go b/staging/src/k8s.io/kubectl/pkg/util/podutils/podutils.go index 9e457f406d3..dfefdef4567 100644 --- a/staging/src/k8s.io/kubectl/pkg/util/podutils/podutils.go +++ b/staging/src/k8s.io/kubectl/pkg/util/podutils/podutils.go @@ -191,10 +191,7 @@ func podReadyTime(pod *corev1.Pod) *metav1.Time { return &metav1.Time{} } -// MaxContainerRestarts iterates through all the normal containers and sidecar -// containers in a Pod object and reports the highest restart count observed per -// category. -func MaxContainerRestarts(pod *corev1.Pod) (regularRestarts, sidecarRestarts int) { +func maxContainerRestarts(pod *corev1.Pod) (regularRestarts, sidecarRestarts int) { for _, c := range pod.Status.ContainerStatuses { regularRestarts = max(regularRestarts, int(c.RestartCount)) } @@ -217,8 +214,8 @@ func MaxContainerRestarts(pod *corev1.Pod) (regularRestarts, sidecarRestarts int // false: pj has a higher container restart count. // nil: Both have the same container restart count. 
func compareMaxContainerRestarts(pi *corev1.Pod, pj *corev1.Pod) *bool { - regularRestartsI, sidecarRestartsI := MaxContainerRestarts(pi) - regularRestartsJ, sidecarRestartsJ := MaxContainerRestarts(pj) + regularRestartsI, sidecarRestartsI := maxContainerRestarts(pi) + regularRestartsJ, sidecarRestartsJ := maxContainerRestarts(pj) if regularRestartsI != regularRestartsJ { res := regularRestartsI > regularRestartsJ return &res diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index 7a416c475c7..5cbaaab8a25 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -37,6 +37,7 @@ import ( apitypes "k8s.io/apimachinery/pkg/types" clientset "k8s.io/client-go/kubernetes" "k8s.io/kubectl/pkg/util/podutils" + podv1util "k8s.io/kubernetes/pkg/api/v1/pod" "k8s.io/kubernetes/test/e2e/framework" testutils "k8s.io/kubernetes/test/utils" "k8s.io/kubernetes/test/utils/format" @@ -869,11 +870,14 @@ func WaitForContainerTerminated(ctx context.Context, c clientset.Interface, name }) } -// WaitForContainerRestartedNTimes waits for a container in the Pod to have restarted N times -func WaitForContainerRestartedNTimes(ctx context.Context, c clientset.Interface, namespace string, podName string, timeout time.Duration, target int) error { +// WaitForContainerRestartedNTimes waits for the given normal container in the Pod to have restarted N times +func WaitForContainerRestartedNTimes(ctx context.Context, c clientset.Interface, namespace string, podName string, containerName string, timeout time.Duration, target int) error { conditionDesc := fmt.Sprintf("A container in pod %s restarted at least %d times", podName, target) return WaitForPodCondition(ctx, c, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) { - r, _ := podutils.MaxContainerRestarts(pod) - return r >= target, nil + cs, found := podv1util.GetContainerStatus(pod.Status.ContainerStatuses, containerName) + if !found { + return false, fmt.Errorf("could not find container %s in pod %s", containerName, podName) + } + return cs.RestartCount >= int32(target), nil }) } diff --git a/test/e2e_node/container_restart_test.go b/test/e2e_node/container_restart_test.go index 5fdbae30a11..5a2d530f3f2 100644 --- a/test/e2e_node/container_restart_test.go +++ b/test/e2e_node/container_restart_test.go @@ -97,7 +97,7 @@ func doTest(ctx context.Context, f *framework.Framework, targetRestarts int, con // Hard wait 30 seconds for targetRestarts in the best case; longer timeout later will handle if infra was slow. 
time.Sleep(30 * time.Second) - podErr = e2epod.WaitForContainerRestartedNTimes(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 5*time.Minute, targetRestarts) + podErr = e2epod.WaitForContainerRestartedNTimes(ctx, f.ClientSet, f.Namespace.Name, pod.Name, containerName, 5*time.Minute, targetRestarts) gomega.Expect(podErr).ShouldNot(gomega.HaveOccurred(), "Expected container to repeatedly back off container failures") r, err := extractObservedBackoff(ctx, f, pod.Name, containerName) From 529d5ba9d33a09664718db9697fce9b2a93948c5 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 12 Nov 2024 23:34:57 +0000 Subject: [PATCH 16/18] Don't overly indirect image name Signed-off-by: Laura Lorenz --- test/e2e_node/container_restart_test.go | 2 +- test/utils/image/manifest.go | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/test/e2e_node/container_restart_test.go b/test/e2e_node/container_restart_test.go index 5a2d530f3f2..48404b5e110 100644 --- a/test/e2e_node/container_restart_test.go +++ b/test/e2e_node/container_restart_test.go @@ -132,7 +132,7 @@ func newFailAlwaysPod() *v1.Pod { Containers: []v1.Container{ { Name: containerName, - Image: imageutils.GetBusyBoxImageName(), + Image: imageutils.GetE2EImage(imageutils.BusyBox), ImagePullPolicy: v1.PullIfNotPresent, }, }, diff --git a/test/utils/image/manifest.go b/test/utils/image/manifest.go index a027e7892d8..2bc34b09292 100644 --- a/test/utils/image/manifest.go +++ b/test/utils/image/manifest.go @@ -355,11 +355,6 @@ func GetPauseImageName() string { return GetE2EImage(Pause) } -// GetBusyBoxImageName returns the busybox image name with proper version -func GetBusyBoxImageName() string { - return GetE2EImage(BusyBox) -} - // ReplaceRegistryInImageURL replaces the registry in the image URL with a custom one based // on the configured registries. 
func ReplaceRegistryInImageURL(imageURL string) (string, error) { From 59f98580861e81441054b893efdcaeb0eac1a488 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Tue, 12 Nov 2024 23:59:30 +0000 Subject: [PATCH 17/18] Move function specific to container restart test inline Signed-off-by: Laura Lorenz --- test/e2e/framework/pod/wait.go | 13 ------------- test/e2e_node/container_restart_test.go | 15 ++++++++++++++- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/test/e2e/framework/pod/wait.go b/test/e2e/framework/pod/wait.go index 5cbaaab8a25..537d59c3224 100644 --- a/test/e2e/framework/pod/wait.go +++ b/test/e2e/framework/pod/wait.go @@ -37,7 +37,6 @@ import ( apitypes "k8s.io/apimachinery/pkg/types" clientset "k8s.io/client-go/kubernetes" "k8s.io/kubectl/pkg/util/podutils" - podv1util "k8s.io/kubernetes/pkg/api/v1/pod" "k8s.io/kubernetes/test/e2e/framework" testutils "k8s.io/kubernetes/test/utils" "k8s.io/kubernetes/test/utils/format" @@ -869,15 +868,3 @@ func WaitForContainerTerminated(ctx context.Context, c clientset.Interface, name return false, nil }) } - -// WaitForContainerRestartedNTimes waits for the given normal container in the Pod to have restarted N times -func WaitForContainerRestartedNTimes(ctx context.Context, c clientset.Interface, namespace string, podName string, containerName string, timeout time.Duration, target int) error { - conditionDesc := fmt.Sprintf("A container in pod %s restarted at least %d times", podName, target) - return WaitForPodCondition(ctx, c, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) { - cs, found := podv1util.GetContainerStatus(pod.Status.ContainerStatuses, containerName) - if !found { - return false, fmt.Errorf("could not find container %s in pod %s", containerName, podName) - } - return cs.RestartCount >= int32(target), nil - }) -} diff --git a/test/e2e_node/container_restart_test.go b/test/e2e_node/container_restart_test.go index 48404b5e110..086ce2d7d4c 100644 --- a/test/e2e_node/container_restart_test.go +++ b/test/e2e_node/container_restart_test.go @@ -21,8 +21,10 @@ package e2enode import ( "context" + "fmt" "time" + podv1util "k8s.io/kubernetes/pkg/api/v1/pod" imageutils "k8s.io/kubernetes/test/utils/image" "github.com/onsi/ginkgo/v2" @@ -97,7 +99,7 @@ func doTest(ctx context.Context, f *framework.Framework, targetRestarts int, con // Hard wait 30 seconds for targetRestarts in the best case; longer timeout later will handle if infra was slow. 
time.Sleep(30 * time.Second) - podErr = e2epod.WaitForContainerRestartedNTimes(ctx, f.ClientSet, f.Namespace.Name, pod.Name, containerName, 5*time.Minute, targetRestarts) + podErr = waitForContainerRestartedNTimes(ctx, f, f.Namespace.Name, pod.Name, containerName, 5*time.Minute, targetRestarts) gomega.Expect(podErr).ShouldNot(gomega.HaveOccurred(), "Expected container to repeatedly back off container failures") r, err := extractObservedBackoff(ctx, f, pod.Name, containerName) @@ -140,3 +142,14 @@ func newFailAlwaysPod() *v1.Pod { } return pod } + +func waitForContainerRestartedNTimes(ctx context.Context, f *framework.Framework, namespace string, podName string, containerName string, timeout time.Duration, target int) error { + conditionDesc := fmt.Sprintf("A container in pod %s restarted at least %d times", podName, target) + return e2epod.WaitForPodCondition(ctx, f.ClientSet, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) { + cs, found := podv1util.GetContainerStatus(pod.Status.ContainerStatuses, containerName) + if !found { + return false, fmt.Errorf("could not find container %s in pod %s", containerName, podName) + } + return cs.RestartCount >= int32(target), nil + }) +} From 9ab0d81d7667dd456a887dffe3c85f602de89fc7 Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Wed, 13 Nov 2024 01:39:58 +0000 Subject: [PATCH 18/18] Now that sleep is shorter, only expect to reach 3 within 30s Focused too much on the container restart one in commit that fixed that Signed-off-by: Laura Lorenz --- test/e2e_node/image_pull_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e_node/image_pull_test.go b/test/e2e_node/image_pull_test.go index 28c393f2a96..a80795f8f37 100644 --- a/test/e2e_node/image_pull_test.go +++ b/test/e2e_node/image_pull_test.go @@ -263,7 +263,7 @@ var _ = SIGDescribe("Pull Image", feature.CriProxy, framework.WithSerial(), func e, err := getImagePullAttempts(ctx, f, pod.Name) framework.ExpectNoError(err) // 3 would take 10s best case. - gomega.Expect(e.Count).Should(gomega.BeNumerically(">", 3)) + gomega.Expect(e.Count).Should(gomega.BeNumerically(">=", 3)) // 7 would take 310s best case, if the infra went slow. gomega.Expect(e.Count).Should(gomega.BeNumerically("<=", 7))
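
Note on the expected restart timings: the comments in these patches ("0s, 0s, 10s, 30s, 70s, 150s, 310s" for the default backoff and "0s, 0s, 10s, 30s, 60s, 90s, ..." for the 30s-capped alternate) follow from an exponential backoff that starts at 10s, doubles per retry, and is capped at a maximum period (300s by default, 30s with MaxContainerRestartPeriod set as above). The following is a minimal illustrative sketch, not kubelet's actual backoff implementation; the function and file names are invented for this example, and it ignores the extra immediate restart that accounts for the second leading 0s in the test comments.

// backoff_schedule_sketch.go (hypothetical, for illustration only)
package main

import (
	"fmt"
	"time"
)

// backoffSchedule returns the cumulative times at which successive restarts
// (or image pull retries) are expected to happen, assuming the delay starts
// at `initial`, doubles after every attempt, and never exceeds `max`.
func backoffSchedule(initial, max time.Duration, restarts int) []time.Duration {
	var schedule []time.Duration
	elapsed := time.Duration(0)
	delay := initial
	for i := 0; i < restarts; i++ {
		schedule = append(schedule, elapsed)
		elapsed += delay
		delay *= 2
		if delay > max {
			delay = max
		}
	}
	return schedule
}

func main() {
	// Default kubelet-style backoff: 10s initial, 300s cap.
	// Prints [0s 10s 30s 1m10s 2m30s 5m10s ...], i.e. 0s, 10s, 30s, 70s, 150s, 310s.
	fmt.Println(backoffSchedule(10*time.Second, 300*time.Second, 7))

	// Alternate config from the second test: 10s initial, 30s cap.
	// Prints 0s, 10s, 30s, 60s, 90s, 120s, ... (linear once the cap is hit).
	fmt.Println(backoffSchedule(10*time.Second, 30*time.Second, 13))
}

Under these assumptions, waiting roughly 30s of wall clock (as the hard sleeps above do) is enough to observe at least 3 attempts in the best case, while the upper bounds of 7 and 13 restarts correspond to about 310s and 300s of cumulative backoff respectively, which is why the tests tolerate up to 5 minutes for slow infrastructure.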