diff --git a/test/e2e_node/container_restart_test.go b/test/e2e_node/container_restart_test.go
new file mode 100644
index 00000000000..086ce2d7d4c
--- /dev/null
+++ b/test/e2e_node/container_restart_test.go
@@ -0,0 +1,155 @@
+//go:build linux
+// +build linux
+
+/*
+Copyright 2024 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2enode
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	podv1util "k8s.io/kubernetes/pkg/api/v1/pod"
+	imageutils "k8s.io/kubernetes/test/utils/image"
+
+	"github.com/onsi/ginkgo/v2"
+	"github.com/onsi/gomega"
+	"github.com/pkg/errors"
+	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
+
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/uuid"
+	"k8s.io/kubernetes/test/e2e/feature"
+	"k8s.io/kubernetes/test/e2e/framework"
+	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
+	admissionapi "k8s.io/pod-security-admission/api"
+)
+
+const containerName = "restarts"
+
+var _ = SIGDescribe("Container Restart", feature.CriProxy, framework.WithSerial(), func() {
+	f := framework.NewDefaultFramework("container-restart")
+	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
+
+	ginkgo.Context("Container restart backs off", func() {
+
+		ginkgo.BeforeEach(func() {
+			if err := resetCRIProxyInjector(e2eCriProxy); err != nil {
+				ginkgo.Skip("Skip the test since the CRI Proxy is undefined.")
+			}
+		})
+
+		ginkgo.AfterEach(func() {
+			err := resetCRIProxyInjector(e2eCriProxy)
+			framework.ExpectNoError(err)
+		})
+
+		ginkgo.It("Container restart backs off.", func(ctx context.Context) {
+			// 0s, 0s, 10s, 30s, 70s, 150s, 310s
+			doTest(ctx, f, 3, containerName, 7)
+		})
+	})
+
+	ginkgo.Context("Alternate container restart backs off as expected", func() {
+
+		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
+			initialConfig.CrashLoopBackOff.MaxContainerRestartPeriod = &metav1.Duration{Duration: time.Duration(30 * time.Second)}
+			initialConfig.FeatureGates = map[string]bool{"KubeletCrashLoopBackOffMax": true}
+		})
+
+		ginkgo.BeforeEach(func() {
+			if err := resetCRIProxyInjector(e2eCriProxy); err != nil {
+				ginkgo.Skip("Skip the test since the CRI Proxy is undefined.")
+			}
+		})
+
+		ginkgo.AfterEach(func() {
+			err := resetCRIProxyInjector(e2eCriProxy)
+			framework.ExpectNoError(err)
+		})
+
+		ginkgo.It("Alternate restart backs off.", func(ctx context.Context) {
+			// 0s, 0s, 10s, 30s, 60s, 90s, 120s, 150s, 180s, 210s, 240s, 270s, 300s
+			doTest(ctx, f, 3, containerName, 13)
+		})
+	})
+})
+
+func doTest(ctx context.Context, f *framework.Framework, targetRestarts int, containerName string, maxRestarts int) {
+
+	pod := e2epod.NewPodClient(f).Create(ctx, newFailAlwaysPod())
+	podErr := e2epod.WaitForPodContainerToFail(ctx, f.ClientSet, f.Namespace.Name, pod.Name, 0, "CrashLoopBackOff", 1*time.Minute)
+	gomega.Expect(podErr).To(gomega.HaveOccurred())
+
+	// Hard wait 30 seconds for targetRestarts in the best case; longer timeout later will handle if infra was slow.
+	time.Sleep(30 * time.Second)
+	podErr = waitForContainerRestartedNTimes(ctx, f, f.Namespace.Name, pod.Name, containerName, 5*time.Minute, targetRestarts)
+	gomega.Expect(podErr).ShouldNot(gomega.HaveOccurred(), "Expected container to repeatedly back off container failures")
+
+	r, err := extractObservedBackoff(ctx, f, pod.Name, containerName)
+	framework.ExpectNoError(err)
+
+	gomega.Expect(r).Should(gomega.BeNumerically("<=", maxRestarts))
+}
+
+func extractObservedBackoff(ctx context.Context, f *framework.Framework, podName string, containerName string) (int32, error) {
+	var r int32
+	pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, podName, metav1.GetOptions{})
+	if err != nil {
+		return r, err
+	}
+	for _, statuses := range [][]v1.ContainerStatus{pod.Status.ContainerStatuses, pod.Status.InitContainerStatuses, pod.Status.EphemeralContainerStatuses} {
+		for _, cs := range statuses {
+			if cs.Name == containerName {
+				return cs.RestartCount, nil
+			}
+		}
+	}
+	return r, errors.Errorf("Could not find container status for container %s in pod %s", containerName, podName)
+}
+
+func newFailAlwaysPod() *v1.Pod {
+	podName := "container-restart" + string(uuid.NewUUID())
+	pod := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: podName,
+		},
+		Spec: v1.PodSpec{
+			Containers: []v1.Container{
+				{
+					Name:            containerName,
+					Image:           imageutils.GetE2EImage(imageutils.BusyBox),
+					ImagePullPolicy: v1.PullIfNotPresent,
+				},
+			},
+		},
+	}
+	return pod
+}
+
+func waitForContainerRestartedNTimes(ctx context.Context, f *framework.Framework, namespace string, podName string, containerName string, timeout time.Duration, target int) error {
+	conditionDesc := fmt.Sprintf("A container in pod %s restarted at least %d times", podName, target)
+	return e2epod.WaitForPodCondition(ctx, f.ClientSet, namespace, podName, conditionDesc, timeout, func(pod *v1.Pod) (bool, error) {
+		cs, found := podv1util.GetContainerStatus(pod.Status.ContainerStatuses, containerName)
+		if !found {
+			return false, fmt.Errorf("could not find container %s in pod %s", containerName, podName)
+		}
+		return cs.RestartCount >= int32(target), nil
+	})
+}
diff --git a/test/e2e_node/criproxy_test.go b/test/e2e_node/criproxy_test.go
index 949a2379cbc..63339f321f5 100644
--- a/test/e2e_node/criproxy_test.go
+++ b/test/e2e_node/criproxy_test.go
@@ -84,6 +84,20 @@ var _ = SIGDescribe(feature.CriProxy, framework.WithSerial(), func() {
 		})
 	})
 
+	ginkgo.Context("Image pull backoff", func() {
+		ginkgo.BeforeEach(func() {
+			if err := resetCRIProxyInjector(e2eCriProxy); err != nil {
+				ginkgo.Skip("Skip the test since the CRI Proxy is undefined.")
+			}
+		})
+
+		ginkgo.AfterEach(func() {
+			err := resetCRIProxyInjector(e2eCriProxy)
+			framework.ExpectNoError(err)
+		})
+
+	})
+
 	ginkgo.Context("Inject a pull image timeout exception into the CriProxy", func() {
 		ginkgo.BeforeEach(func() {
 			if err := resetCRIProxyInjector(e2eCriProxy); err != nil {
diff --git a/test/e2e_node/image_pull_test.go b/test/e2e_node/image_pull_test.go
index 21e4a113a88..a80795f8f37 100644
--- a/test/e2e_node/image_pull_test.go
+++ b/test/e2e_node/image_pull_test.go
@@ -34,6 +34,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 	kubeletevents "k8s.io/kubernetes/pkg/kubelet/events"
+	"k8s.io/kubernetes/pkg/kubelet/images"
 	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -230,6 +231,44 @@ var _ = SIGDescribe("Pull Image", feature.CriProxy, framework.WithSerial(), func
 		})
 	})
 
+
+	ginkgo.It("Image pull retry backs off on error.", func(ctx context.Context) {
+		// inject PullImage failed to trigger backoff
+		expectedErr := fmt.Errorf("PullImage failed")
+		err := addCRIProxyInjector(e2eCriProxy, func(apiName string) error {
+			if apiName == criproxy.PullImage {
+				return expectedErr
+			}
+			return nil
+		})
+		framework.ExpectNoError(err)
+
+		pod := e2epod.NewPodClient(f).Create(ctx, newPullImageAlwaysPod())
+		podErr := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "ImagePullBackOff", 1*time.Minute, func(pod *v1.Pod) (bool, error) {
+			if len(pod.Status.ContainerStatuses) > 0 && pod.Status.Reason == images.ErrImagePullBackOff.Error() {
+				return true, nil
+			}
+			return false, nil
+		})
+		gomega.Expect(podErr).To(gomega.HaveOccurred())
+
+		eventMsg, err := getFailedToPullImageMsg(ctx, f, pod.Name)
+		framework.ExpectNoError(err)
+		isExpectedErrMsg := strings.Contains(eventMsg, expectedErr.Error())
+		gomega.Expect(isExpectedErrMsg).To(gomega.BeTrueBecause("we injected an exception into the PullImage interface of the cri proxy"))
+
+		// Hard wait 30 seconds for image pulls to repeatedly back off.
+		time.Sleep(30 * time.Second)
+
+		e, err := getImagePullAttempts(ctx, f, pod.Name)
+		framework.ExpectNoError(err)
+		// 3 would take 10s best case.
+		gomega.Expect(e.Count).Should(gomega.BeNumerically(">=", 3))
+		// 7 would take 310s best case, if the infra went slow.
+		gomega.Expect(e.Count).Should(gomega.BeNumerically("<=", 7))
+
+	})
+
 })
 
 func getPodImagePullDurations(ctx context.Context, f *framework.Framework, testpods []*v1.Pod) (map[string]*pulledStruct, map[string]metav1.Time, map[string]metav1.Time, error) {
@@ -343,3 +382,18 @@ func getDurationsFromPulledEventMsg(msg string) (*pulledStruct, error) {
 		pulledIncludeWaitingDuration: pulledIncludeWaitingDuration,
 	}, nil
 }
+
+func getImagePullAttempts(ctx context.Context, f *framework.Framework, podName string) (v1.Event, error) {
+	event := v1.Event{}
+	e, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return event, err
+	}
+
+	for _, event := range e.Items {
+		if event.InvolvedObject.Name == podName && event.Reason == kubeletevents.PullingImage {
+			return event, nil
+		}
+	}
+	return event, nil
+}