From f17e9117776ec8a3ef98e709a4eb586780328b3a Mon Sep 17 00:00:00 2001
From: Gunju Kim
Date: Thu, 13 Jul 2023 21:38:06 +0900
Subject: [PATCH] Add restartable init container probe e2e tests

---
 test/e2e/common/node/container_probe.go | 970 +++++++++++++++++++++++-
 1 file changed, 950 insertions(+), 20 deletions(-)

diff --git a/test/e2e/common/node/container_probe.go b/test/e2e/common/node/container_probe.go
index b8e29995b44..d5d714a5cd6 100644
--- a/test/e2e/common/node/container_probe.go
+++ b/test/e2e/common/node/container_probe.go
@@ -253,7 +253,7 @@ var _ = SIGDescribe("Probing container", func() {
 			FailureThreshold:    1,
 		}
 		pod := busyBoxPodSpec(readinessProbe, nil, cmd)
-		runReadinessFailTest(ctx, f, pod, time.Minute)
+		runReadinessFailTest(ctx, f, pod, time.Minute, true)
 	})
 
 	/*
@@ -727,6 +727,762 @@ done
 	})
 })
 
+var _ = SIGDescribe("[NodeAlphaFeature:SidecarContainers][Feature:SidecarContainers] Probing restartable init container", func() {
+	f := framework.NewDefaultFramework("container-probe")
+	f.NamespacePodSecurityLevel = admissionapi.LevelBaseline
+	var podClient *e2epod.PodClient
+	probe := webserverProbeBuilder{}
+
+	ginkgo.BeforeEach(func() {
+		podClient = e2epod.NewPodClient(f)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container readiness probe, with initial delay
+		Description: Create a Pod that is configured with an initial delay set on
+		the readiness probe. Check the Pod Start time to compare to the initial
+		delay. The Pod MUST be ready only after the specified initial delay.
+	*/
+	ginkgo.It("with readiness probe should not be ready before initial delay and never restart", func(ctx context.Context) {
+		containerName := "test-webserver"
+		p := podClient.Create(ctx, testWebServerSidecarPodSpec(probe.withInitialDelay().build(), nil, containerName, 80))
+		framework.ExpectNoError(e2epod.WaitTimeoutForPodReadyInNamespace(ctx, f.ClientSet, p.Name, f.Namespace.Name, framework.PodStartTimeout))
+
+		p, err := podClient.Get(ctx, p.Name, metav1.GetOptions{})
+		framework.ExpectNoError(err)
+		isReady, err := testutils.PodRunningReady(p)
+		framework.ExpectNoError(err)
+		if !isReady {
+			framework.Failf("pod %s/%s should be ready", f.Namespace.Name, p.Name)
+		}
+
+		// We assume the pod became ready when the container became ready. This
+		// is true for a single container pod.
+		readyTime, err := GetTransitionTimeForReadyCondition(p)
+		framework.ExpectNoError(err)
+		startedTime, err := GetContainerStartedTime(p, containerName)
+		framework.ExpectNoError(err)
+
+		framework.Logf("Container started at %v, pod became ready at %v", startedTime, readyTime)
+		initialDelay := probeTestInitialDelaySeconds * time.Second
+		if readyTime.Sub(startedTime) < initialDelay {
+			framework.Failf("Pod became ready before its %v initial delay", initialDelay)
+		}
+
+		restartCount := getRestartCount(p)
+		framework.ExpectEqual(restartCount, 0, "pod should have a restart count of 0 but got %v", restartCount)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container readiness probe, failure
+		Description: Create a Pod with a readiness probe that fails consistently.
+		When this Pod is created, then the Pod MUST never be ready, never be
+		running, and the restart count MUST be zero.
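+		Note: the restartable init container under test is an init container with
+		restartPolicy: Always (see the sidecar pod specs at the bottom of this
+		file), so its readiness feeds into the Pod's Ready condition just like a
+		regular container's readiness does.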
+	*/
+	ginkgo.It("with readiness probe that fails should never be ready and never restart", func(ctx context.Context) {
+		p := podClient.Create(ctx, testWebServerSidecarPodSpec(probe.withFailing().build(), nil, "test-webserver", 80))
+		gomega.Consistently(ctx, func() (bool, error) {
+			p, err := podClient.Get(ctx, p.Name, metav1.GetOptions{})
+			if err != nil {
+				return false, err
+			}
+			return podutil.IsPodReady(p), nil
+		}, 1*time.Minute, 1*time.Second).ShouldNot(gomega.BeTrue(), "pod should not be ready")
+
+		p, err := podClient.Get(ctx, p.Name, metav1.GetOptions{})
+		framework.ExpectNoError(err)
+
+		isReady, _ := testutils.PodRunningReady(p)
+		if isReady {
+			framework.Failf("pod %s/%s should be not ready", f.Namespace.Name, p.Name)
+		}
+
+		restartCount := getRestartCount(p)
+		framework.ExpectEqual(restartCount, 0, "pod should have a restart count of 0 but got %v", restartCount)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using local file, restart
+		Description: Create a Pod with liveness probe that uses ExecAction handler
+		to cat /tmp/health file. The Container deletes the file /tmp/health after
+		10 seconds, triggering liveness probe to fail. The Pod MUST now be killed
+		and restarted incrementing restart count to 1.
+	*/
+	ginkgo.It("should be restarted with a exec \"cat /tmp/health\" liveness probe", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "echo ok >/tmp/health; sleep 10; rm -rf /tmp/health; sleep 600"}
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        execHandler([]string{"cat", "/tmp/health"}),
+			InitialDelaySeconds: 15,
+			TimeoutSeconds:      5, // default 1s can be pretty aggressive in CI environments with low resources
+			FailureThreshold:    1,
+		}
+		pod := busyBoxSidecarPodSpec(nil, livenessProbe, cmd)
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using local file, no restart
+		Description: Pod is created with liveness probe that uses 'exec' command
+		to cat /tmp/health file. Liveness probe MUST not fail to check health and
+		the restart count should remain 0.
+	*/
+	ginkgo.It("should *not* be restarted with a exec \"cat /tmp/health\" liveness probe", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "echo ok >/tmp/health; sleep 600"}
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        execHandler([]string{"cat", "/tmp/health"}),
+			InitialDelaySeconds: 15,
+			TimeoutSeconds:      5, // default 1s can be pretty aggressive in CI environments with low resources
+			FailureThreshold:    1,
+		}
+		pod := busyBoxSidecarPodSpec(nil, livenessProbe, cmd)
+		RunSidecarLivenessTest(ctx, f, pod, 0, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using http endpoint, restart
+		Description: A Pod is created with liveness probe on http endpoint
+		/healthz. The http handler on the /healthz will return a http error after
+		10 seconds since the Pod is started. This MUST result in liveness check
+		failure. The Pod MUST now be killed and restarted incrementing restart
+		count to 1.
+	*/
+	ginkgo.It("should be restarted with a /healthz http liveness probe", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        httpGetHandler("/healthz", 8080),
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		pod := livenessSidecarPodSpec(f.Namespace.Name, nil, livenessProbe)
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using tcp socket, no restart
+		Description: A Pod is created with liveness probe on tcp socket 8080. The
+		http handler on port 8080 will return http errors after 10 seconds, but the
+		socket will remain open. Liveness probe MUST not fail to check health and
+		the restart count should remain 0.
+	*/
+	ginkgo.It("should *not* be restarted with a tcp:8080 liveness probe", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        tcpSocketHandler(8080),
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		pod := livenessSidecarPodSpec(f.Namespace.Name, nil, livenessProbe)
+		RunSidecarLivenessTest(ctx, f, pod, 0, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using http endpoint, multiple restarts (slow)
+		Description: A Pod is created with liveness probe on http endpoint
+		/healthz. The http handler on the /healthz will return a http error after
+		10 seconds since the Pod is started. This MUST result in liveness check
+		failure. The Pod MUST now be killed and restarted incrementing restart
+		count to 1. The liveness probe must fail again after restart once the http
+		handler for the /healthz endpoint on the Pod returns an http error after 10
+		seconds from the start. Restart counts MUST increment every time the health
+		check fails, measured up to 5 restarts.
+	*/
+	ginkgo.It("should have monotonically increasing restart count", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        httpGetHandler("/healthz", 8080),
+			InitialDelaySeconds: 5,
+			FailureThreshold:    1,
+		}
+		pod := livenessSidecarPodSpec(f.Namespace.Name, nil, livenessProbe)
+		// ~2 minutes backoff timeouts + 4 minutes defaultObservationTimeout + 2 minutes for each pod restart
+		RunSidecarLivenessTest(ctx, f, pod, 5, 2*time.Minute+defaultObservationTimeout+4*2*time.Minute)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using http endpoint, no restart
+		Description: A Pod is created with liveness probe on http endpoint '/'.
+		Liveness probe on this endpoint will not fail. When liveness probe does not
+		fail then the restart count MUST remain zero.
+	*/
+	ginkgo.It("should *not* be restarted with a /healthz http liveness probe", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        httpGetHandler("/", 80),
+			InitialDelaySeconds: 15,
+			TimeoutSeconds:      5,
+			FailureThreshold:    5, // to accommodate nodes which are slow in bringing up containers.
+		}
+		pod := testWebServerSidecarPodSpec(nil, livenessProbe, "test-webserver", 80)
+		RunSidecarLivenessTest(ctx, f, pod, 0, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, container exec timeout, restart
+		Description: A Pod is created with liveness probe with an Exec action on the
+		Pod. If the liveness probe call does not return within the timeout
+		specified, liveness probe MUST restart the Pod.
+	*/
+	ginkgo.It("should be restarted with an exec liveness probe with timeout [MinimumKubeletVersion:1.20]", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 600"}
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        execHandler([]string{"/bin/sh", "-c", "sleep 10"}),
+			InitialDelaySeconds: 15,
+			TimeoutSeconds:      1,
+			FailureThreshold:    1,
+		}
+		pod := busyBoxSidecarPodSpec(nil, livenessProbe, cmd)
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container readiness probe, container exec timeout, not ready
+		Description: A Pod is created with readiness probe with an Exec action on
+		the Pod. If the readiness probe call does not return within the timeout
+		specified, the Pod MUST not become Ready.
+	*/
+	ginkgo.It("should not be ready with an exec readiness probe timeout [MinimumKubeletVersion:1.20]", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 600"}
+		readinessProbe := &v1.Probe{
+			ProbeHandler:        execHandler([]string{"/bin/sh", "-c", "sleep 10"}),
+			InitialDelaySeconds: 15,
+			TimeoutSeconds:      1,
+			FailureThreshold:    1,
+		}
+		pod := busyBoxSidecarPodSpec(readinessProbe, nil, cmd)
+		runReadinessFailTest(ctx, f, pod, time.Minute, false)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, container exec timeout, restart
+		Description: A Pod is created with liveness probe with an Exec action on the
+		Pod. If the liveness probe call does not return within the timeout
+		specified, liveness probe MUST restart the Pod. When ExecProbeTimeout
+		feature gate is disabled and cluster is using dockershim, the timeout is
+		ignored BUT a failing liveness probe MUST restart the Pod.
+	*/
+	ginkgo.It("should be restarted with a failing exec liveness probe that took longer than the timeout", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 600"}
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        execHandler([]string{"/bin/sh", "-c", "sleep 10 & exit 1"}),
+			InitialDelaySeconds: 15,
+			TimeoutSeconds:      1,
+			FailureThreshold:    1,
+		}
+		pod := busyBoxSidecarPodSpec(nil, livenessProbe, cmd)
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container http liveness probe, redirected to a local address
+		Description: A Pod is created with liveness probe on http endpoint
+		/redirect?loc=healthz. The http handler on the /redirect will redirect to
+		the /healthz endpoint, which will return a http error after 10 seconds
+		since the Pod is started. This MUST result in liveness check failure. The
+		Pod MUST now be killed and restarted incrementing restart count to 1.
+	*/
+	ginkgo.It("should be restarted with a local redirect http liveness probe", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        httpGetHandler("/redirect?loc="+url.QueryEscape("/healthz"), 8080),
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		pod := livenessSidecarPodSpec(f.Namespace.Name, nil, livenessProbe)
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container http liveness probe, redirected to a non-local address
+		Description: A Pod is created with liveness probe on http endpoint
+		/redirect with a redirect to http://0.0.0.0/. The liveness probe MUST NOT
+		follow the non-local redirect; instead it MUST treat the redirect as a
+		success and generate an event.
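+		Note: the ProbeWarning event asserted below is how the kubelet reports
+		that it treated the non-local redirect as a probe success rather than
+		following it.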
+	*/
+	ginkgo.It("should *not* be restarted with a non-local redirect http liveness probe", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        httpGetHandler("/redirect?loc="+url.QueryEscape("http://0.0.0.0/"), 8080),
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		pod := livenessSidecarPodSpec(f.Namespace.Name, nil, livenessProbe)
+		RunSidecarLivenessTest(ctx, f, pod, 0, defaultObservationTimeout)
+		// Expect an event of type "ProbeWarning".
+		expectedEvent := fields.Set{
+			"involvedObject.kind":      "Pod",
+			"involvedObject.name":      pod.Name,
+			"involvedObject.namespace": f.Namespace.Name,
+			"reason":                   events.ContainerProbeWarning,
+		}.AsSelector().String()
+		framework.ExpectNoError(e2eevents.WaitTimeoutForEvent(
+			ctx, f.ClientSet, f.Namespace.Name, expectedEvent, "Probe terminated redirects, Response body: Found.", framework.PodEventTimeout))
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container startup probe restart
+		Description: A Pod is created with a failing startup probe. The Pod MUST be
+		killed and restarted incrementing restart count to 1, even if liveness
+		would succeed.
+	*/
+	ginkgo.It("should be restarted startup probe fails", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 600"}
+		livenessProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				Exec: &v1.ExecAction{
+					Command: []string{"/bin/true"},
+				},
+			},
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		startupProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				Exec: &v1.ExecAction{
+					Command: []string{"/bin/false"},
+				},
+			},
+			InitialDelaySeconds: 15,
+			FailureThreshold:    3,
+		}
+		pod := startupSidecarPodSpec(startupProbe, nil, livenessProbe, cmd)
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe delayed (long) by startup probe
+		Description: A Pod is created with failing liveness and startup probes.
+		Liveness probe MUST NOT fail until startup probe expires.
+	*/
+	ginkgo.It("should *not* be restarted by liveness probe because startup probe delays it", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 600"}
+		livenessProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				Exec: &v1.ExecAction{
+					Command: []string{"/bin/false"},
+				},
+			},
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		startupProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				Exec: &v1.ExecAction{
+					Command: []string{"/bin/false"},
+				},
+			},
+			InitialDelaySeconds: 15,
+			FailureThreshold:    60,
+		}
+		pod := startupSidecarPodSpec(startupProbe, nil, livenessProbe, cmd)
+		RunSidecarLivenessTest(ctx, f, pod, 0, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe fails after startup success
+		Description: A Pod is created with failing liveness probe and delayed
+		startup probe that uses 'exec' command to cat /tmp/startup file. The
+		Container is started by creating /tmp/startup after 10 seconds, triggering
+		liveness probe to fail. The Pod MUST now be killed and restarted,
+		incrementing restart count to 1.
+	*/
+	ginkgo.It("should be restarted by liveness probe after startup probe enables it", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 10; echo ok >/tmp/startup; sleep 600"}
+		livenessProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				Exec: &v1.ExecAction{
+					Command: []string{"/bin/false"},
+				},
+			},
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		startupProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				Exec: &v1.ExecAction{
+					Command: []string{"cat", "/tmp/startup"},
+				},
+			},
+			InitialDelaySeconds: 15,
+			FailureThreshold:    60,
+		}
+		pod := startupSidecarPodSpec(startupProbe, nil, livenessProbe, cmd)
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container readiness probe, delayed by startup probe
+		Description: A Pod is created with startup and readiness probes. The
+		Container is started by creating /tmp/startup after 32 seconds, delaying
+		the ready state by this amount of time. This is similar to the "Pod
+		readiness probe, with initial delay" test.
+	*/
+	ginkgo.It("should be ready immediately after startupProbe succeeds", func(ctx context.Context) {
+		// Probe workers sleep at Kubelet start for a random time which is at most PeriodSeconds;
+		// this test requires both readiness and startup workers running before updating statuses.
+		// To avoid flakes, ensure sleep before startup (32s) > readinessProbe.PeriodSeconds
+		cmd := []string{"/bin/sh", "-c", "echo ok >/tmp/health; sleep 32; echo ok >/tmp/startup; sleep 600"}
+		readinessProbe := &v1.Probe{
+			ProbeHandler:        execHandler([]string{"/bin/cat", "/tmp/health"}),
+			InitialDelaySeconds: 0,
+			PeriodSeconds:       30,
+		}
+		startupProbe := &v1.Probe{
+			ProbeHandler:        execHandler([]string{"/bin/cat", "/tmp/startup"}),
+			InitialDelaySeconds: 0,
+			FailureThreshold:    120,
+			PeriodSeconds:       5,
+		}
+		p := podClient.Create(ctx, startupSidecarPodSpec(startupProbe, readinessProbe, nil, cmd))
+
+		p, err := podClient.Get(ctx, p.Name, metav1.GetOptions{})
+		framework.ExpectNoError(err)
+
+		err = e2epod.WaitForPodContainerStarted(ctx, f.ClientSet, f.Namespace.Name, p.Name, 0, framework.PodStartTimeout)
+		framework.ExpectNoError(err)
+		startedTime := time.Now()
+
+		// We assume the pod became ready when the container became ready. This
+		// is true for a single container pod.
+		err = e2epod.WaitTimeoutForPodReadyInNamespace(ctx, f.ClientSet, p.Name, f.Namespace.Name, framework.PodStartTimeout)
+		framework.ExpectNoError(err)
+		readyTime := time.Now()
+
+		p, err = podClient.Get(ctx, p.Name, metav1.GetOptions{})
+		framework.ExpectNoError(err)
+
+		isReady, err := testutils.PodRunningReady(p)
+		framework.ExpectNoError(err)
+		if !isReady {
+			framework.Failf("pod %s/%s should be ready", f.Namespace.Name, p.Name)
+		}
+
+		readyIn := readyTime.Sub(startedTime)
+		framework.Logf("Container started at %v, pod became ready at %v, %v after startupProbe succeeded", startedTime, readyTime, readyIn)
+		if readyIn < 0 {
+			framework.Failf("Pod became ready before startupProbe succeeded")
+		}
+		if readyIn > 25*time.Second {
+			framework.Failf("Pod became ready in %v, more than 25s after startupProbe succeeded. It means that the delayed readiness probes were not initiated immediately after startup finished.", readyIn)
+		}
+	})
+
+	// TODO: Update tests after implementing termination ordering of restartable
+	// init containers
+	/*
+		Release: v1.28
+		Testname: Set terminationGracePeriodSeconds for livenessProbe of restartable init container
+		Description: A pod with a long terminationGracePeriod is created with a
+		shorter livenessProbe-level terminationGracePeriodSeconds. We confirm the
+		shorter termination period is used.
+	*/
+	ginkgo.It("should override timeoutGracePeriodSeconds when LivenessProbe field is set", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 1000"}
+		// probe will fail since pod has no http endpoints
+		shortGracePeriod := int64(5)
+		livenessProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				HTTPGet: &v1.HTTPGetAction{
+					Path: "/healthz",
+					Port: intstr.FromInt(8080),
+				},
+			},
+			InitialDelaySeconds:           10,
+			FailureThreshold:              1,
+			TerminationGracePeriodSeconds: &shortGracePeriod,
+		}
+		pod := busyBoxSidecarPodSpec(nil, livenessProbe, cmd)
+		longGracePeriod := int64(500)
+		pod.Spec.TerminationGracePeriodSeconds = &longGracePeriod
+
+		// 10s delay + 10s period + 5s grace period = 25s < 30s << pod-level timeout 500
+		// add defaultObservationTimeout(4min) more for kubelet syncing information
+		// to apiserver
+		RunSidecarLivenessTest(ctx, f, pod, 1, time.Second*40+defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Set terminationGracePeriodSeconds for startupProbe of restartable init container
+		Description: A pod with a long terminationGracePeriod is created with a
+		shorter startupProbe-level terminationGracePeriodSeconds. We confirm the
+		shorter termination period is used.
+	*/
+	ginkgo.It("should override timeoutGracePeriodSeconds when StartupProbe field is set", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 1000"}
+		// startup probe will fail since pod will sleep for 1000s before becoming ready
+		livenessProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				Exec: &v1.ExecAction{
+					Command: []string{"/bin/true"},
+				},
+			},
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		pod := busyBoxSidecarPodSpec(nil, livenessProbe, cmd)
+		longGracePeriod := int64(500)
+		pod.Spec.TerminationGracePeriodSeconds = &longGracePeriod
+
+		shortGracePeriod := int64(5)
+		pod.Spec.InitContainers[0].StartupProbe = &v1.Probe{
+			ProbeHandler:                  execHandler([]string{"/bin/cat", "/tmp/startup"}),
+			InitialDelaySeconds:           10,
+			FailureThreshold:              1,
+			TerminationGracePeriodSeconds: &shortGracePeriod,
+		}
+
+		// 10s delay + 10s period + 5s grace period = 25s < 30s << pod-level timeout 500
+		// add defaultObservationTimeout(4min) more for kubelet syncing information
+		// to apiserver
+		RunSidecarLivenessTest(ctx, f, pod, 1, time.Second*40+defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using grpc call, success
+		Description: A Pod is created with liveness probe on grpc service. Liveness
+		probe on this endpoint will not fail. When liveness probe does not fail
+		then the restart count MUST remain zero.
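+		Note: the gRPC server started by gRPCServerSidecarPodSpec exposes port
+		5000 (see the spec at the bottom of this file), matching the probe port
+		used here.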
+	*/
+	ginkgo.It("should *not* be restarted with a GRPC liveness probe", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				GRPC: &v1.GRPCAction{
+					Port:    5000,
+					Service: nil,
+				},
+			},
+			InitialDelaySeconds: probeTestInitialDelaySeconds,
+			TimeoutSeconds:      5, // default 1s can be pretty aggressive in CI environments with low resources
+			FailureThreshold:    1,
+		}
+
+		pod := gRPCServerSidecarPodSpec(nil, livenessProbe, "agnhost")
+		RunSidecarLivenessTest(ctx, f, pod, 0, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using grpc call, failure
+		Description: A Pod is created with liveness probe on grpc service.
+		Liveness probe on this endpoint should fail because of wrong probe port.
+		When the liveness probe does fail, the restart count MUST increase by 1.
+	*/
+	ginkgo.It("should be restarted with a GRPC liveness probe", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				GRPC: &v1.GRPCAction{
+					Port: 2333, // this port is wrong
+				},
+			},
+			InitialDelaySeconds: probeTestInitialDelaySeconds * 4,
+			TimeoutSeconds:      5, // default 1s can be pretty aggressive in CI environments with low resources
+			FailureThreshold:    1,
+		}
+		pod := gRPCServerSidecarPodSpec(nil, livenessProbe, "agnhost")
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	ginkgo.It("should mark readiness on pods to false while pod is in progress of terminating when a pod has a readiness probe", func(ctx context.Context) {
+		podName := "probe-test-" + string(uuid.NewUUID())
+		podClient := e2epod.NewPodClient(f)
+		terminationGracePeriod := int64(30)
+		script := `
+_term() {
+	rm -f /tmp/ready
+	sleep 30
+	exit 0
+}
+trap _term SIGTERM
+
+touch /tmp/ready
+
+while true; do
+	echo \"hello\"
+	sleep 10
+done
+	`
+
+		// Create Pod
+		podClient.Create(ctx, &v1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: podName,
+			},
+			Spec: v1.PodSpec{
+				InitContainers: []v1.Container{
+					{
+						Image:   imageutils.GetE2EImage(imageutils.Agnhost),
+						Name:    podName,
+						Command: []string{"/bin/bash"},
+						Args:    []string{"-c", script},
+						ReadinessProbe: &v1.Probe{
+							ProbeHandler: v1.ProbeHandler{
+								Exec: &v1.ExecAction{
+									Command: []string{"cat", "/tmp/ready"},
+								},
+							},
+							FailureThreshold:    1,
+							InitialDelaySeconds: 5,
+							PeriodSeconds:       2,
+						},
+						RestartPolicy: func() *v1.ContainerRestartPolicy {
+							restartPolicy := v1.ContainerRestartPolicyAlways
+							return &restartPolicy
+						}(),
+					},
+				},
+				Containers: []v1.Container{
+					{
+						Name:  "main",
+						Image: imageutils.GetE2EImage(imageutils.Agnhost),
+						Args:  []string{"pause"},
+					},
+				},
+				TerminationGracePeriodSeconds: &terminationGracePeriod,
+			},
+		})
+
+		// verify pods are running and ready
+		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart)
+		framework.ExpectNoError(err)
+
+		// Shutdown pod. Readiness should change to false
+		err = podClient.Delete(ctx, podName, metav1.DeleteOptions{})
+		framework.ExpectNoError(err)
+
+		err = waitForPodStatusByInformer(ctx, f.ClientSet, f.Namespace.Name, podName, f.Timeouts.PodDelete, func(pod *v1.Pod) (bool, error) {
+			if !podutil.IsPodReady(pod) {
+				return true, nil
+			}
+			framework.Logf("pod %s/%s is still ready, waiting until is not ready", pod.Namespace, pod.Name)
+			return false, nil
+		})
+		framework.ExpectNoError(err)
+	})
+
+	ginkgo.It("should mark readiness on pods to false and disable liveness probes while pod is in progress of terminating", func(ctx context.Context) {
+		podName := "probe-test-" + string(uuid.NewUUID())
+		podClient := e2epod.NewPodClient(f)
+		terminationGracePeriod := int64(30)
+		script := `
+_term() {
+	rm -f /tmp/ready
+	rm -f /tmp/liveness
+	sleep 20
+	exit 0
+}
+trap _term SIGTERM
+
+touch /tmp/ready
+touch /tmp/liveness
+
+while true; do
+	echo \"hello\"
+	sleep 10
+done
+`
+
+		// Create Pod
+		podClient.Create(ctx, &v1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: podName,
+			},
+			Spec: v1.PodSpec{
+				InitContainers: []v1.Container{
+					{
+						Image:   imageutils.GetE2EImage(imageutils.Agnhost),
+						Name:    podName,
+						Command: []string{"/bin/bash"},
+						Args:    []string{"-c", script},
+						ReadinessProbe: &v1.Probe{
+							ProbeHandler: v1.ProbeHandler{
+								Exec: &v1.ExecAction{
+									Command: []string{"cat", "/tmp/ready"},
+								},
+							},
+							FailureThreshold: 1,
+							// delay startup to make sure the script has
+							// time to create the ready+liveness files
+							InitialDelaySeconds: 5,
+							PeriodSeconds:       2,
+						},
+						LivenessProbe: &v1.Probe{
+							ProbeHandler: v1.ProbeHandler{
+								Exec: &v1.ExecAction{
+									Command: []string{"cat", "/tmp/liveness"},
+								},
+							},
+							FailureThreshold: 1,
+							// delay startup to make sure the script has
+							// time to create the ready+liveness files
+							InitialDelaySeconds: 5,
+							PeriodSeconds:       1,
+						},
+						RestartPolicy: func() *v1.ContainerRestartPolicy {
+							restartPolicy := v1.ContainerRestartPolicyAlways
+							return &restartPolicy
+						}(),
+					},
+				},
+				Containers: []v1.Container{
+					{
+						Name:  "main",
+						Image: imageutils.GetE2EImage(imageutils.Agnhost),
+						Args:  []string{"pause"},
+					},
+				},
+				TerminationGracePeriodSeconds: &terminationGracePeriod,
+			},
+		})
+
+		// verify pods are running and ready
+		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart)
+		framework.ExpectNoError(err)
+
+		// Shutdown pod. Readiness should change to false
+		err = podClient.Delete(ctx, podName, metav1.DeleteOptions{})
+		framework.ExpectNoError(err)
+
+		// Wait for pod to go unready
+		err = waitForPodStatusByInformer(ctx, f.ClientSet, f.Namespace.Name, podName, f.Timeouts.PodDelete, func(pod *v1.Pod) (bool, error) {
+			if !podutil.IsPodReady(pod) {
+				return true, nil
+			}
+			framework.Logf("pod %s/%s is still ready, waiting until is not ready", pod.Namespace, pod.Name)
+			return false, nil
+		})
+		framework.ExpectNoError(err)
+
+		// Verify there are zero liveness failures since they are turned off
+		// during pod termination
+		gomega.Consistently(ctx, func(ctx context.Context) (bool, error) {
+			items, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{})
+			framework.ExpectNoError(err)
+			for _, event := range items.Items {
+				// Search only for the pod we are interested in
+				if event.InvolvedObject.Name != podName {
+					continue
+				}
+				if strings.Contains(event.Message, "failed liveness probe") {
+					return true, errors.New("should not see liveness probe failures")
+				}
+			}
+			return false, nil
+		}, 1*time.Minute, framework.Poll).ShouldNot(gomega.BeTrue(), "should not see liveness probes")
+	})
+})
+
 // waitForPodStatusByInformer waits pod status change by informer
 func waitForPodStatusByInformer(ctx context.Context, c clientset.Interface, podNamespace, podName string, timeout time.Duration, condition func(pod *v1.Pod) (bool, error)) error {
 	// TODO (pohly): rewrite with gomega.Eventually to get intermediate progress reports.
@@ -793,7 +1549,7 @@ func newInformerWatchPod(ctx context.Context, c clientset.Interface, podNamespac
 
 // GetContainerStartedTime returns the time when the given container started and error if any
 func GetContainerStartedTime(p *v1.Pod, containerName string) (time.Time, error) {
-	for _, status := range p.Status.ContainerStatuses {
+	for _, status := range append(p.Status.InitContainerStatuses, p.Status.ContainerStatuses...) {
 		if status.Name != containerName {
 			continue
 		}
@@ -817,7 +1573,7 @@ func GetTransitionTimeForReadyCondition(p *v1.Pod) (time.Time, error) {
 
 func getRestartCount(p *v1.Pod) int {
 	count := 0
-	for _, containerStatus := range p.Status.ContainerStatuses {
+	for _, containerStatus := range append(p.Status.InitContainerStatuses, p.Status.ContainerStatuses...) {
 		count += int(containerStatus.RestartCount)
 	}
 	return count
@@ -943,12 +1699,22 @@ func (b webserverProbeBuilder) build() *v1.Probe {
 	return probe
 }
 
-// RunLivenessTest verifies the number of restarts for pod with given expected number of restarts
 func RunLivenessTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, expectNumRestarts int, timeout time.Duration) {
-	podClient := e2epod.NewPodClient(f)
-	ns := f.Namespace.Name
 	gomega.Expect(pod.Spec.Containers).NotTo(gomega.BeEmpty())
 	containerName := pod.Spec.Containers[0].Name
+	runLivenessTest(ctx, f, pod, expectNumRestarts, timeout, containerName)
+}
+
+func RunSidecarLivenessTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, expectNumRestarts int, timeout time.Duration) {
+	gomega.Expect(pod.Spec.InitContainers).NotTo(gomega.BeEmpty())
+	containerName := pod.Spec.InitContainers[0].Name
+	runLivenessTest(ctx, f, pod, expectNumRestarts, timeout, containerName)
+}
+
+// runLivenessTest verifies the number of restarts for pod with given expected number of restarts
+func runLivenessTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, expectNumRestarts int, timeout time.Duration, containerName string) {
+	podClient := e2epod.NewPodClient(f)
+	ns := f.Namespace.Name
 	// At the end of the test, clean up by removing the pod.
 	ginkgo.DeferCleanup(func(ctx context.Context) error {
 		ginkgo.By("deleting the pod")
@@ -957,18 +1723,24 @@ func RunLivenessTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, e
 	ginkgo.By(fmt.Sprintf("Creating pod %s in namespace %s", pod.Name, ns))
 	podClient.Create(ctx, pod)
 
-	// Wait until the pod is not pending. (Here we need to check for something other than
-	// 'Pending' other than checking for 'Running', since when failures occur, we go to
-	// 'Terminated' which can cause indefinite blocking.)
-	framework.ExpectNoError(e2epod.WaitForPodNotPending(ctx, f.ClientSet, ns, pod.Name),
-		fmt.Sprintf("starting pod %s in namespace %s", pod.Name, ns))
-	framework.Logf("Started pod %s in namespace %s", pod.Name, ns)
+	// To check whether the container has ever started, wait for it to be in a
+	// non-waiting state.
+	framework.ExpectNoError(e2epod.WaitForPodCondition(ctx, f.ClientSet, ns, pod.Name, "container not waiting", timeout, func(pod *v1.Pod) (bool, error) {
+		for _, c := range append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...) {
+			if c.Name == containerName {
+				if c.State.Running != nil || c.State.Terminated != nil {
+					return true, nil
+				}
+			}
+		}
+		return false, nil
+	}))
 
 	// Check the pod's current state and verify that restartCount is present.
 	ginkgo.By("checking the pod's current state and verifying that restartCount is present")
 	pod, err := podClient.Get(ctx, pod.Name, metav1.GetOptions{})
 	framework.ExpectNoError(err, fmt.Sprintf("getting pod %s in namespace %s", pod.Name, ns))
-	initialRestartCount := podutil.GetExistingContainerStatus(pod.Status.ContainerStatuses, containerName).RestartCount
+	initialRestartCount := podutil.GetExistingContainerStatus(append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...), containerName).RestartCount
 	framework.Logf("Initial restart count of pod %s is %d", pod.Name, initialRestartCount)
 
 	// Wait for the restart state to be as desired.
@@ -981,7 +1753,7 @@ func RunLivenessTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, e
 		pod, err = podClient.Get(ctx, pod.Name, metav1.GetOptions{})
 		framework.Logf("Get pod %s in namespace %s", pod.Name, ns)
 		framework.ExpectNoError(err, fmt.Sprintf("getting pod %s", pod.Name))
-		restartCount := podutil.GetExistingContainerStatus(pod.Status.ContainerStatuses, containerName).RestartCount
+		restartCount := podutil.GetExistingContainerStatus(append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...), containerName).RestartCount
 		if restartCount != lastRestartCount {
 			framework.Logf("Restart count of pod %s/%s is now %d (%v elapsed)",
 				ns, pod.Name, restartCount, time.Since(start))
@@ -1007,7 +1779,7 @@ func RunLivenessTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, e
 	}
 }
 
-func runReadinessFailTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, notReadyUntil time.Duration) {
+func runReadinessFailTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, notReadyUntil time.Duration, waitForNotPending bool) {
 	podClient := e2epod.NewPodClient(f)
 	ns := f.Namespace.Name
 	gomega.Expect(pod.Spec.Containers).NotTo(gomega.BeEmpty())
@@ -1020,11 +1792,13 @@ func runReadinessFailTest(ctx context.Context, f *framework.Framework, pod *v1.P
 	ginkgo.By(fmt.Sprintf("Creating pod %s in namespace %s", pod.Name, ns))
 	podClient.Create(ctx, pod)
 
-	// Wait until the pod is not pending. (Here we need to check for something other than
-	// 'Pending', since when failures occur, we go to 'Terminated' which can cause indefinite blocking.)
-	framework.ExpectNoError(e2epod.WaitForPodNotPending(ctx, f.ClientSet, ns, pod.Name),
-		fmt.Sprintf("starting pod %s in namespace %s", pod.Name, ns))
-	framework.Logf("Started pod %s in namespace %s", pod.Name, ns)
+	if waitForNotPending {
+		// Wait until the pod is not pending. (Here we need to check for something other than
+		// 'Pending', since when failures occur, we go to 'Terminated' which can cause indefinite blocking.)
+		framework.ExpectNoError(e2epod.WaitForPodNotPending(ctx, f.ClientSet, ns, pod.Name),
+			fmt.Sprintf("starting pod %s in namespace %s", pod.Name, ns))
+		framework.Logf("Started pod %s in namespace %s", pod.Name, ns)
+	}
 
 	// Wait for the not ready state to be true for notReadyUntil duration
 	deadline := time.Now().Add(notReadyUntil)
@@ -1059,3 +1833,159 @@ func gRPCServerPodSpec(readinessProbe, livenessProbe *v1.Probe, containerName st
 		},
 	}
 }
+
+func testWebServerSidecarPodSpec(readinessProbe, livenessProbe *v1.Probe, containerName string, port int) *v1.Pod {
+	return &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: "test-webserver-sidecar-" + string(uuid.NewUUID())},
+		Spec: v1.PodSpec{
+			InitContainers: []v1.Container{
+				{
+					Name:           containerName,
+					Image:          imageutils.GetE2EImage(imageutils.Agnhost),
+					Args:           []string{"test-webserver", "--port", fmt.Sprintf("%d", port)},
+					Ports:          []v1.ContainerPort{{ContainerPort: int32(port)}},
+					LivenessProbe:  livenessProbe,
+					ReadinessProbe: readinessProbe,
+					RestartPolicy: func() *v1.ContainerRestartPolicy {
+						restartPolicy := v1.ContainerRestartPolicyAlways
+						return &restartPolicy
+					}(),
+				},
+			},
+			Containers: []v1.Container{
+				{
+					Name:  "main",
+					Image: imageutils.GetE2EImage(imageutils.Agnhost),
+					Args:  []string{"pause"},
+				},
+			},
+		},
+	}
+}
+
+func busyBoxSidecarPodSpec(readinessProbe, livenessProbe *v1.Probe, cmd []string) *v1.Pod {
+	return &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:   "busybox-sidecar-" + string(uuid.NewUUID()),
+			Labels: map[string]string{"test": "liveness"},
+		},
+		Spec: v1.PodSpec{
+			InitContainers: []v1.Container{
+				{
+					Name:           "busybox",
+					Image:          imageutils.GetE2EImage(imageutils.BusyBox),
+					Command:        cmd,
+					LivenessProbe:  livenessProbe,
+					ReadinessProbe: readinessProbe,
+					RestartPolicy: func() *v1.ContainerRestartPolicy {
+						restartPolicy := v1.ContainerRestartPolicyAlways
+						return &restartPolicy
+					}(),
+				},
+			},
+			Containers: []v1.Container{
+				{
+					Name:  "main",
+					Image: imageutils.GetE2EImage(imageutils.Agnhost),
+					Args:  []string{"pause"},
+				},
+			},
+		},
+	}
+}
+
+func livenessSidecarPodSpec(namespace string, readinessProbe, livenessProbe *v1.Probe) *v1.Pod {
+	return &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "test-liveness-sidecar-" + string(uuid.NewUUID()),
+			Labels:    map[string]string{"test": "liveness"},
+			Namespace: namespace,
+		},
+		Spec: v1.PodSpec{
+			InitContainers: []v1.Container{
+				{
+					Name:           "sidecar",
+					Image:          imageutils.GetE2EImage(imageutils.Agnhost),
+					Args:           []string{"liveness"},
+					LivenessProbe:  livenessProbe,
+					ReadinessProbe: readinessProbe,
+					RestartPolicy: func() *v1.ContainerRestartPolicy {
+						restartPolicy := v1.ContainerRestartPolicyAlways
+						return &restartPolicy
+					}(),
+				},
+			},
+			Containers: []v1.Container{
+				{
+					Name:  "main",
+					Image: imageutils.GetE2EImage(imageutils.Agnhost),
+					Args:  []string{"pause"},
+				},
+			},
+		},
+	}
+}
+
+func startupSidecarPodSpec(startupProbe, readinessProbe, livenessProbe *v1.Probe, cmd []string) *v1.Pod {
+	return &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:   "startup-sidecar-" + string(uuid.NewUUID()),
+			Labels: map[string]string{"test": "startup"},
+		},
+		Spec: v1.PodSpec{
+			InitContainers: []v1.Container{
+				{
+					Name:           "sidecar",
+					Image:          imageutils.GetE2EImage(imageutils.BusyBox),
+					Command:        cmd,
+					LivenessProbe:  livenessProbe,
+					ReadinessProbe: readinessProbe,
+					StartupProbe:   startupProbe,
+					RestartPolicy: func() *v1.ContainerRestartPolicy {
+						restartPolicy := v1.ContainerRestartPolicyAlways
+						return &restartPolicy
+					}(),
+				},
+			},
+			Containers: []v1.Container{
+				{
+					Name:  "main",
+					Image: imageutils.GetE2EImage(imageutils.Agnhost),
+					Args:  []string{"pause"},
+				},
+			},
+		},
+	}
+}
+
+func gRPCServerSidecarPodSpec(readinessProbe, livenessProbe *v1.Probe, containerName string) *v1.Pod {
+	return &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: "test-grpc-sidecar-" + string(uuid.NewUUID())},
+		Spec: v1.PodSpec{
+			InitContainers: []v1.Container{
+				{
+					Name:  containerName,
+					Image: imageutils.GetE2EImage(imageutils.Agnhost),
+					Command: []string{
+						"/agnhost",
+						"grpc-health-checking",
+					},
+					Ports:          []v1.ContainerPort{{ContainerPort: int32(5000)}, {ContainerPort: int32(8080)}},
+					LivenessProbe:  livenessProbe,
+					ReadinessProbe: readinessProbe,
+					RestartPolicy: func() *v1.ContainerRestartPolicy {
+						restartPolicy := v1.ContainerRestartPolicyAlways
+						return &restartPolicy
+					}(),
+				},
+			},
+			Containers: []v1.Container{
+				{
+					Name:  "main",
+					Image: imageutils.GetE2EImage(imageutils.Agnhost),
+					Args:  []string{"pause"},
+				},
+			},
+		},
+	}
+}