From f17e9117776ec8a3ef98e709a4eb586780328b3a Mon Sep 17 00:00:00 2001
From: Gunju Kim
Date: Thu, 13 Jul 2023 21:38:06 +0900
Subject: [PATCH] Add restartable init container probe e2e tests

---
 test/e2e/common/node/container_probe.go | 970 +++++++++++++++++++++++-
 1 file changed, 950 insertions(+), 20 deletions(-)

diff --git a/test/e2e/common/node/container_probe.go b/test/e2e/common/node/container_probe.go
index b8e29995b44..d5d714a5cd6 100644
--- a/test/e2e/common/node/container_probe.go
+++ b/test/e2e/common/node/container_probe.go
@@ -253,7 +253,7 @@ var _ = SIGDescribe("Probing container", func() {
 			FailureThreshold:    1,
 		}
 		pod := busyBoxPodSpec(readinessProbe, nil, cmd)
-		runReadinessFailTest(ctx, f, pod, time.Minute)
+		runReadinessFailTest(ctx, f, pod, time.Minute, true)
 	})
 
 	/*
@@ -727,6 +727,762 @@ done
 	})
 })
 
+var _ = SIGDescribe("[NodeAlphaFeature:SidecarContainers][Feature:SidecarContainers] Probing restartable init container", func() {
+	f := framework.NewDefaultFramework("container-probe")
+	f.NamespacePodSecurityLevel = admissionapi.LevelBaseline
+	var podClient *e2epod.PodClient
+	probe := webserverProbeBuilder{}
+
+	ginkgo.BeforeEach(func() {
+		podClient = e2epod.NewPodClient(f)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container readiness probe, with initial delay
+		Description: Create a Pod that is configured with an initial delay set on
+		the readiness probe. Check the Pod Start time to compare to the initial
+		delay. The Pod MUST be ready only after the specified initial delay.
+	*/
+	ginkgo.It("with readiness probe should not be ready before initial delay and never restart", func(ctx context.Context) {
+		containerName := "test-webserver"
+		p := podClient.Create(ctx, testWebServerSidecarPodSpec(probe.withInitialDelay().build(), nil, containerName, 80))
+		framework.ExpectNoError(e2epod.WaitTimeoutForPodReadyInNamespace(ctx, f.ClientSet, p.Name, f.Namespace.Name, framework.PodStartTimeout))
+
+		p, err := podClient.Get(ctx, p.Name, metav1.GetOptions{})
+		framework.ExpectNoError(err)
+		isReady, err := testutils.PodRunningReady(p)
+		framework.ExpectNoError(err)
+		if !isReady {
+			framework.Failf("pod %s/%s should be ready", f.Namespace.Name, p.Name)
+		}
+
+		// We assume the pod became ready when the container became ready. This
+		// is true for a single container pod.
+		readyTime, err := GetTransitionTimeForReadyCondition(p)
+		framework.ExpectNoError(err)
+		startedTime, err := GetContainerStartedTime(p, containerName)
+		framework.ExpectNoError(err)
+
+		framework.Logf("Container started at %v, pod became ready at %v", startedTime, readyTime)
+		initialDelay := probeTestInitialDelaySeconds * time.Second
+		if readyTime.Sub(startedTime) < initialDelay {
+			framework.Failf("Pod became ready before its %v initial delay", initialDelay)
+		}
+
+		restartCount := getRestartCount(p)
+		framework.ExpectEqual(restartCount, 0, "pod should have a restart count of 0 but got %v", restartCount)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container readiness probe, failure
+		Description: Create a Pod with a readiness probe that fails consistently.
+		When this Pod is created, then the Pod MUST never be ready, never be
+		running, and the restart count MUST be zero.
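+		Note: the restartable init container under test is an init container with
+		restartPolicy: Always (see the sidecar pod specs at the bottom of this
+		file), so its readiness feeds into the Pod's Ready condition just like a
+		regular container's readiness does.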
+	*/
+	ginkgo.It("with readiness probe that fails should never be ready and never restart", func(ctx context.Context) {
+		p := podClient.Create(ctx, testWebServerSidecarPodSpec(probe.withFailing().build(), nil, "test-webserver", 80))
+		gomega.Consistently(ctx, func() (bool, error) {
+			p, err := podClient.Get(ctx, p.Name, metav1.GetOptions{})
+			if err != nil {
+				return false, err
+			}
+			return podutil.IsPodReady(p), nil
+		}, 1*time.Minute, 1*time.Second).ShouldNot(gomega.BeTrue(), "pod should not be ready")
+
+		p, err := podClient.Get(ctx, p.Name, metav1.GetOptions{})
+		framework.ExpectNoError(err)
+
+		isReady, _ := testutils.PodRunningReady(p)
+		if isReady {
+			framework.Failf("pod %s/%s should be not ready", f.Namespace.Name, p.Name)
+		}
+
+		restartCount := getRestartCount(p)
+		framework.ExpectEqual(restartCount, 0, "pod should have a restart count of 0 but got %v", restartCount)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using local file, restart
+		Description: Create a Pod with liveness probe that uses ExecAction handler
+		to cat /tmp/health file. The Container deletes the file /tmp/health after
+		10 seconds, triggering liveness probe to fail. The Pod MUST now be killed
+		and restarted incrementing restart count to 1.
+	*/
+	ginkgo.It("should be restarted with a exec \"cat /tmp/health\" liveness probe", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "echo ok >/tmp/health; sleep 10; rm -rf /tmp/health; sleep 600"}
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        execHandler([]string{"cat", "/tmp/health"}),
+			InitialDelaySeconds: 15,
+			TimeoutSeconds:      5, // default 1s can be pretty aggressive in CI environments with low resources
+			FailureThreshold:    1,
+		}
+		pod := busyBoxSidecarPodSpec(nil, livenessProbe, cmd)
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using local file, no restart
+		Description: Pod is created with liveness probe that uses 'exec' command
+		to cat /tmp/health file. Liveness probe MUST not fail to check health and
+		the restart count should remain 0.
+	*/
+	ginkgo.It("should *not* be restarted with a exec \"cat /tmp/health\" liveness probe", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "echo ok >/tmp/health; sleep 600"}
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        execHandler([]string{"cat", "/tmp/health"}),
+			InitialDelaySeconds: 15,
+			TimeoutSeconds:      5, // default 1s can be pretty aggressive in CI environments with low resources
+			FailureThreshold:    1,
+		}
+		pod := busyBoxSidecarPodSpec(nil, livenessProbe, cmd)
+		RunSidecarLivenessTest(ctx, f, pod, 0, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using http endpoint, restart
+		Description: A Pod is created with liveness probe on http endpoint
+		/healthz. The http handler on the /healthz will return a http error after
+		10 seconds since the Pod is started. This MUST result in liveness check
+		failure. The Pod MUST now be killed and restarted incrementing restart
+		count to 1.
+	*/
+	ginkgo.It("should be restarted with a /healthz http liveness probe", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        httpGetHandler("/healthz", 8080),
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		pod := livenessSidecarPodSpec(f.Namespace.Name, nil, livenessProbe)
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using tcp socket, no restart
+		Description: A Pod is created with liveness probe on tcp socket 8080. The
+		http handler on port 8080 will return http errors after 10 seconds, but the
+		socket will remain open. Liveness probe MUST not fail to check health and
+		the restart count should remain 0.
+	*/
+	ginkgo.It("should *not* be restarted with a tcp:8080 liveness probe", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        tcpSocketHandler(8080),
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		pod := livenessSidecarPodSpec(f.Namespace.Name, nil, livenessProbe)
+		RunSidecarLivenessTest(ctx, f, pod, 0, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using http endpoint, multiple restarts (slow)
+		Description: A Pod is created with liveness probe on http endpoint
+		/healthz. The http handler on the /healthz will return a http error after
+		10 seconds since the Pod is started. This MUST result in liveness check
+		failure. The Pod MUST now be killed and restarted incrementing restart
+		count to 1. The liveness probe must fail again after restart once the http
+		handler for the /healthz endpoint on the Pod returns an http error after 10
+		seconds from the start. Restart counts MUST increment every time the health
+		check fails, measured up to 5 restarts.
+	*/
+	ginkgo.It("should have monotonically increasing restart count", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        httpGetHandler("/healthz", 8080),
+			InitialDelaySeconds: 5,
+			FailureThreshold:    1,
+		}
+		pod := livenessSidecarPodSpec(f.Namespace.Name, nil, livenessProbe)
+		// ~2 minutes backoff timeouts + 4 minutes defaultObservationTimeout + 2 minutes for each pod restart
+		RunSidecarLivenessTest(ctx, f, pod, 5, 2*time.Minute+defaultObservationTimeout+4*2*time.Minute)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using http endpoint, no restart
+		Description: A Pod is created with liveness probe on http endpoint '/'.
+		Liveness probe on this endpoint will not fail. When liveness probe does not
+		fail then the restart count MUST remain zero.
+	*/
+	ginkgo.It("should *not* be restarted with a /healthz http liveness probe", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        httpGetHandler("/", 80),
+			InitialDelaySeconds: 15,
+			TimeoutSeconds:      5,
+			FailureThreshold:    5, // to accommodate nodes which are slow in bringing up containers.
+		}
+		pod := testWebServerSidecarPodSpec(nil, livenessProbe, "test-webserver", 80)
+		RunSidecarLivenessTest(ctx, f, pod, 0, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, container exec timeout, restart
+		Description: A Pod is created with liveness probe with an Exec action on the
+		Pod. If the liveness probe call does not return within the timeout
+		specified, liveness probe MUST restart the Pod.
+	*/
+	ginkgo.It("should be restarted with an exec liveness probe with timeout [MinimumKubeletVersion:1.20]", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 600"}
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        execHandler([]string{"/bin/sh", "-c", "sleep 10"}),
+			InitialDelaySeconds: 15,
+			TimeoutSeconds:      1,
+			FailureThreshold:    1,
+		}
+		pod := busyBoxSidecarPodSpec(nil, livenessProbe, cmd)
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container readiness probe, container exec timeout, not ready
+		Description: A Pod is created with readiness probe with an Exec action on
+		the Pod. If the readiness probe call does not return within the timeout
+		specified, the Pod MUST not become Ready.
+	*/
+	ginkgo.It("should not be ready with an exec readiness probe timeout [MinimumKubeletVersion:1.20]", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 600"}
+		readinessProbe := &v1.Probe{
+			ProbeHandler:        execHandler([]string{"/bin/sh", "-c", "sleep 10"}),
+			InitialDelaySeconds: 15,
+			TimeoutSeconds:      1,
+			FailureThreshold:    1,
+		}
+		pod := busyBoxSidecarPodSpec(readinessProbe, nil, cmd)
+		runReadinessFailTest(ctx, f, pod, time.Minute, false)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, container exec timeout, restart
+		Description: A Pod is created with liveness probe with an Exec action on the
+		Pod. If the liveness probe call does not return within the timeout
+		specified, liveness probe MUST restart the Pod. When ExecProbeTimeout
+		feature gate is disabled and cluster is using dockershim, the timeout is
+		ignored BUT a failing liveness probe MUST restart the Pod.
+	*/
+	ginkgo.It("should be restarted with a failing exec liveness probe that took longer than the timeout", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 600"}
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        execHandler([]string{"/bin/sh", "-c", "sleep 10 & exit 1"}),
+			InitialDelaySeconds: 15,
+			TimeoutSeconds:      1,
+			FailureThreshold:    1,
+		}
+		pod := busyBoxSidecarPodSpec(nil, livenessProbe, cmd)
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container http liveness probe, redirected to a local address
+		Description: A Pod is created with liveness probe on http endpoint
+		/redirect?loc=healthz. The http handler on the /redirect will redirect to
+		the /healthz endpoint, which will return a http error after 10 seconds
+		since the Pod is started. This MUST result in liveness check failure. The
+		Pod MUST now be killed and restarted incrementing restart count to 1.
+	*/
+	ginkgo.It("should be restarted with a local redirect http liveness probe", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        httpGetHandler("/redirect?loc="+url.QueryEscape("/healthz"), 8080),
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		pod := livenessSidecarPodSpec(f.Namespace.Name, nil, livenessProbe)
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container http liveness probe, redirected to a non-local address
+		Description: A Pod is created with liveness probe on http endpoint
+		/redirect with a redirect to http://0.0.0.0/. The liveness probe MUST NOT
+		follow the non-local redirect; instead it MUST treat the redirect as a
+		success and generate an event.
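+		Note: the ProbeWarning event asserted below is how the kubelet reports
+		that it treated the non-local redirect as a probe success rather than
+		following it.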
+	*/
+	ginkgo.It("should *not* be restarted with a non-local redirect http liveness probe", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler:        httpGetHandler("/redirect?loc="+url.QueryEscape("http://0.0.0.0/"), 8080),
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		pod := livenessSidecarPodSpec(f.Namespace.Name, nil, livenessProbe)
+		RunSidecarLivenessTest(ctx, f, pod, 0, defaultObservationTimeout)
+		// Expect an event of type "ProbeWarning".
+		expectedEvent := fields.Set{
+			"involvedObject.kind":      "Pod",
+			"involvedObject.name":      pod.Name,
+			"involvedObject.namespace": f.Namespace.Name,
+			"reason":                   events.ContainerProbeWarning,
+		}.AsSelector().String()
+		framework.ExpectNoError(e2eevents.WaitTimeoutForEvent(
+			ctx, f.ClientSet, f.Namespace.Name, expectedEvent, "Probe terminated redirects, Response body: Found.", framework.PodEventTimeout))
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container startup probe restart
+		Description: A Pod is created with a failing startup probe. The Pod MUST be
+		killed and restarted incrementing restart count to 1, even if liveness
+		would succeed.
+	*/
+	ginkgo.It("should be restarted startup probe fails", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 600"}
+		livenessProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				Exec: &v1.ExecAction{
+					Command: []string{"/bin/true"},
+				},
+			},
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		startupProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				Exec: &v1.ExecAction{
+					Command: []string{"/bin/false"},
+				},
+			},
+			InitialDelaySeconds: 15,
+			FailureThreshold:    3,
+		}
+		pod := startupSidecarPodSpec(startupProbe, nil, livenessProbe, cmd)
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe delayed (long) by startup probe
+		Description: A Pod is created with failing liveness and startup probes.
+		Liveness probe MUST NOT fail until startup probe expires.
+	*/
+	ginkgo.It("should *not* be restarted by liveness probe because startup probe delays it", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 600"}
+		livenessProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				Exec: &v1.ExecAction{
+					Command: []string{"/bin/false"},
+				},
+			},
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		startupProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				Exec: &v1.ExecAction{
+					Command: []string{"/bin/false"},
+				},
+			},
+			InitialDelaySeconds: 15,
+			FailureThreshold:    60,
+		}
+		pod := startupSidecarPodSpec(startupProbe, nil, livenessProbe, cmd)
+		RunSidecarLivenessTest(ctx, f, pod, 0, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe fails after startup success
+		Description: A Pod is created with failing liveness probe and delayed
+		startup probe that uses 'exec' command to cat /tmp/startup file. The
+		Container is started by creating /tmp/startup after 10 seconds, triggering
+		liveness probe to fail. The Pod MUST now be killed and restarted,
+		incrementing restart count to 1.
+	*/
+	ginkgo.It("should be restarted by liveness probe after startup probe enables it", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 10; echo ok >/tmp/startup; sleep 600"}
+		livenessProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				Exec: &v1.ExecAction{
+					Command: []string{"/bin/false"},
+				},
+			},
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		startupProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				Exec: &v1.ExecAction{
+					Command: []string{"cat", "/tmp/startup"},
+				},
+			},
+			InitialDelaySeconds: 15,
+			FailureThreshold:    60,
+		}
+		pod := startupSidecarPodSpec(startupProbe, nil, livenessProbe, cmd)
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container readiness probe, delayed by startup probe
+		Description: A Pod is created with startup and readiness probes. The
+		Container is started by creating /tmp/startup after 32 seconds, delaying
+		the ready state by this amount of time. This is similar to the "Pod
+		readiness probe, with initial delay" test.
+	*/
+	ginkgo.It("should be ready immediately after startupProbe succeeds", func(ctx context.Context) {
+		// Probe workers sleep at Kubelet start for a random time which is at most PeriodSeconds;
+		// this test requires both readiness and startup workers running before updating statuses.
+		// To avoid flakes, ensure sleep before startup (32s) > readinessProbe.PeriodSeconds
+		cmd := []string{"/bin/sh", "-c", "echo ok >/tmp/health; sleep 32; echo ok >/tmp/startup; sleep 600"}
+		readinessProbe := &v1.Probe{
+			ProbeHandler:        execHandler([]string{"/bin/cat", "/tmp/health"}),
+			InitialDelaySeconds: 0,
+			PeriodSeconds:       30,
+		}
+		startupProbe := &v1.Probe{
+			ProbeHandler:        execHandler([]string{"/bin/cat", "/tmp/startup"}),
+			InitialDelaySeconds: 0,
+			FailureThreshold:    120,
+			PeriodSeconds:       5,
+		}
+		p := podClient.Create(ctx, startupSidecarPodSpec(startupProbe, readinessProbe, nil, cmd))
+
+		p, err := podClient.Get(ctx, p.Name, metav1.GetOptions{})
+		framework.ExpectNoError(err)
+
+		err = e2epod.WaitForPodContainerStarted(ctx, f.ClientSet, f.Namespace.Name, p.Name, 0, framework.PodStartTimeout)
+		framework.ExpectNoError(err)
+		startedTime := time.Now()
+
+		// We assume the pod became ready when the container became ready. This
+		// is true for a single container pod.
+		err = e2epod.WaitTimeoutForPodReadyInNamespace(ctx, f.ClientSet, p.Name, f.Namespace.Name, framework.PodStartTimeout)
+		framework.ExpectNoError(err)
+		readyTime := time.Now()
+
+		p, err = podClient.Get(ctx, p.Name, metav1.GetOptions{})
+		framework.ExpectNoError(err)
+
+		isReady, err := testutils.PodRunningReady(p)
+		framework.ExpectNoError(err)
+		if !isReady {
+			framework.Failf("pod %s/%s should be ready", f.Namespace.Name, p.Name)
+		}
+
+		readyIn := readyTime.Sub(startedTime)
+		framework.Logf("Container started at %v, pod became ready at %v, %v after startupProbe succeeded", startedTime, readyTime, readyIn)
+		if readyIn < 0 {
+			framework.Failf("Pod became ready before startupProbe succeeded")
+		}
+		if readyIn > 25*time.Second {
+			framework.Failf("Pod became ready in %v, more than 25s after startupProbe succeeded. It means that the delayed readiness probes were not initiated immediately after startup finished.", readyIn)
+		}
+	})
+
+	// TODO: Update tests after implementing termination ordering of restartable
+	// init containers
+	/*
+		Release: v1.28
+		Testname: Set terminationGracePeriodSeconds for livenessProbe of restartable init container
+		Description: A pod with a long terminationGracePeriod is created with a
+		shorter livenessProbe-level terminationGracePeriodSeconds. We confirm the
+		shorter termination period is used.
+	*/
+	ginkgo.It("should override timeoutGracePeriodSeconds when LivenessProbe field is set", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 1000"}
+		// probe will fail since pod has no http endpoints
+		shortGracePeriod := int64(5)
+		livenessProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				HTTPGet: &v1.HTTPGetAction{
+					Path: "/healthz",
+					Port: intstr.FromInt(8080),
+				},
+			},
+			InitialDelaySeconds:           10,
+			FailureThreshold:              1,
+			TerminationGracePeriodSeconds: &shortGracePeriod,
+		}
+		pod := busyBoxSidecarPodSpec(nil, livenessProbe, cmd)
+		longGracePeriod := int64(500)
+		pod.Spec.TerminationGracePeriodSeconds = &longGracePeriod
+
+		// 10s delay + 10s period + 5s grace period = 25s < 30s << pod-level timeout 500
+		// add defaultObservationTimeout(4min) more for kubelet syncing information
+		// to apiserver
+		RunSidecarLivenessTest(ctx, f, pod, 1, time.Second*40+defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Set terminationGracePeriodSeconds for startupProbe of restartable init container
+		Description: A pod with a long terminationGracePeriod is created with a
+		shorter startupProbe-level terminationGracePeriodSeconds. We confirm the
+		shorter termination period is used.
+	*/
+	ginkgo.It("should override timeoutGracePeriodSeconds when StartupProbe field is set", func(ctx context.Context) {
+		cmd := []string{"/bin/sh", "-c", "sleep 1000"}
+		// startup probe will fail since pod will sleep for 1000s before becoming ready
+		livenessProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				Exec: &v1.ExecAction{
+					Command: []string{"/bin/true"},
+				},
+			},
+			InitialDelaySeconds: 15,
+			FailureThreshold:    1,
+		}
+		pod := busyBoxSidecarPodSpec(nil, livenessProbe, cmd)
+		longGracePeriod := int64(500)
+		pod.Spec.TerminationGracePeriodSeconds = &longGracePeriod
+
+		shortGracePeriod := int64(5)
+		pod.Spec.InitContainers[0].StartupProbe = &v1.Probe{
+			ProbeHandler:                  execHandler([]string{"/bin/cat", "/tmp/startup"}),
+			InitialDelaySeconds:           10,
+			FailureThreshold:              1,
+			TerminationGracePeriodSeconds: &shortGracePeriod,
+		}
+
+		// 10s delay + 10s period + 5s grace period = 25s < 30s << pod-level timeout 500
+		// add defaultObservationTimeout(4min) more for kubelet syncing information
+		// to apiserver
+		RunSidecarLivenessTest(ctx, f, pod, 1, time.Second*40+defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using grpc call, success
+		Description: A Pod is created with liveness probe on grpc service. Liveness
+		probe on this endpoint will not fail. When liveness probe does not fail
+		then the restart count MUST remain zero.
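+		Note: the gRPC server started by gRPCServerSidecarPodSpec exposes port
+		5000 (see the spec at the bottom of this file), matching the probe port
+		used here.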
+	*/
+	ginkgo.It("should *not* be restarted with a GRPC liveness probe", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				GRPC: &v1.GRPCAction{
+					Port:    5000,
+					Service: nil,
+				},
+			},
+			InitialDelaySeconds: probeTestInitialDelaySeconds,
+			TimeoutSeconds:      5, // default 1s can be pretty aggressive in CI environments with low resources
+			FailureThreshold:    1,
+		}
+
+		pod := gRPCServerSidecarPodSpec(nil, livenessProbe, "agnhost")
+		RunSidecarLivenessTest(ctx, f, pod, 0, defaultObservationTimeout)
+	})
+
+	/*
+		Release: v1.28
+		Testname: Pod restartable init container liveness probe, using grpc call, failure
+		Description: A Pod is created with liveness probe on grpc service.
+		Liveness probe on this endpoint should fail because of wrong probe port.
+		When the liveness probe does fail, the restart count MUST increase by 1.
+	*/
+	ginkgo.It("should be restarted with a GRPC liveness probe", func(ctx context.Context) {
+		livenessProbe := &v1.Probe{
+			ProbeHandler: v1.ProbeHandler{
+				GRPC: &v1.GRPCAction{
+					Port: 2333, // this port is wrong
+				},
+			},
+			InitialDelaySeconds: probeTestInitialDelaySeconds * 4,
+			TimeoutSeconds:      5, // default 1s can be pretty aggressive in CI environments with low resources
+			FailureThreshold:    1,
+		}
+		pod := gRPCServerSidecarPodSpec(nil, livenessProbe, "agnhost")
+		RunSidecarLivenessTest(ctx, f, pod, 1, defaultObservationTimeout)
+	})
+
+	ginkgo.It("should mark readiness on pods to false while pod is in progress of terminating when a pod has a readiness probe", func(ctx context.Context) {
+		podName := "probe-test-" + string(uuid.NewUUID())
+		podClient := e2epod.NewPodClient(f)
+		terminationGracePeriod := int64(30)
+		script := `
+_term() {
+	rm -f /tmp/ready
+	sleep 30
+	exit 0
+}
+trap _term SIGTERM
+
+touch /tmp/ready
+
+while true; do
+	echo \"hello\"
+	sleep 10
+done
+	`
+
+		// Create Pod
+		podClient.Create(ctx, &v1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: podName,
+			},
+			Spec: v1.PodSpec{
+				InitContainers: []v1.Container{
+					{
+						Image:   imageutils.GetE2EImage(imageutils.Agnhost),
+						Name:    podName,
+						Command: []string{"/bin/bash"},
+						Args:    []string{"-c", script},
+						ReadinessProbe: &v1.Probe{
+							ProbeHandler: v1.ProbeHandler{
+								Exec: &v1.ExecAction{
+									Command: []string{"cat", "/tmp/ready"},
+								},
+							},
+							FailureThreshold:    1,
+							InitialDelaySeconds: 5,
+							PeriodSeconds:       2,
+						},
+						RestartPolicy: func() *v1.ContainerRestartPolicy {
+							restartPolicy := v1.ContainerRestartPolicyAlways
+							return &restartPolicy
+						}(),
+					},
+				},
+				Containers: []v1.Container{
+					{
+						Name:  "main",
+						Image: imageutils.GetE2EImage(imageutils.Agnhost),
+						Args:  []string{"pause"},
+					},
+				},
+				TerminationGracePeriodSeconds: &terminationGracePeriod,
+			},
+		})
+
+		// verify pods are running and ready
+		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart)
+		framework.ExpectNoError(err)
+
+		// Shutdown pod. Readiness should change to false
+		err = podClient.Delete(ctx, podName, metav1.DeleteOptions{})
+		framework.ExpectNoError(err)
+
+		err = waitForPodStatusByInformer(ctx, f.ClientSet, f.Namespace.Name, podName, f.Timeouts.PodDelete, func(pod *v1.Pod) (bool, error) {
+			if !podutil.IsPodReady(pod) {
+				return true, nil
+			}
+			framework.Logf("pod %s/%s is still ready, waiting until is not ready", pod.Namespace, pod.Name)
+			return false, nil
+		})
+		framework.ExpectNoError(err)
+	})
+
+	ginkgo.It("should mark readiness on pods to false and disable liveness probes while pod is in progress of terminating", func(ctx context.Context) {
+		podName := "probe-test-" + string(uuid.NewUUID())
+		podClient := e2epod.NewPodClient(f)
+		terminationGracePeriod := int64(30)
+		script := `
+_term() {
+	rm -f /tmp/ready
+	rm -f /tmp/liveness
+	sleep 20
+	exit 0
+}
+trap _term SIGTERM
+
+touch /tmp/ready
+touch /tmp/liveness
+
+while true; do
+	echo \"hello\"
+	sleep 10
+done
+`
+
+		// Create Pod
+		podClient.Create(ctx, &v1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: podName,
+			},
+			Spec: v1.PodSpec{
+				InitContainers: []v1.Container{
+					{
+						Image:   imageutils.GetE2EImage(imageutils.Agnhost),
+						Name:    podName,
+						Command: []string{"/bin/bash"},
+						Args:    []string{"-c", script},
+						ReadinessProbe: &v1.Probe{
+							ProbeHandler: v1.ProbeHandler{
+								Exec: &v1.ExecAction{
+									Command: []string{"cat", "/tmp/ready"},
+								},
+							},
+							FailureThreshold: 1,
+							// delay startup to make sure the script has
+							// time to create the ready+liveness files
+							InitialDelaySeconds: 5,
+							PeriodSeconds:       2,
+						},
+						LivenessProbe: &v1.Probe{
+							ProbeHandler: v1.ProbeHandler{
+								Exec: &v1.ExecAction{
+									Command: []string{"cat", "/tmp/liveness"},
+								},
+							},
+							FailureThreshold: 1,
+							// delay startup to make sure the script has
+							// time to create the ready+liveness files
+							InitialDelaySeconds: 5,
+							PeriodSeconds:       1,
+						},
+						RestartPolicy: func() *v1.ContainerRestartPolicy {
+							restartPolicy := v1.ContainerRestartPolicyAlways
+							return &restartPolicy
+						}(),
+					},
+				},
+				Containers: []v1.Container{
+					{
+						Name:  "main",
+						Image: imageutils.GetE2EImage(imageutils.Agnhost),
+						Args:  []string{"pause"},
+					},
+				},
+				TerminationGracePeriodSeconds: &terminationGracePeriod,
+			},
+		})
+
+		// verify pods are running and ready
+		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart)
+		framework.ExpectNoError(err)
+
+		// Shutdown pod. Readiness should change to false
+		err = podClient.Delete(ctx, podName, metav1.DeleteOptions{})
+		framework.ExpectNoError(err)
+
+		// Wait for pod to go unready
+		err = waitForPodStatusByInformer(ctx, f.ClientSet, f.Namespace.Name, podName, f.Timeouts.PodDelete, func(pod *v1.Pod) (bool, error) {
+			if !podutil.IsPodReady(pod) {
+				return true, nil
+			}
+			framework.Logf("pod %s/%s is still ready, waiting until is not ready", pod.Namespace, pod.Name)
+			return false, nil
+		})
+		framework.ExpectNoError(err)
+
+		// Verify there are zero liveness failures since they are turned off
+		// during pod termination
+		gomega.Consistently(ctx, func(ctx context.Context) (bool, error) {
+			items, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(ctx, metav1.ListOptions{})
+			framework.ExpectNoError(err)
+			for _, event := range items.Items {
+				// Search only for the pod we are interested in
+				if event.InvolvedObject.Name != podName {
+					continue
+				}
+				if strings.Contains(event.Message, "failed liveness probe") {
+					return true, errors.New("should not see liveness probe failures")
+				}
+			}
+			return false, nil
+		}, 1*time.Minute, framework.Poll).ShouldNot(gomega.BeTrue(), "should not see liveness probes")
+	})
+})
+
 // waitForPodStatusByInformer waits pod status change by informer
 func waitForPodStatusByInformer(ctx context.Context, c clientset.Interface, podNamespace, podName string, timeout time.Duration, condition func(pod *v1.Pod) (bool, error)) error {
 	// TODO (pohly): rewrite with gomega.Eventually to get intermediate progress reports.
@@ -793,7 +1549,7 @@ func newInformerWatchPod(ctx context.Context, c clientset.Interface, podNamespac
 
 // GetContainerStartedTime returns the time when the given container started and error if any
 func GetContainerStartedTime(p *v1.Pod, containerName string) (time.Time, error) {
-	for _, status := range p.Status.ContainerStatuses {
+	for _, status := range append(p.Status.InitContainerStatuses, p.Status.ContainerStatuses...) {
 		if status.Name != containerName {
 			continue
 		}
@@ -817,7 +1573,7 @@ func GetTransitionTimeForReadyCondition(p *v1.Pod) (time.Time, error) {
 
 func getRestartCount(p *v1.Pod) int {
 	count := 0
-	for _, containerStatus := range p.Status.ContainerStatuses {
+	for _, containerStatus := range append(p.Status.InitContainerStatuses, p.Status.ContainerStatuses...) {
 		count += int(containerStatus.RestartCount)
 	}
 	return count
@@ -943,12 +1699,22 @@ func (b webserverProbeBuilder) build() *v1.Probe {
 	return probe
 }
 
-// RunLivenessTest verifies the number of restarts for pod with given expected number of restarts
 func RunLivenessTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, expectNumRestarts int, timeout time.Duration) {
-	podClient := e2epod.NewPodClient(f)
-	ns := f.Namespace.Name
 	gomega.Expect(pod.Spec.Containers).NotTo(gomega.BeEmpty())
 	containerName := pod.Spec.Containers[0].Name
+	runLivenessTest(ctx, f, pod, expectNumRestarts, timeout, containerName)
+}
+
+func RunSidecarLivenessTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, expectNumRestarts int, timeout time.Duration) {
+	gomega.Expect(pod.Spec.InitContainers).NotTo(gomega.BeEmpty())
+	containerName := pod.Spec.InitContainers[0].Name
+	runLivenessTest(ctx, f, pod, expectNumRestarts, timeout, containerName)
+}
+
+// runLivenessTest verifies the number of restarts for pod with given expected number of restarts
+func runLivenessTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, expectNumRestarts int, timeout time.Duration, containerName string) {
+	podClient := e2epod.NewPodClient(f)
+	ns := f.Namespace.Name
 	// At the end of the test, clean up by removing the pod.
 	ginkgo.DeferCleanup(func(ctx context.Context) error {
 		ginkgo.By("deleting the pod")
@@ -957,18 +1723,24 @@ func RunLivenessTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, e
 	ginkgo.By(fmt.Sprintf("Creating pod %s in namespace %s", pod.Name, ns))
 	podClient.Create(ctx, pod)
 
-	// Wait until the pod is not pending. (Here we need to check for something other than
-	// 'Pending' other than checking for 'Running', since when failures occur, we go to
-	// 'Terminated' which can cause indefinite blocking.)
-	framework.ExpectNoError(e2epod.WaitForPodNotPending(ctx, f.ClientSet, ns, pod.Name),
-		fmt.Sprintf("starting pod %s in namespace %s", pod.Name, ns))
-	framework.Logf("Started pod %s in namespace %s", pod.Name, ns)
+	// To check whether the container has ever started, wait for it to be in a
+	// non-waiting state.
+	framework.ExpectNoError(e2epod.WaitForPodCondition(ctx, f.ClientSet, ns, pod.Name, "container not waiting", timeout, func(pod *v1.Pod) (bool, error) {
+		for _, c := range append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...) {
+			if c.Name == containerName {
+				if c.State.Running != nil || c.State.Terminated != nil {
+					return true, nil
+				}
+			}
+		}
+		return false, nil
+	}))
 
 	// Check the pod's current state and verify that restartCount is present.
 	ginkgo.By("checking the pod's current state and verifying that restartCount is present")
 	pod, err := podClient.Get(ctx, pod.Name, metav1.GetOptions{})
 	framework.ExpectNoError(err, fmt.Sprintf("getting pod %s in namespace %s", pod.Name, ns))
-	initialRestartCount := podutil.GetExistingContainerStatus(pod.Status.ContainerStatuses, containerName).RestartCount
+	initialRestartCount := podutil.GetExistingContainerStatus(append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...), containerName).RestartCount
 	framework.Logf("Initial restart count of pod %s is %d", pod.Name, initialRestartCount)
 
 	// Wait for the restart state to be as desired.
@@ -981,7 +1753,7 @@ func RunLivenessTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, e
 		pod, err = podClient.Get(ctx, pod.Name, metav1.GetOptions{})
 		framework.Logf("Get pod %s in namespace %s", pod.Name, ns)
 		framework.ExpectNoError(err, fmt.Sprintf("getting pod %s", pod.Name))
-		restartCount := podutil.GetExistingContainerStatus(pod.Status.ContainerStatuses, containerName).RestartCount
+		restartCount := podutil.GetExistingContainerStatus(append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...), containerName).RestartCount
 		if restartCount != lastRestartCount {
 			framework.Logf("Restart count of pod %s/%s is now %d (%v elapsed)",
 				ns, pod.Name, restartCount, time.Since(start))
@@ -1007,7 +1779,7 @@ func RunLivenessTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, e
 	}
 }
 
-func runReadinessFailTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, notReadyUntil time.Duration) {
+func runReadinessFailTest(ctx context.Context, f *framework.Framework, pod *v1.Pod, notReadyUntil time.Duration, waitForNotPending bool) {
 	podClient := e2epod.NewPodClient(f)
 	ns := f.Namespace.Name
 	gomega.Expect(pod.Spec.Containers).NotTo(gomega.BeEmpty())
@@ -1020,11 +1792,13 @@ func runReadinessFailTest(ctx context.Context, f *framework.Framework, pod *v1.P
 	ginkgo.By(fmt.Sprintf("Creating pod %s in namespace %s", pod.Name, ns))
 	podClient.Create(ctx, pod)
 
-	// Wait until the pod is not pending. (Here we need to check for something other than
-	// 'Pending', since when failures occur, we go to 'Terminated' which can cause indefinite blocking.)
-	framework.ExpectNoError(e2epod.WaitForPodNotPending(ctx, f.ClientSet, ns, pod.Name),
-		fmt.Sprintf("starting pod %s in namespace %s", pod.Name, ns))
-	framework.Logf("Started pod %s in namespace %s", pod.Name, ns)
+	if waitForNotPending {
+		// Wait until the pod is not pending. (Here we need to check for something other than
+		// 'Pending', since when failures occur, we go to 'Terminated' which can cause indefinite blocking.)
+		framework.ExpectNoError(e2epod.WaitForPodNotPending(ctx, f.ClientSet, ns, pod.Name),
+			fmt.Sprintf("starting pod %s in namespace %s", pod.Name, ns))
+		framework.Logf("Started pod %s in namespace %s", pod.Name, ns)
+	}
 
 	// Wait for the not ready state to be true for notReadyUntil duration
 	deadline := time.Now().Add(notReadyUntil)
@@ -1059,3 +1833,159 @@ func gRPCServerPodSpec(readinessProbe, livenessProbe *v1.Probe, containerName st
 		},
 	}
 }
+
+func testWebServerSidecarPodSpec(readinessProbe, livenessProbe *v1.Probe, containerName string, port int) *v1.Pod {
+	return &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: "test-webserver-sidecar-" + string(uuid.NewUUID())},
+		Spec: v1.PodSpec{
+			InitContainers: []v1.Container{
+				{
+					Name:           containerName,
+					Image:          imageutils.GetE2EImage(imageutils.Agnhost),
+					Args:           []string{"test-webserver", "--port", fmt.Sprintf("%d", port)},
+					Ports:          []v1.ContainerPort{{ContainerPort: int32(port)}},
+					LivenessProbe:  livenessProbe,
+					ReadinessProbe: readinessProbe,
+					RestartPolicy: func() *v1.ContainerRestartPolicy {
+						restartPolicy := v1.ContainerRestartPolicyAlways
+						return &restartPolicy
+					}(),
+				},
+			},
+			Containers: []v1.Container{
+				{
+					Name:  "main",
+					Image: imageutils.GetE2EImage(imageutils.Agnhost),
+					Args:  []string{"pause"},
+				},
+			},
+		},
+	}
+}
+
+func busyBoxSidecarPodSpec(readinessProbe, livenessProbe *v1.Probe, cmd []string) *v1.Pod {
+	return &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:   "busybox-sidecar-" + string(uuid.NewUUID()),
+			Labels: map[string]string{"test": "liveness"},
+		},
+		Spec: v1.PodSpec{
+			InitContainers: []v1.Container{
+				{
+					Name:           "busybox",
+					Image:          imageutils.GetE2EImage(imageutils.BusyBox),
+					Command:        cmd,
+					LivenessProbe:  livenessProbe,
+					ReadinessProbe: readinessProbe,
+					RestartPolicy: func() *v1.ContainerRestartPolicy {
+						restartPolicy := v1.ContainerRestartPolicyAlways
+						return &restartPolicy
+					}(),
+				},
+			},
+			Containers: []v1.Container{
+				{
+					Name:  "main",
+					Image: imageutils.GetE2EImage(imageutils.Agnhost),
+					Args:  []string{"pause"},
+				},
+			},
+		},
+	}
+}
+
+func livenessSidecarPodSpec(namespace string, readinessProbe, livenessProbe *v1.Probe) *v1.Pod {
+	return &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "test-liveness-sidecar-" + string(uuid.NewUUID()),
+			Labels:    map[string]string{"test": "liveness"},
+			Namespace: namespace,
+		},
+		Spec: v1.PodSpec{
+			InitContainers: []v1.Container{
+				{
+					Name:           "sidecar",
+					Image:          imageutils.GetE2EImage(imageutils.Agnhost),
+					Args:           []string{"liveness"},
+					LivenessProbe:  livenessProbe,
+					ReadinessProbe: readinessProbe,
+					RestartPolicy: func() *v1.ContainerRestartPolicy {
+						restartPolicy := v1.ContainerRestartPolicyAlways
+						return &restartPolicy
+					}(),
+				},
+			},
+			Containers: []v1.Container{
+				{
+					Name:  "main",
+					Image: imageutils.GetE2EImage(imageutils.Agnhost),
+					Args:  []string{"pause"},
+				},
+			},
+		},
+	}
+}
+
+func startupSidecarPodSpec(startupProbe, readinessProbe, livenessProbe *v1.Probe, cmd []string) *v1.Pod {
+	return &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:   "startup-sidecar-" + string(uuid.NewUUID()),
+			Labels: map[string]string{"test": "startup"},
+		},
+		Spec: v1.PodSpec{
+			InitContainers: []v1.Container{
+				{
+					Name:           "sidecar",
+					Image:          imageutils.GetE2EImage(imageutils.BusyBox),
+					Command:        cmd,
+					LivenessProbe:  livenessProbe,
+					ReadinessProbe: readinessProbe,
+					StartupProbe:   startupProbe,
+					RestartPolicy: func() *v1.ContainerRestartPolicy {
+						restartPolicy := v1.ContainerRestartPolicyAlways
+						return &restartPolicy
+					}(),
+				},
+			},
+			Containers: []v1.Container{
+				{
+					Name:  "main",
+					Image: imageutils.GetE2EImage(imageutils.Agnhost),
+					Args:  []string{"pause"},
+				},
+			},
+		},
+	}
+}
+
+func gRPCServerSidecarPodSpec(readinessProbe, livenessProbe *v1.Probe, containerName string) *v1.Pod {
+	return &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: "test-grpc-sidecar-" + string(uuid.NewUUID())},
+		Spec: v1.PodSpec{
+			InitContainers: []v1.Container{
+				{
+					Name:  containerName,
+					Image: imageutils.GetE2EImage(imageutils.Agnhost),
+					Command: []string{
+						"/agnhost",
+						"grpc-health-checking",
+					},
+					Ports:          []v1.ContainerPort{{ContainerPort: int32(5000)}, {ContainerPort: int32(8080)}},
+					LivenessProbe:  livenessProbe,
+					ReadinessProbe: readinessProbe,
+					RestartPolicy: func() *v1.ContainerRestartPolicy {
+						restartPolicy := v1.ContainerRestartPolicyAlways
+						return &restartPolicy
+					}(),
+				},
+			},
+			Containers: []v1.Container{
+				{
+					Name:  "main",
+					Image: imageutils.GetE2EImage(imageutils.Agnhost),
+					Args:  []string{"pause"},
+				},
+			},
+		},
+	}
+}