From 888c7850921cac994da5e51b4df6acf22cd4e59c Mon Sep 17 00:00:00 2001
From: aditya
Date: Sun, 17 Aug 2025 11:46:09 +0530
Subject: [PATCH] Fix startup probe worker termination for sidecar containers

Fixes a bug where startup probe workers terminate incorrectly for sidecar
containers with restartPolicy=Always when the pod has restartPolicy=Never,
causing main containers to remain stuck in the Initializing state.

Changes:
- Add a container-level restart policy check for init containers only
- Extract the complex boolean condition into a named variable for readability
- Refactor the test helper to use the existing newWorker() function
- Add unit and e2e tests covering both scenarios
---
 pkg/kubelet/prober/worker.go              |  17 +++-
 pkg/kubelet/prober/worker_test.go         |  85 +++++++++++++++++
 test/e2e_node/container_lifecycle_test.go | 106 ++++++++++++++++++++++
 3 files changed, 207 insertions(+), 1 deletion(-)

diff --git a/pkg/kubelet/prober/worker.go b/pkg/kubelet/prober/worker.go
index aea276de6c7..4948cdb4236 100644
--- a/pkg/kubelet/prober/worker.go
+++ b/pkg/kubelet/prober/worker.go
@@ -81,6 +81,16 @@ type worker struct {
 	proberDurationUnknownMetricLabels metrics.Labels
 }
 
+// isInitContainer reports whether the worker's container is one of the pod's init containers.
+func (w *worker) isInitContainer() bool {
+	for _, initContainer := range w.pod.Spec.InitContainers {
+		if initContainer.Name == w.container.Name {
+			return true
+		}
+	}
+	return false
+}
+
 // Creates and starts a new probe worker.
 func newWorker(
 	m *manager,
@@ -251,9 +261,14 @@ func (w *worker) doProbe(ctx context.Context) (keepGoing bool) {
 		if !w.containerID.IsEmpty() {
 			w.resultsManager.Set(w.containerID, results.Failure, w.pod)
 		}
+
+		isRestartableInitContainer := w.isInitContainer() &&
+			w.container.RestartPolicy != nil && *w.container.RestartPolicy == v1.ContainerRestartPolicyAlways
+
 		// Abort if the container will not be restarted.
 		return c.State.Terminated == nil ||
-			w.pod.Spec.RestartPolicy != v1.RestartPolicyNever
+			w.pod.Spec.RestartPolicy != v1.RestartPolicyNever ||
+			isRestartableInitContainer
 	}
 
 	// Graceful shutdown of the pod.
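For reference, a minimal standalone sketch of the abort condition added above, using the k8s.io/api/core/v1 types; the helper name shouldContinueProbing and the package main wrapper are illustrative only and do not appear in the patch, where the logic lives inline in (*worker).doProbe.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// shouldContinueProbing mirrors the patched condition in doProbe: keep probing
// if the container has not terminated, if the pod-level restart policy allows a
// restart, or if the container is a restartable init container (sidecar) with
// restartPolicy: Always.
func shouldContinueProbing(pod *v1.Pod, container v1.Container, terminated bool) bool {
	isInitContainer := false
	for _, c := range pod.Spec.InitContainers {
		if c.Name == container.Name {
			isInitContainer = true
			break
		}
	}

	isRestartableInitContainer := isInitContainer &&
		container.RestartPolicy != nil && *container.RestartPolicy == v1.ContainerRestartPolicyAlways

	return !terminated ||
		pod.Spec.RestartPolicy != v1.RestartPolicyNever ||
		isRestartableInitContainer
}

func main() {
	always := v1.ContainerRestartPolicyAlways
	sidecar := v1.Container{Name: "sidecar", RestartPolicy: &always}
	mainContainer := v1.Container{Name: "main"}
	pod := &v1.Pod{
		Spec: v1.PodSpec{
			RestartPolicy:  v1.RestartPolicyNever,
			InitContainers: []v1.Container{sidecar},
			Containers:     []v1.Container{mainContainer},
		},
	}

	// The terminated sidecar keeps its probe worker alive (true); the terminated
	// regular container does not (false), preserving the previous behavior.
	fmt.Println(shouldContinueProbing(pod, sidecar, true))       // true
	fmt.Println(shouldContinueProbing(pod, mainContainer, true)) // false
}

For a sidecar under a pod with restartPolicy=Never, the condition now stays true, so the startup probe worker keeps running and re-probes the container after the kubelet restarts it.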
diff --git a/pkg/kubelet/prober/worker_test.go b/pkg/kubelet/prober/worker_test.go
index 3e1fa335a16..0540646b65b 100644
--- a/pkg/kubelet/prober/worker_test.go
+++ b/pkg/kubelet/prober/worker_test.go
@@ -37,6 +37,25 @@ import (
 func init() {
 }
 
+// newTestWorkerWithRestartableInitContainer creates a test worker with an init container setup
+func newTestWorkerWithRestartableInitContainer(m *manager, probeType probeType) *worker {
+	pod := getTestPod()
+
+	// Set up init container with restart policy
+	initContainer := pod.Spec.Containers[0]
+	initContainer.Name = testContainerName
+	restartPolicy := v1.ContainerRestartPolicyAlways
+	initContainer.RestartPolicy = &restartPolicy
+
+	// Move container to init containers and add a regular container
+	pod.Spec.InitContainers = []v1.Container{initContainer}
+	pod.Spec.Containers = []v1.Container{{
+		Name: "main-container",
+	}}
+
+	return newWorker(m, probeType, pod, initContainer)
+}
+
 func TestDoProbe(t *testing.T) {
 	m := newTestManager()
 
@@ -529,6 +548,72 @@ func TestResultRunOnStartupCheckFailure(t *testing.T) {
 	}
 }
 
+func TestDoProbe_TerminatedRestartableInitContainerWithRestartPolicyNever(t *testing.T) {
+	ctx := context.Background()
+	m := newTestManager()
+
+	// Test restartable init container (sidecar) behavior
+	w := newTestWorkerWithRestartableInitContainer(m, startup)
+
+	// Set pod restart policy to Never
+	w.pod.Spec.RestartPolicy = v1.RestartPolicyNever
+
+	// Create terminated status for init container
+	terminatedStatus := getTestRunningStatus()
+	terminatedStatus.InitContainerStatuses = []v1.ContainerStatus{{
+		Name:        testContainerName,
+		ContainerID: "test://test_container_id",
+		State: v1.ContainerState{
+			Terminated: &v1.ContainerStateTerminated{
+				StartedAt: metav1.Now(),
+			},
+		},
+	}}
+	terminatedStatus.ContainerStatuses[0].State.Running = nil
+
+	m.statusManager.SetPodStatus(w.pod, terminatedStatus)
+
+	// Test: Terminated restartable init container with restartPolicy=Always should continue probing
+	// even when pod has restartPolicy=Never
+	if !w.doProbe(ctx) {
+		t.Error("Expected to continue probing for terminated restartable init container with pod restart policy Never")
+	}
+
+	// Verify result is set to Failure for terminated container
+	expectResult(t, w, results.Failure, "restartable init container with pod restart policy Never")
+}
+
+func TestDoProbe_TerminatedContainerWithRestartPolicyNever(t *testing.T) {
+	ctx := context.Background()
+	m := newTestManager()
+
+	// Test that regular containers (without RestartPolicy) still work as before
+	w := newTestWorker(m, startup, v1.Probe{})
+
+	// Regular container without explicit restart policy
+	w.container.RestartPolicy = nil
+
+	// Set pod restart policy to Never
+	w.pod.Spec.RestartPolicy = v1.RestartPolicyNever
+
+	// Create terminated status
+	terminatedStatus := getTestRunningStatus()
+	terminatedStatus.ContainerStatuses[0].State.Running = nil
+	terminatedStatus.ContainerStatuses[0].State.Terminated = &v1.ContainerStateTerminated{
+		StartedAt: metav1.Now(),
+	}
+
+	m.statusManager.SetPodStatus(w.pod, terminatedStatus)
+
+	// Should stop probing (existing behavior preserved)
+	if w.doProbe(ctx) {
+		t.Error("Expected to stop probing for regular container with pod RestartPolicy=Never")
+	}
+
+	// Verify result is set to Failure for terminated container
+	expectResult(t, w, results.Failure, "regular container with pod restart policy Never")
+}
+
 func TestLivenessProbeDisabledByStarted(t *testing.T) {
 	ctx := context.Background()
 	m := newTestManager()
diff --git a/test/e2e_node/container_lifecycle_test.go b/test/e2e_node/container_lifecycle_test.go
index a10d70cdbeb..82473b1f418 100644
--- a/test/e2e_node/container_lifecycle_test.go
+++ b/test/e2e_node/container_lifecycle_test.go
@@ -5408,7 +5408,113 @@ var _ = SIGDescribe(feature.SidecarContainers, "Containers Lifecycle", func() {
 			})
 		})
 	})
+
+	ginkgo.When("A restartable init container with startup probe fails initially", func() {
+		ginkgo.It("should continue probing and allow regular container to start after the restartable init container recovers", func(ctx context.Context) {
+			restartableInit := "buggy-restartable-init"
+			regularContainer := "regular-container"
+
+			restartPolicyAlways := v1.ContainerRestartPolicyAlways
+
+			pod := &v1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "restartable-init-startup-probe-fix",
+				},
+				Spec: v1.PodSpec{
+					RestartPolicy: v1.RestartPolicyNever,
+					// An emptyDir volume keeps the first-run marker across container restarts.
+					Volumes: []v1.Volume{{
+						Name:         "persistent",
+						VolumeSource: v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}},
+					}},
+					InitContainers: []v1.Container{{
+						Name:          restartableInit,
+						Image:         busyboxImage,
+						RestartPolicy: &restartPolicyAlways,
+						VolumeMounts: []v1.VolumeMount{{
+							Name:      "persistent",
+							MountPath: "/persistent",
+						}},
+						Command: []string{"sh", "-c", `
+if [ ! -f /persistent/first_run_done ]; then
+  echo 'First run: creating marker and exiting with 1'
+  touch /persistent/first_run_done
+  exit 1
+else
+  echo 'Second run: marker found, running as sidecar'
+  sleep 120
+fi`},
+						StartupProbe: &v1.Probe{
+							InitialDelaySeconds: 3,
+							PeriodSeconds:       2,
+							FailureThreshold:    10,
+							ProbeHandler: v1.ProbeHandler{
+								Exec: &v1.ExecAction{
+									Command: []string{"/bin/true"},
+								},
+							},
+						},
+					}},
+					Containers: []v1.Container{{
+						Name: regularContainer,
+						// busybox provides /bin/true for the exec startup probe.
+						Image:   busyboxImage,
+						Command: []string{"sleep", "300"},
+						StartupProbe: &v1.Probe{
+							InitialDelaySeconds: 5,
+							PeriodSeconds:       5,
+							ProbeHandler: v1.ProbeHandler{
+								Exec: &v1.ExecAction{
+									Command: []string{"/bin/true"},
+								},
+							},
+						},
+					}},
+				},
+			}
+
+			preparePod(pod)
+			client := e2epod.NewPodClient(f)
+			pod = client.Create(ctx, pod)
+
+			ginkgo.By("Waiting for init container to fail and restart at least once")
+			framework.ExpectNoError(
+				e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name,
+					"restartable init restarted", 90*time.Second,
+					func(p *v1.Pod) (bool, error) {
+						for _, st := range p.Status.InitContainerStatuses {
+							if st.Name == restartableInit && st.RestartCount > 0 {
+								framework.Logf("Init container %s has restarted %d times", restartableInit, st.RestartCount)
+								return true, nil
+							}
+						}
+						return false, nil
+					}),
+			)
+
+			ginkgo.By("Waiting for init container to be running after restart")
+			framework.ExpectNoError(
+				e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name,
+					"restartable init running", 90*time.Second,
+					func(p *v1.Pod) (bool, error) {
+						for _, st := range p.Status.InitContainerStatuses {
+							if st.Name == restartableInit && st.State.Running != nil {
+								framework.Logf("Init container %s is now running", restartableInit)
+								return true, nil
+							}
+						}
+						return false, nil
+					}),
+			)
+
+			ginkgo.By("Waiting for regular container to start")
+			framework.ExpectNoError(
+				e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name,
+					"regular container running", 120*time.Second,
+					func(p *v1.Pod) (bool, error) {
+						for _, st := range p.Status.ContainerStatuses {
+							if st.Name == regularContainer && st.State.Running != nil {
+								framework.Logf("Regular container %s is running", regularContainer)
+								return true, nil
+							}
+						}
+						return false, nil
+					}),
+			)
+		})
+	})
 })
 
 var _ = SIGDescribe(feature.SidecarContainers, framework.WithSerial(), "Containers Lifecycle", func() {