Fix startup probe worker termination for sidecar containers

Fixes a bug where startup probe workers terminate incorrectly for sidecar
containers with restartPolicy=Always when the pod has restartPolicy=Never,
causing main containers to remain stuck in the Initializing state.

Changes:
- Add container-level restart policy check for init containers only
- Extract complex boolean logic to named variable for readability
- Refactor test helper to use existing newWorker() function
- Add comprehensive unit and e2e tests for both scenarios
Author: aditya
Date: 2025-08-17 11:46:09 +05:30
Committed by: Yuan Wang
Parent: e88161a575
Commit: 888c785092
3 changed files with 207 additions and 1 deletion
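For context, the failure mode needs three ingredients: a pod-level restartPolicy of Never, an init container promoted to a sidecar via container-level restartPolicy: Always, and a startup probe on that sidecar. A minimal sketch of such a pod follows (hypothetical names and images; not part of this commit):

    package main

    import (
    	"fmt"

    	v1 "k8s.io/api/core/v1"
    	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    func buggySidecarPod() *v1.Pod {
    	always := v1.ContainerRestartPolicyAlways
    	return &v1.Pod{
    		ObjectMeta: metav1.ObjectMeta{Name: "sidecar-demo"},
    		Spec: v1.PodSpec{
    			// Pod-level policy says "never restart" ...
    			RestartPolicy: v1.RestartPolicyNever,
    			InitContainers: []v1.Container{{
    				Name:  "sidecar",
    				Image: "example.com/sidecar:latest",
    				// ... but the sidecar's container-level policy overrides it.
    				RestartPolicy: &always,
    				StartupProbe: &v1.Probe{
    					ProbeHandler: v1.ProbeHandler{
    						Exec: &v1.ExecAction{Command: []string{"/bin/true"}},
    					},
    				},
    			}},
    			Containers: []v1.Container{{Name: "main", Image: "example.com/app:latest"}},
    		},
    	}
    }

    func main() {
    	pod := buggySidecarPod()
    	fmt.Println(pod.Name, "sidecar restart policy:", *pod.Spec.InitContainers[0].RestartPolicy)
    }

Before this fix, if the sidecar terminated before its startup probe succeeded, the probe worker saw pod restartPolicy=Never and quit, even though the kubelet would restart the sidecar anyway; the startup result was never updated and the main container never started.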


@@ -81,6 +81,16 @@ type worker struct {
 	proberDurationUnknownMetricLabels metrics.Labels
 }
 
+// isInitContainer checks if the worker's container is in the pod's init containers
+func (w *worker) isInitContainer() bool {
+	for _, initContainer := range w.pod.Spec.InitContainers {
+		if initContainer.Name == w.container.Name {
+			return true
+		}
+	}
+	return false
+}
+
 // Creates and starts a new probe worker.
 func newWorker(
 	m *manager,
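The helper is a plain name lookup over spec.initContainers. A self-contained sketch of the same check, pulled out of the worker type for illustration (hypothetical free function, assuming only the k8s.io/api module):

    package main

    import (
    	"fmt"

    	v1 "k8s.io/api/core/v1"
    )

    // isInitContainer reports whether the named container appears in the pod's
    // init containers, mirroring the new worker method.
    func isInitContainer(pod *v1.Pod, name string) bool {
    	for _, c := range pod.Spec.InitContainers {
    		if c.Name == name {
    			return true
    		}
    	}
    	return false
    }

    func main() {
    	always := v1.ContainerRestartPolicyAlways
    	pod := &v1.Pod{Spec: v1.PodSpec{
    		RestartPolicy:  v1.RestartPolicyNever,
    		InitContainers: []v1.Container{{Name: "sidecar", RestartPolicy: &always}},
    		Containers:     []v1.Container{{Name: "main"}},
    	}}
    	fmt.Println(isInitContainer(pod, "sidecar")) // true: eligible for the new exception
    	fmt.Println(isInitContainer(pod, "main"))    // false: regular container
    }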
@@ -251,9 +261,14 @@ func (w *worker) doProbe(ctx context.Context) (keepGoing bool) {
 		if !w.containerID.IsEmpty() {
 			w.resultsManager.Set(w.containerID, results.Failure, w.pod)
 		}
+
+		isRestartableInitContainer := w.isInitContainer() &&
+			w.container.RestartPolicy != nil && *w.container.RestartPolicy == v1.ContainerRestartPolicyAlways
+
 		// Abort if the container will not be restarted.
 		return c.State.Terminated == nil ||
-			w.pod.Spec.RestartPolicy != v1.RestartPolicyNever
+			w.pod.Spec.RestartPolicy != v1.RestartPolicyNever ||
+			isRestartableInitContainer
 	}
 
 	// Graceful shutdown of the pod.
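With the new disjunct, the worker keeps running if the container is not terminated, or the pod may restart it, or it is a restartable init container. A standalone sketch of that decision (simplified; keepProbing is a hypothetical name, not the kubelet's API):

    package main

    import "fmt"

    // keepProbing mirrors the fixed return condition in doProbe: keep the
    // worker alive unless the container is terminated AND the pod will never
    // restart it AND it is not a restartable init container.
    func keepProbing(terminated, podRestartPolicyNever, restartableInit bool) bool {
    	return !terminated || !podRestartPolicyNever || restartableInit
    }

    func main() {
    	// Regular container, pod restartPolicy=Never: stop probing (old behavior kept).
    	fmt.Println(keepProbing(true, true, false)) // false
    	// Sidecar (restartable init), pod restartPolicy=Never: keep probing (the fix).
    	fmt.Println(keepProbing(true, true, true)) // true
    	// Any container while the pod may still restart it: keep probing.
    	fmt.Println(keepProbing(true, false, false)) // true
    }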


@@ -37,6 +37,25 @@ import (
 func init() {
 }
 
+// newTestWorkerWithRestartableInitContainer creates a test worker with an init container setup
+func newTestWorkerWithRestartableInitContainer(m *manager, probeType probeType) *worker {
+	pod := getTestPod()
+
+	// Set up init container with restart policy
+	initContainer := pod.Spec.Containers[0]
+	initContainer.Name = testContainerName
+	restartPolicy := v1.ContainerRestartPolicyAlways
+	initContainer.RestartPolicy = &restartPolicy
+
+	// Move container to init containers and add a regular container
+	pod.Spec.InitContainers = []v1.Container{initContainer}
+	pod.Spec.Containers = []v1.Container{{
+		Name: "main-container",
+	}}
+
+	return newWorker(m, probeType, pod, initContainer)
+}
+
 func TestDoProbe(t *testing.T) {
 	m := newTestManager()
@@ -529,6 +548,72 @@ func TestResultRunOnStartupCheckFailure(t *testing.T) {
 	}
 }
 
+func TestDoProbe_TerminatedRestartableInitContainerWithRestartPolicyNever(t *testing.T) {
+	ctx := context.Background()
+	m := newTestManager()
+
+	// Test restartable init container (sidecar) behavior
+	w := newTestWorkerWithRestartableInitContainer(m, startup)
+
+	// Set pod restart policy to Never
+	w.pod.Spec.RestartPolicy = v1.RestartPolicyNever
+
+	// Create terminated status for init container
+	terminatedStatus := getTestRunningStatus()
+	terminatedStatus.InitContainerStatuses = []v1.ContainerStatus{{
+		Name:        testContainerName,
+		ContainerID: "test://test_container_id",
+		State: v1.ContainerState{
+			Terminated: &v1.ContainerStateTerminated{
+				StartedAt: metav1.Now(),
+			},
+		},
+	}}
+	terminatedStatus.ContainerStatuses[0].State.Running = nil
+	m.statusManager.SetPodStatus(w.pod, terminatedStatus)
+
+	// Test: Terminated restartable init container with restartPolicy=Always should continue probing
+	// even when pod has restartPolicy=Never
+	if !w.doProbe(ctx) {
+		t.Error("Expected to continue probing for terminated restartable init container with pod restart policy Never")
+	}
+
+	// Verify result is set to Failure for terminated container
+	expectResult(t, w, results.Failure, "restartable init container with pod restart policy Never")
+}
+
+func TestDoProbe_TerminatedContainerWithRestartPolicyNever(t *testing.T) {
+	ctx := context.Background()
+	m := newTestManager()
+
+	// Test that regular containers (without RestartPolicy) still work as before
+	w := newTestWorker(m, startup, v1.Probe{})
+
+	// Regular container without explicit restart policy
+	w.container.RestartPolicy = nil
+
+	// Set pod restart policy to Never
+	w.pod.Spec.RestartPolicy = v1.RestartPolicyNever
+
+	// Create terminated status
+	terminatedStatus := getTestRunningStatus()
+	terminatedStatus.ContainerStatuses[0].State.Running = nil
+	terminatedStatus.ContainerStatuses[0].State.Terminated = &v1.ContainerStateTerminated{
+		StartedAt: metav1.Now(),
+	}
+	m.statusManager.SetPodStatus(w.pod, terminatedStatus)
+
+	// Should stop probing (existing behavior preserved)
+	if w.doProbe(ctx) {
+		t.Error("Expected to stop probing for regular container with pod RestartPolicy=Never")
+	}
+
+	// Verify result is set to Failure for terminated container
+	expectResult(t, w, results.Failure, "regular container with pod restart policy Never")
+}
+
 func TestLivenessProbeDisabledByStarted(t *testing.T) {
 	ctx := context.Background()
 	m := newTestManager()


@@ -5408,7 +5408,113 @@ var _ = SIGDescribe(feature.SidecarContainers, "Containers Lifecycle", func() {
 				})
 			})
 		})
 	})
+
+	ginkgo.When("A restartable init container with startup probe fails initially", func() {
+		ginkgo.It("should continue probing and allow regular container to start after the restartable init container recovers", func(ctx context.Context) {
+			restartableInit := "buggy-restartable-init"
+			regularContainer := "regular-container"
+			restartPolicyAlways := v1.ContainerRestartPolicyAlways
+
+			pod := &v1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "restartable-init-startup-probe-fix",
+				},
+				Spec: v1.PodSpec{
+					RestartPolicy: v1.RestartPolicyNever,
+					// The marker file must survive the init container's restart,
+					// so it lives on a pod-level emptyDir volume rather than in
+					// the container's writable layer, which is reset on restart.
+					Volumes: []v1.Volume{{
+						Name:         "persistent",
+						VolumeSource: v1.VolumeSource{EmptyDir: &v1.EmptyDirVolumeSource{}},
+					}},
+					InitContainers: []v1.Container{{
+						Name:          restartableInit,
+						Image:         busyboxImage,
+						RestartPolicy: &restartPolicyAlways,
+						VolumeMounts: []v1.VolumeMount{{
+							Name:      "persistent",
+							MountPath: "/persistent",
+						}},
+						Command: []string{"sh", "-c", `
+							if [ ! -f /persistent/first_run_done ]; then
+								echo 'First run: creating marker and exiting with 1'
+								touch /persistent/first_run_done
+								exit 1
+							else
+								echo 'Second run: marker found, running as sidecar'
+								sleep 120
+							fi`},
+						StartupProbe: &v1.Probe{
+							InitialDelaySeconds: 3,
+							PeriodSeconds:       2,
+							FailureThreshold:    10,
+							ProbeHandler: v1.ProbeHandler{
+								Exec: &v1.ExecAction{
+									Command: []string{"/bin/true"},
+								},
+							},
+						},
+					}},
+					Containers: []v1.Container{{
+						Name: regularContainer,
+						// busybox rather than the pause image so that /bin/true
+						// exists in the image for the exec startup probe below.
+						Image:   busyboxImage,
+						Command: []string{"sleep", "300"},
+						StartupProbe: &v1.Probe{
+							InitialDelaySeconds: 5,
+							PeriodSeconds:       5,
+							ProbeHandler: v1.ProbeHandler{
+								Exec: &v1.ExecAction{
+									Command: []string{"/bin/true"},
+								},
+							},
+						},
+					}},
+				},
+			}
+
+			preparePod(pod)
+			client := e2epod.NewPodClient(f)
+			pod = client.Create(ctx, pod)
+
+			ginkgo.By("Waiting for init container to fail and restart at least once")
+			framework.ExpectNoError(
+				e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name,
+					"restartable init restarted", 90*time.Second,
+					func(p *v1.Pod) (bool, error) {
+						for _, st := range p.Status.InitContainerStatuses {
+							if st.Name == restartableInit && st.RestartCount > 0 {
+								framework.Logf("Init container %s has restarted %d times", restartableInit, st.RestartCount)
+								return true, nil
+							}
+						}
+						return false, nil
+					}),
+			)
+
+			ginkgo.By("Waiting for init container to be running after restart")
+			framework.ExpectNoError(
+				e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name,
+					"restartable init running", 90*time.Second,
+					func(p *v1.Pod) (bool, error) {
+						for _, st := range p.Status.InitContainerStatuses {
+							if st.Name == restartableInit && st.State.Running != nil {
+								framework.Logf("Init container %s is now running", restartableInit)
+								return true, nil
+							}
+						}
+						return false, nil
+					}),
+			)
+
+			ginkgo.By("Waiting for regular container to start")
+			framework.ExpectNoError(
+				e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name,
+					"regular container running", 120*time.Second,
+					func(p *v1.Pod) (bool, error) {
+						for _, st := range p.Status.ContainerStatuses {
+							if st.Name == regularContainer && st.State.Running != nil {
+								framework.Logf("Regular container %s is running", regularContainer)
+								return true, nil
+							}
+						}
+						return false, nil
+					}),
+			)
+		})
+	})
 })
 
 var _ = SIGDescribe(feature.SidecarContainers, framework.WithSerial(), "Containers Lifecycle", func() {
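As a rough timing sanity check for the waits above (assuming the spec values shown and ignoring timeoutSeconds), the sidecar's startup probe can fail for about initialDelay + period × failureThreshold seconds before the kubelet kills and, because of the container-level Always policy, restarts it:

    package main

    import "fmt"

    func main() {
    	// Worst-case startup-probe budget for the sidecar in the e2e spec above.
    	const initialDelaySeconds = 3
    	const periodSeconds = 2
    	const failureThreshold = 10
    	fmt.Printf("startup probe budget: ~%ds\n",
    		initialDelaySeconds+periodSeconds*failureThreshold) // ~23s
    }

So the 90-second waits leave ample headroom for at least one full fail-and-restart cycle.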