Merge pull request #41436 from dashpole/status_bug

Automatic merge from submit-queue

Fix bug in status manager TerminatePod

In TerminatePod, we previously passed pod.Status to updateStatusInternal.  This is a bug, since pod.Status is the original status the pod object was handed to us with, not the status the status manager has built.  Not only does this skip the updates TerminatePod just made to the container statuses, but in some cases it reverted the pod's status to an earlier version, because the pod object carried a stale status to begin with.

This was the case in #40239 and #41095.  As shown in #40239, the pod's status was set back to Running after it had already been set to Failed, occasionally causing very long delays in pod deletion, since we have to wait for this to be corrected.

This PR fixes the bug, adds some helpful debugging statements, and adds a unit test for TerminatePod (which, for some reason, didn't exist before?).
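
For illustration, here is a minimal, self-contained sketch of the pattern the fix restores. The types and the `update` helper below are simplified stand-ins, not the real status manager API; the point is only that TerminatePod must push the status it just built from the manager's cached copy, not the possibly stale `pod.Status` carried by the pod object:

```go
// Minimal sketch only: simplified stand-ins for the kubelet status manager types.
package main

import "fmt"

type PodPhase string

const (
	PodRunning PodPhase = "Running"
	PodFailed  PodPhase = "Failed"
)

type PodStatus struct{ Phase PodPhase }

type Pod struct {
	UID    string
	Status PodStatus // may be stale: reflects the pod object, not the latest cached status
}

// manager caches the most recent status generated for each pod.
type manager struct{ podStatuses map[string]PodStatus }

// TerminatePod marks the pod terminated and pushes an update.
func (m *manager) TerminatePod(pod *Pod) {
	// Start from the cached status if one exists; it may already say Failed.
	status, ok := m.podStatuses[pod.UID]
	if !ok {
		status = pod.Status
	}
	// ... container states would be marked Terminated on `status` here ...

	// Buggy version: m.update(pod, pod.Status) would push the stale phase
	// (e.g. Running) and clobber the cached Failed phase.
	m.update(pod, status)
}

func (m *manager) update(pod *Pod, status PodStatus) {
	m.podStatuses[pod.UID] = status
	fmt.Printf("pod %s -> %s\n", pod.UID, status.Phase)
}

func main() {
	m := &manager{podStatuses: map[string]PodStatus{}}
	p := &Pod{UID: "uid-1", Status: PodStatus{Phase: PodRunning}} // stale view of the pod
	m.podStatuses[p.UID] = PodStatus{Phase: PodFailed}            // latest status the manager recorded
	m.TerminatePod(p) // prints "pod uid-1 -> Failed", not "Running"
}
```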

@kubernetes/sig-node-bugs @vish @Random-Liu
Merged by Kubernetes Submit Queue on 2017-02-14 21:03:31 -08:00, committed by GitHub
commit a57967f47b
2 changed files with 34 additions and 1 deletion

pkg/kubelet/status/status_manager.go

@@ -135,6 +135,8 @@ func (m *manager) Start() {
 	go wait.Forever(func() {
 		select {
 		case syncRequest := <-m.podStatusChannel:
+			glog.V(5).Infof("Status Manager: syncing pod: %q, with status: (%d, %v) from podStatusChannel",
+				syncRequest.podUID, syncRequest.status.version, syncRequest.status.status)
 			m.syncPod(syncRequest.podUID, syncRequest.status)
 		case <-syncTicker:
 			m.syncBatch()
@@ -260,7 +262,7 @@ func (m *manager) TerminatePod(pod *v1.Pod) {
 			Terminated: &v1.ContainerStateTerminated{},
 		}
 	}
-	m.updateStatusInternal(pod, pod.Status, true)
+	m.updateStatusInternal(pod, status, true)
 }
 
 // updateStatusInternal updates the internal status cache, and queues an update to the api server if
@@ -326,6 +328,8 @@ func (m *manager) updateStatusInternal(pod *v1.Pod, status v1.PodStatus, forceUp
 	select {
 	case m.podStatusChannel <- podStatusSyncRequest{pod.UID, newStatus}:
+		glog.V(5).Infof("Status Manager: adding pod: %q, with status: (%q, %v) to podStatusChannel",
+			pod.UID, newStatus.version, newStatus.status)
 		return true
 	default:
 		// Let the periodic syncBatch handle the update if the channel is full.
@@ -395,6 +399,7 @@ func (m *manager) syncBatch() {
 	}()
 
 	for _, update := range updatedStatuses {
+		glog.V(5).Infof("Status Manager: syncPod in syncbatch. pod UID: %q", update.podUID)
 		m.syncPod(update.podUID, update.status)
 	}
 }

pkg/kubelet/status/status_manager_test.go

@@ -583,6 +583,34 @@ func TestStaticPod(t *testing.T) {
 	})
 }
 
+func TestTerminatePod(t *testing.T) {
+	syncer := newTestManager(&fake.Clientset{})
+	testPod := getTestPod()
+	// update the pod's status to Failed.  TerminatePod should preserve this status update.
+	firstStatus := getRandomPodStatus()
+	firstStatus.Phase = v1.PodFailed
+	syncer.SetPodStatus(testPod, firstStatus)
+
+	// set the testPod to a pod with Phase running, to simulate a stale pod
+	testPod.Status = getRandomPodStatus()
+	testPod.Status.Phase = v1.PodRunning
+
+	syncer.TerminatePod(testPod)
+
+	// we expect the container statuses to have changed to terminated
+	newStatus := expectPodStatus(t, syncer, testPod)
+	for i := range newStatus.ContainerStatuses {
+		assert.False(t, newStatus.ContainerStatuses[i].State.Terminated == nil, "expected containers to be terminated")
+	}
+	for i := range newStatus.InitContainerStatuses {
+		assert.False(t, newStatus.InitContainerStatuses[i].State.Terminated == nil, "expected init containers to be terminated")
+	}
+
+	// we expect the previous status update to be preserved.
+	assert.Equal(t, newStatus.Phase, firstStatus.Phase)
+	assert.Equal(t, newStatus.Message, firstStatus.Message)
+}
+
 func TestSetContainerReadiness(t *testing.T) {
 	cID1 := kubecontainer.ContainerID{Type: "test", ID: "1"}
 	cID2 := kubecontainer.ContainerID{Type: "test", ID: "2"}