From 7811d84fefaf0df1f68b7b94988a36c762c8e383 Mon Sep 17 00:00:00 2001
From: David Porter
Date: Fri, 27 May 2022 13:21:20 -0700
Subject: [PATCH 1/2] kubelet: Mark ready condition as false explicitly for
 terminal pods

Terminal pods may continue to report a ready condition of true because
there is a delay in reconciling the ready condition of the containers
from the runtime with the pod status. It should be invalid for kubelet
to report a terminal phase with a true ready condition. To fix the
issue, explicitly override the ready condition to false for terminal
pods during status updates.

Signed-off-by: David Porter
---
 pkg/api/v1/pod/util.go                    | 13 ++++
 pkg/api/v1/pod/util_test.go               | 94 +++++++++++++++++++++++
 pkg/kubelet/status/generate.go            | 45 +++++++++--
 pkg/kubelet/status/status_manager.go      | 14 ++++
 pkg/kubelet/status/status_manager_test.go |  6 +-
 5 files changed, 165 insertions(+), 7 deletions(-)

diff --git a/pkg/api/v1/pod/util.go b/pkg/api/v1/pod/util.go
index 4ae77d543a2..8bfc21a67f4 100644
--- a/pkg/api/v1/pod/util.go
+++ b/pkg/api/v1/pod/util.go
@@ -317,6 +317,12 @@ func IsPodReadyConditionTrue(status v1.PodStatus) bool {
 	return condition != nil && condition.Status == v1.ConditionTrue
 }
 
+// IsContainersReadyConditionTrue returns true if the containers ready condition of a pod is true; false otherwise.
+func IsContainersReadyConditionTrue(status v1.PodStatus) bool {
+	condition := GetContainersReadyCondition(status)
+	return condition != nil && condition.Status == v1.ConditionTrue
+}
+
 // GetPodReadyCondition extracts the pod ready condition from the given status and returns that.
 // Returns nil if the condition is not present.
 func GetPodReadyCondition(status v1.PodStatus) *v1.PodCondition {
@@ -324,6 +330,13 @@ func GetPodReadyCondition(status v1.PodStatus) *v1.PodCondition {
 	return condition
 }
 
+// GetContainersReadyCondition extracts the containers ready condition from the given status and returns that.
+// Returns nil if the condition is not present.
+func GetContainersReadyCondition(status v1.PodStatus) *v1.PodCondition {
+	_, condition := GetPodCondition(&status, v1.ContainersReady)
+	return condition
+}
+
 // GetPodCondition extracts the provided condition from the given status and returns that.
 // Returns nil and -1 if the condition is not present, and the index of the located condition.
 func GetPodCondition(status *v1.PodStatus, conditionType v1.PodConditionType) (int, *v1.PodCondition) {
diff --git a/pkg/api/v1/pod/util_test.go b/pkg/api/v1/pod/util_test.go
index 78b1b92f376..666f3aa0b5f 100644
--- a/pkg/api/v1/pod/util_test.go
+++ b/pkg/api/v1/pod/util_test.go
@@ -904,3 +904,97 @@ func TestUpdatePodCondition(t *testing.T) {
 		assert.Equal(t, test.expected, resultStatus, test.desc)
 	}
 }
+
+func TestGetContainersReadyCondition(t *testing.T) {
+	time := metav1.Now()
+
+	containersReadyCondition := v1.PodCondition{
+		Type:               v1.ContainersReady,
+		Status:             v1.ConditionTrue,
+		Reason:             "successfully",
+		Message:            "sync pod successfully",
+		LastProbeTime:      time,
+		LastTransitionTime: metav1.NewTime(time.Add(1000)),
+	}
+
+	tests := []struct {
+		desc              string
+		podStatus         v1.PodStatus
+		expectedCondition *v1.PodCondition
+	}{
+		{
+			desc: "containers ready condition exists",
+			podStatus: v1.PodStatus{
+				Conditions: []v1.PodCondition{containersReadyCondition},
+			},
+			expectedCondition: &containersReadyCondition,
+		},
+		{
+			desc: "containers ready condition does not exist",
+			podStatus: v1.PodStatus{
+				Conditions: []v1.PodCondition{},
+			},
+			expectedCondition: nil,
+		},
+	}
+
+	for _, test := range tests {
+		containersReadyCondition := GetContainersReadyCondition(test.podStatus)
+		assert.Equal(t, test.expectedCondition, containersReadyCondition, test.desc)
+	}
+}
+
+func TestIsContainersReadyConditionTrue(t *testing.T) {
+	time := metav1.Now()
+
+	tests := []struct {
+		desc      string
+		podStatus v1.PodStatus
+		expected  bool
+	}{
+		{
+			desc: "containers ready condition is true",
+			podStatus: v1.PodStatus{
+				Conditions: []v1.PodCondition{
+					{
+						Type:               v1.ContainersReady,
+						Status:             v1.ConditionTrue,
+						Reason:             "successfully",
+						Message:            "sync pod successfully",
+						LastProbeTime:      time,
+						LastTransitionTime: metav1.NewTime(time.Add(1000)),
+					},
+				},
+			},
+			expected: true,
+		},
+		{
+			desc: "containers ready condition is false",
+			podStatus: v1.PodStatus{
+				Conditions: []v1.PodCondition{
+					{
+						Type:               v1.ContainersReady,
+						Status:             v1.ConditionFalse,
+						Reason:             "successfully",
+						Message:            "sync pod successfully",
+						LastProbeTime:      time,
+						LastTransitionTime: metav1.NewTime(time.Add(1000)),
+					},
+				},
+			},
+			expected: false,
+		},
+		{
+			desc: "containers ready condition is empty",
+			podStatus: v1.PodStatus{
+				Conditions: []v1.PodCondition{},
+			},
+			expected: false,
+		},
+	}
+
+	for _, test := range tests {
+		isContainersReady := IsContainersReadyConditionTrue(test.podStatus)
+		assert.Equal(t, test.expected, isContainersReady, test.desc)
+	}
+}
diff --git a/pkg/kubelet/status/generate.go b/pkg/kubelet/status/generate.go
index ef87faa993c..024f1cf4bc2 100644
--- a/pkg/kubelet/status/generate.go
+++ b/pkg/kubelet/status/generate.go
@@ -20,7 +20,7 @@ import (
 	"fmt"
 	"strings"
 
-	"k8s.io/api/core/v1"
+	v1 "k8s.io/api/core/v1"
 	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
 )
 
@@ -29,6 +29,8 @@ const (
 	UnknownContainerStatuses = "UnknownContainerStatuses"
 	// PodCompleted says that all related containers have succeeded.
 	PodCompleted = "PodCompleted"
+	// PodFailed says that the pod has failed and as such the containers have failed.
+	PodFailed = "PodFailed"
 	// ContainersNotReady says that one or more containers are not ready.
 	ContainersNotReady = "ContainersNotReady"
 	// ContainersNotInitialized says that one or more init containers have not succeeded.
@@ -62,11 +64,12 @@ func GenerateContainersReadyCondition(spec *v1.PodSpec, containerStatuses []v1.C
 	// If all containers are known and succeeded, just return PodCompleted.
 	if podPhase == v1.PodSucceeded && len(unknownContainers) == 0 {
-		return v1.PodCondition{
-			Type:   v1.ContainersReady,
-			Status: v1.ConditionFalse,
-			Reason: PodCompleted,
-		}
+		return generateContainersReadyConditionForTerminalPhase(podPhase)
+	}
+
+	// If the pod phase is failed, explicitly set the ready condition to false for containers since they may be in progress of terminating.
+	if podPhase == v1.PodFailed {
+		return generateContainersReadyConditionForTerminalPhase(podPhase)
 	}
 
 	// Generate message for containers in unknown condition.
@@ -191,3 +194,33 @@ func GeneratePodInitializedCondition(spec *v1.PodSpec, containerStatuses []v1.Co
 		Status: v1.ConditionTrue,
 	}
 }
+
+func generateContainersReadyConditionForTerminalPhase(podPhase v1.PodPhase) v1.PodCondition {
+	condition := v1.PodCondition{
+		Type:   v1.ContainersReady,
+		Status: v1.ConditionFalse,
+	}
+
+	if podPhase == v1.PodFailed {
+		condition.Reason = PodFailed
+	} else if podPhase == v1.PodSucceeded {
+		condition.Reason = PodCompleted
+	}
+
+	return condition
+}
+
+func generatePodReadyConditionForTerminalPhase(podPhase v1.PodPhase) v1.PodCondition {
+	condition := v1.PodCondition{
+		Type:   v1.PodReady,
+		Status: v1.ConditionFalse,
+	}
+
+	if podPhase == v1.PodFailed {
+		condition.Reason = PodFailed
+	} else if podPhase == v1.PodSucceeded {
+		condition.Reason = PodCompleted
+	}
+
+	return condition
+}
diff --git a/pkg/kubelet/status/status_manager.go b/pkg/kubelet/status/status_manager.go
index 845137e0656..47036de00df 100644
--- a/pkg/kubelet/status/status_manager.go
+++ b/pkg/kubelet/status/status_manager.go
@@ -867,6 +867,20 @@ func mergePodStatus(oldPodStatus, newPodStatus v1.PodStatus, couldHaveRunningCon
 		}
 	}
 
+	// If the new phase is terminal, explicitly set the ready condition to false for v1.PodReady and v1.ContainersReady.
+	// It may take some time for kubelet to reconcile the ready condition, so explicitly set ready conditions to false if the phase is terminal.
+	// This is done to ensure kubelet does not report a status update with terminal pod phase and ready=true.
+	// See https://issues.k8s.io/108594 for more details.
+	if podutil.IsPodPhaseTerminal(newPodStatus.Phase) {
+		if podutil.IsPodReadyConditionTrue(newPodStatus) || podutil.IsContainersReadyConditionTrue(newPodStatus) {
+			containersReadyCondition := generateContainersReadyConditionForTerminalPhase(newPodStatus.Phase)
+			podutil.UpdatePodCondition(&newPodStatus, &containersReadyCondition)
+
+			podReadyCondition := generatePodReadyConditionForTerminalPhase(newPodStatus.Phase)
+			podutil.UpdatePodCondition(&newPodStatus, &podReadyCondition)
+		}
+	}
+
 	return newPodStatus
 }
 
diff --git a/pkg/kubelet/status/status_manager_test.go b/pkg/kubelet/status/status_manager_test.go
index 9ecdf67b300..938b1d6ea25 100644
--- a/pkg/kubelet/status/status_manager_test.go
+++ b/pkg/kubelet/status/status_manager_test.go
@@ -1548,7 +1548,11 @@ func TestMergePodStatus(t *testing.T) {
 				Conditions: []v1.PodCondition{
 					{
 						Type:   v1.PodReady,
-						Status: v1.ConditionTrue,
+						Status: v1.ConditionFalse,
+					},
+					{
+						Type:   v1.ContainersReady,
+						Status: v1.ConditionFalse,
 					},
 					{
 						Type:   v1.PodScheduled,

From b4b338d4ebe84eb7ccbd61f07ca6352ff8e16795 Mon Sep 17 00:00:00 2001
From: David Porter
Date: Mon, 6 Jun 2022 18:12:59 -0700
Subject: [PATCH 2/2] test: update graceful node shutdown e2e with watch

Use a watch to detect invalid pod status updates in the graceful node
shutdown node e2e test.
A watch captures all pod updates, whereas the previous logic polled the
API server and could miss some intermediate updates.

Signed-off-by: David Porter
---
 test/e2e_node/node_shutdown_linux_test.go | 43 ++++++++++++++++++-----
 1 file changed, 34 insertions(+), 9 deletions(-)

diff --git a/test/e2e_node/node_shutdown_linux_test.go b/test/e2e_node/node_shutdown_linux_test.go
index ccb31e37e5e..32ff9ccb549 100644
--- a/test/e2e_node/node_shutdown_linux_test.go
+++ b/test/e2e_node/node_shutdown_linux_test.go
@@ -28,7 +28,11 @@ import (
 
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/fields"
+	"k8s.io/apimachinery/pkg/watch"
+	"k8s.io/client-go/tools/cache"
+	watchtools "k8s.io/client-go/tools/watch"
 	"k8s.io/kubectl/pkg/util/podutils"
+
 	admissionapi "k8s.io/pod-security-admission/api"
 
 	"github.com/onsi/ginkgo"
@@ -40,6 +44,7 @@ import (
 	v1 "k8s.io/api/core/v1"
 	schedulingv1 "k8s.io/api/scheduling/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/kubernetes/pkg/features"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
@@ -102,6 +107,34 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut
 			framework.ExpectNoError(err)
 			framework.ExpectEqual(len(list.Items), len(pods), "the number of pods is not as expected")
 
+			ctx, cancel := context.WithCancel(context.Background())
+			defer cancel()
+			go func() {
+				defer ginkgo.GinkgoRecover()
+				w := &cache.ListWatch{
+					WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
+						return f.ClientSet.CoreV1().Pods(f.Namespace.Name).Watch(context.TODO(), options)
+					},
+				}
+
+				// Setup watch to continuously monitor any pod events and detect invalid pod status updates
+				_, err = watchtools.Until(ctx, list.ResourceVersion, w, func(event watch.Event) (bool, error) {
+					if pod, ok := event.Object.(*v1.Pod); ok {
+						if isPodStatusAffectedByIssue108594(pod) {
+							return false, fmt.Errorf("failing test due to detecting invalid pod status")
+						}
+						// Watch will never terminate (only when the test ends due to context cancellation)
+						return false, nil
+					}
+					return false, nil
+				})
+
+				// Ignore timeout error since the context will be explicitly cancelled and the watch will never return true
+				if err != nil && err != wait.ErrWaitTimeout {
+					framework.Failf("watch for invalid pod status failed: %v", err.Error())
+				}
+			}()
+
 			ginkgo.By("Verifying batch pods are running")
 			for _, pod := range list.Items {
 				if podReady, err := testutils.PodRunningReady(&pod); err != nil || !podReady {
@@ -125,11 +158,6 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut
 				framework.ExpectEqual(len(list.Items), len(pods), "the number of pods is not as expected")
 
 				for _, pod := range list.Items {
-					if isPodStatusAffectedByIssue108594(&pod) {
-						framework.Logf("Detected invalid pod state for pod %q: pod status: %+v", pod.Name, pod.Status)
-						framework.Failf("failing test due to detecting invalid pod status")
-					}
-
 					if kubelettypes.IsCriticalPod(&pod) {
 						if isPodShutdown(&pod) {
 							framework.Logf("Expecting critical pod to be running, but it's not currently. Pod: %q, Pod Status %+v", pod.Name, pod.Status)
@@ -157,10 +185,6 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut
 				framework.ExpectEqual(len(list.Items), len(pods), "the number of pods is not as expected")
 
 				for _, pod := range list.Items {
-					if isPodStatusAffectedByIssue108594(&pod) {
-						framework.Logf("Detected invalid pod state for pod %q: pod status: %+v", pod.Name, pod.Status)
-						framework.Failf("failing test due to detecting invalid pod status")
-					}
 					if !isPodShutdown(&pod) {
 						framework.Logf("Expecting pod to be shutdown, but it's not currently: Pod: %q, Pod Status %+v", pod.Name, pod.Status)
 						return fmt.Errorf("pod should be shutdown, phase: %s", pod.Status.Phase)
@@ -171,6 +195,7 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut
 				// Critical pod starts shutdown after (nodeShutdownGracePeriod-nodeShutdownGracePeriodCriticalPods)
 				podStatusUpdateTimeout+(nodeShutdownGracePeriod-nodeShutdownGracePeriodCriticalPods), pollInterval).Should(gomega.BeNil())
+
 		})
 
 		ginkgo.It("should be able to handle a cancelled shutdown", func() {
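
Reviewer note: isPodStatusAffectedByIssue108594 is defined elsewhere in node_shutdown_linux_test.go and is not part of this diff. A minimal, self-contained sketch of the invariant that both the status manager change and the e2e watch rely on might look like the following; the helper name and the standalone main function are illustrative assumptions, not the upstream implementation.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// violatesTerminalReadiness is a hypothetical check mirroring what the e2e
// watch looks for: a pod in a terminal phase (Succeeded or Failed) must not
// report the PodReady or ContainersReady condition as true.
func violatesTerminalReadiness(status v1.PodStatus) bool {
	if status.Phase != v1.PodSucceeded && status.Phase != v1.PodFailed {
		return false
	}
	for _, cond := range status.Conditions {
		if (cond.Type == v1.PodReady || cond.Type == v1.ContainersReady) &&
			cond.Status == v1.ConditionTrue {
			return true
		}
	}
	return false
}

func main() {
	// Example of the invalid state described in issue 108594: a terminal
	// phase combined with a true ready condition.
	bad := v1.PodStatus{
		Phase: v1.PodSucceeded,
		Conditions: []v1.PodCondition{
			{Type: v1.PodReady, Status: v1.ConditionTrue},
		},
	}
	fmt.Println("invalid terminal status:", violatesTerminalReadiness(bad)) // true
}

With the first patch applied, mergePodStatus forces both conditions to false for terminal phases, so a check like this should never fire for statuses published by kubelet.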