From 9efd40d72ad5331777131e9953c1e1bab1a000ac Mon Sep 17 00:00:00 2001
From: Clayton Coleman
Date: Mon, 19 Jul 2021 17:54:55 -0400
Subject: [PATCH 1/3] kubelet: Preserve reason/message when phase changes

The Kubelet always clears reason and message in generateAPIPodStatus
even when the phase is unchanged. It is reasonable that we preserve the
previous values when the phase does not change, and clear them when the
phase does change.

When a pod is evicted, this ensures that the eviction message and reason
are propagated even in the face of subsequent updates. It also preserves
the message and reason if components beyond the Kubelet choose to set
those values.

To preserve these values we need to know the old phase, which requires a
change to convertStatusToAPIStatus so that both methods have access to
it.
---
 pkg/kubelet/kubelet_pods.go      |  51 +++--
 pkg/kubelet/kubelet_pods_test.go | 321 ++++++++++++++++++++++++++++++-
 2 files changed, 353 insertions(+), 19 deletions(-)

diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go
index 87282584633..8d8069fed83 100644
--- a/pkg/kubelet/kubelet_pods.go
+++ b/pkg/kubelet/kubelet_pods.go
@@ -1378,24 +1378,40 @@ func getPhase(spec *v1.PodSpec, info []v1.ContainerStatus) v1.PodPhase {
 func (kl *Kubelet) generateAPIPodStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus) v1.PodStatus {
 	klog.V(3).InfoS("Generating pod status", "pod", klog.KObj(pod))
 
-	s := kl.convertStatusToAPIStatus(pod, podStatus)
+	// use the previous pod status, or the api status, as the basis for this pod
+	oldPodStatus, found := kl.statusManager.GetPodStatus(pod.UID)
+	if !found {
+		oldPodStatus = pod.Status
+	}
+	s := kl.convertStatusToAPIStatus(pod, podStatus, oldPodStatus)
 
-	// check if an internal module has requested the pod is evicted.
+	// calculate the next phase and preserve reason
+	allStatus := append(append([]v1.ContainerStatus{}, s.ContainerStatuses...), s.InitContainerStatuses...)
+	s.Phase = getPhase(&pod.Spec, allStatus)
+	klog.V(4).InfoS("Got phase for pod", "pod", klog.KObj(pod), "oldPhase", oldPodStatus.Phase, "phase", s.Phase)
+	if s.Phase == oldPodStatus.Phase {
+		// preserve the reason and message which is associated with the phase
+		s.Reason = oldPodStatus.Reason
+		s.Message = oldPodStatus.Message
+		if len(s.Reason) == 0 {
+			s.Reason = pod.Status.Reason
+		}
+		if len(s.Message) == 0 {
+			s.Message = pod.Status.Message
+		}
+	}
+
+	// check if an internal module has requested the pod is evicted and override the reason and message
 	for _, podSyncHandler := range kl.PodSyncHandlers {
 		if result := podSyncHandler.ShouldEvict(pod); result.Evict {
 			s.Phase = v1.PodFailed
 			s.Reason = result.Reason
 			s.Message = result.Message
-			return *s
+			break
 		}
 	}
 
-	// Assume info is ready to process
-	spec := &pod.Spec
-	allStatus := append(append([]v1.ContainerStatus{}, s.ContainerStatuses...), s.InitContainerStatuses...)
- s.Phase = getPhase(spec, allStatus) - klog.V(4).InfoS("Got phase for pod", "pod", klog.KObj(pod), "phase", s.Phase) - // Check for illegal phase transition + // pods are not allowed to transition out of terminal phases if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded { // API server shows terminal phase; transitions are not allowed if s.Phase != pod.Status.Phase { @@ -1404,6 +1420,10 @@ func (kl *Kubelet) generateAPIPodStatus(pod *v1.Pod, podStatus *kubecontainer.Po s.Phase = pod.Status.Phase } } + + spec := &pod.Spec + + // ensure the probe managers have up to date status for containers kl.probeManager.UpdatePodStatus(pod.UID, s) s.Conditions = append(s.Conditions, status.GeneratePodInitializedCondition(spec, s.InitContainerStatuses, s.Phase)) s.Conditions = append(s.Conditions, status.GeneratePodReadyCondition(spec, s.Conditions, s.ContainerStatuses, s.Phase)) @@ -1466,10 +1486,10 @@ func (kl *Kubelet) sortPodIPs(podIPs []string) []string { return ips } -// convertStatusToAPIStatus creates an api PodStatus for the given pod from -// the given internal pod status. It is purely transformative and does not -// alter the kubelet state at all. -func (kl *Kubelet) convertStatusToAPIStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus) *v1.PodStatus { +// convertStatusToAPIStatus initialize an api PodStatus for the given pod from +// the given internal pod status and the previous state of the pod from the API. +// It is purely transformative and does not alter the kubelet state at all. +func (kl *Kubelet) convertStatusToAPIStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus, oldPodStatus v1.PodStatus) *v1.PodStatus { var apiPodStatus v1.PodStatus // copy pod status IPs to avoid race conditions with PodStatus #102806 @@ -1490,11 +1510,6 @@ func (kl *Kubelet) convertStatusToAPIStatus(pod *v1.Pod, podStatus *kubecontaine // set status for Pods created on versions of kube older than 1.6 apiPodStatus.QOSClass = v1qos.GetPodQOS(pod) - oldPodStatus, found := kl.statusManager.GetPodStatus(pod.UID) - if !found { - oldPodStatus = pod.Status - } - apiPodStatus.ContainerStatuses = kl.convertToAPIContainerStatuses( pod, podStatus, oldPodStatus.ContainerStatuses, diff --git a/pkg/kubelet/kubelet_pods_test.go b/pkg/kubelet/kubelet_pods_test.go index d3b1d185421..b059864d8a4 100644 --- a/pkg/kubelet/kubelet_pods_test.go +++ b/pkg/kubelet/kubelet_pods_test.go @@ -31,11 +31,13 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" v1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/diff" utilfeature "k8s.io/apiserver/pkg/util/feature" core "k8s.io/client-go/testing" "k8s.io/client-go/tools/record" @@ -51,6 +53,7 @@ import ( containertest "k8s.io/kubernetes/pkg/kubelet/container/testing" "k8s.io/kubernetes/pkg/kubelet/cri/streaming/portforward" "k8s.io/kubernetes/pkg/kubelet/cri/streaming/remotecommand" + "k8s.io/kubernetes/pkg/kubelet/prober/results" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/volume/util/hostutil" "k8s.io/kubernetes/pkg/volume/util/subpath" @@ -1806,10 +1809,13 @@ func TestMakeEnvironmentVariables(t *testing.T) { } func waitingState(cName string) v1.ContainerStatus { + return waitingStateWithReason(cName, "") +} +func waitingStateWithReason(cName, reason 
string) v1.ContainerStatus { return v1.ContainerStatus{ Name: cName, State: v1.ContainerState{ - Waiting: &v1.ContainerStateWaiting{}, + Waiting: &v1.ContainerStateWaiting{Reason: reason}, }, } } @@ -1847,6 +1853,14 @@ func runningState(cName string) v1.ContainerStatus { }, } } +func runningStateWithStartedAt(cName string, startedAt time.Time) v1.ContainerStatus { + return v1.ContainerStatus{ + Name: cName, + State: v1.ContainerState{ + Running: &v1.ContainerStateRunning{StartedAt: metav1.Time{Time: startedAt}}, + }, + } +} func stoppedState(cName string) v1.ContainerStatus { return v1.ContainerStatus{ Name: cName, @@ -1891,6 +1905,14 @@ func waitingWithLastTerminationUnknown(cName string, restartCount int32) v1.Cont RestartCount: restartCount, } } +func ready(status v1.ContainerStatus) v1.ContainerStatus { + status.Ready = true + return status +} +func withID(status v1.ContainerStatus, id string) v1.ContainerStatus { + status.ContainerID = id + return status +} func TestPodPhaseWithRestartAlways(t *testing.T) { desiredState := v1.PodSpec{ @@ -2506,6 +2528,303 @@ func TestConvertToAPIContainerStatuses(t *testing.T) { } } +func Test_generateAPIPodStatus(t *testing.T) { + desiredState := v1.PodSpec{ + NodeName: "machine", + Containers: []v1.Container{ + {Name: "containerA"}, + {Name: "containerB"}, + }, + RestartPolicy: v1.RestartPolicyAlways, + } + now := metav1.Now() + + tests := []struct { + name string + pod *v1.Pod + currentStatus *kubecontainer.PodStatus + unreadyContainer []string + previousStatus v1.PodStatus + expected v1.PodStatus + }{ + { + name: "no current status, with previous statuses and deletion", + pod: &v1.Pod{ + Spec: desiredState, + Status: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + runningState("containerA"), + runningState("containerB"), + }, + }, + ObjectMeta: metav1.ObjectMeta{Name: "my-pod", DeletionTimestamp: &now}, + }, + currentStatus: &kubecontainer.PodStatus{}, + previousStatus: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + runningState("containerA"), + runningState("containerB"), + }, + }, + expected: v1.PodStatus{ + Phase: v1.PodRunning, + HostIP: "127.0.0.1", + QOSClass: v1.PodQOSBestEffort, + Conditions: []v1.PodCondition{ + {Type: v1.PodInitialized, Status: v1.ConditionTrue}, + {Type: v1.PodReady, Status: v1.ConditionTrue}, + {Type: v1.ContainersReady, Status: v1.ConditionTrue}, + {Type: v1.PodScheduled, Status: v1.ConditionTrue}, + }, + ContainerStatuses: []v1.ContainerStatus{ + ready(waitingWithLastTerminationUnknown("containerA", 0)), + ready(waitingWithLastTerminationUnknown("containerB", 0)), + }, + }, + }, + { + name: "no current status, with previous statuses and no deletion", + pod: &v1.Pod{ + Spec: desiredState, + Status: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + runningState("containerA"), + runningState("containerB"), + }, + }, + }, + currentStatus: &kubecontainer.PodStatus{}, + previousStatus: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + runningState("containerA"), + runningState("containerB"), + }, + }, + expected: v1.PodStatus{ + Phase: v1.PodRunning, + HostIP: "127.0.0.1", + QOSClass: v1.PodQOSBestEffort, + Conditions: []v1.PodCondition{ + {Type: v1.PodInitialized, Status: v1.ConditionTrue}, + {Type: v1.PodReady, Status: v1.ConditionTrue}, + {Type: v1.ContainersReady, Status: v1.ConditionTrue}, + {Type: v1.PodScheduled, Status: v1.ConditionTrue}, + }, + ContainerStatuses: []v1.ContainerStatus{ + ready(waitingWithLastTerminationUnknown("containerA", 1)), + 
ready(waitingWithLastTerminationUnknown("containerB", 1)), + }, + }, + }, + { + name: "terminal phase cannot be changed (apiserver previous is succeeded)", + pod: &v1.Pod{ + Spec: desiredState, + Status: v1.PodStatus{ + Phase: v1.PodSucceeded, + ContainerStatuses: []v1.ContainerStatus{ + runningState("containerA"), + runningState("containerB"), + }, + }, + }, + currentStatus: &kubecontainer.PodStatus{}, + previousStatus: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + runningState("containerA"), + runningState("containerB"), + }, + }, + expected: v1.PodStatus{ + Phase: v1.PodSucceeded, + HostIP: "127.0.0.1", + QOSClass: v1.PodQOSBestEffort, + Conditions: []v1.PodCondition{ + {Type: v1.PodInitialized, Status: v1.ConditionTrue, Reason: "PodCompleted"}, + {Type: v1.PodReady, Status: v1.ConditionFalse, Reason: "PodCompleted"}, + {Type: v1.ContainersReady, Status: v1.ConditionFalse, Reason: "PodCompleted"}, + {Type: v1.PodScheduled, Status: v1.ConditionTrue}, + }, + ContainerStatuses: []v1.ContainerStatus{ + ready(waitingWithLastTerminationUnknown("containerA", 1)), + ready(waitingWithLastTerminationUnknown("containerB", 1)), + }, + }, + }, + { + name: "running can revert to pending", + pod: &v1.Pod{ + Spec: desiredState, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + runningState("containerA"), + runningState("containerB"), + }, + }, + }, + currentStatus: &kubecontainer.PodStatus{}, + previousStatus: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + waitingState("containerA"), + waitingState("containerB"), + }, + }, + expected: v1.PodStatus{ + Phase: v1.PodPending, + HostIP: "127.0.0.1", + QOSClass: v1.PodQOSBestEffort, + Conditions: []v1.PodCondition{ + {Type: v1.PodInitialized, Status: v1.ConditionTrue}, + {Type: v1.PodReady, Status: v1.ConditionTrue}, + {Type: v1.ContainersReady, Status: v1.ConditionTrue}, + {Type: v1.PodScheduled, Status: v1.ConditionTrue}, + }, + ContainerStatuses: []v1.ContainerStatus{ + ready(waitingStateWithReason("containerA", "ContainerCreating")), + ready(waitingStateWithReason("containerB", "ContainerCreating")), + }, + }, + }, + { + name: "reason and message are preserved when phase doesn't change", + pod: &v1.Pod{ + Spec: desiredState, + Status: v1.PodStatus{ + Phase: v1.PodRunning, + ContainerStatuses: []v1.ContainerStatus{ + waitingState("containerA"), + waitingState("containerB"), + }, + }, + }, + currentStatus: &kubecontainer.PodStatus{ + ContainerStatuses: []*kubecontainer.Status{ + { + ID: kubecontainer.ContainerID{ID: "foo"}, + Name: "containerB", + StartedAt: time.Unix(1, 0).UTC(), + State: kubecontainer.ContainerStateRunning, + }, + }, + }, + previousStatus: v1.PodStatus{ + Phase: v1.PodPending, + Reason: "Test", + Message: "test", + ContainerStatuses: []v1.ContainerStatus{ + waitingState("containerA"), + runningState("containerB"), + }, + }, + expected: v1.PodStatus{ + Phase: v1.PodPending, + Reason: "Test", + Message: "test", + HostIP: "127.0.0.1", + QOSClass: v1.PodQOSBestEffort, + Conditions: []v1.PodCondition{ + {Type: v1.PodInitialized, Status: v1.ConditionTrue}, + {Type: v1.PodReady, Status: v1.ConditionTrue}, + {Type: v1.ContainersReady, Status: v1.ConditionTrue}, + {Type: v1.PodScheduled, Status: v1.ConditionTrue}, + }, + ContainerStatuses: []v1.ContainerStatus{ + ready(waitingStateWithReason("containerA", "ContainerCreating")), + ready(withID(runningStateWithStartedAt("containerB", time.Unix(1, 0).UTC()), "://foo")), + }, + }, + }, + { + name: "reason and message are cleared when 
phase changes", + pod: &v1.Pod{ + Spec: desiredState, + Status: v1.PodStatus{ + Phase: v1.PodPending, + ContainerStatuses: []v1.ContainerStatus{ + waitingState("containerA"), + waitingState("containerB"), + }, + }, + }, + currentStatus: &kubecontainer.PodStatus{ + ContainerStatuses: []*kubecontainer.Status{ + { + ID: kubecontainer.ContainerID{ID: "c1"}, + Name: "containerA", + StartedAt: time.Unix(1, 0).UTC(), + State: kubecontainer.ContainerStateRunning, + }, + { + ID: kubecontainer.ContainerID{ID: "c2"}, + Name: "containerB", + StartedAt: time.Unix(2, 0).UTC(), + State: kubecontainer.ContainerStateRunning, + }, + }, + }, + previousStatus: v1.PodStatus{ + Phase: v1.PodPending, + Reason: "Test", + Message: "test", + ContainerStatuses: []v1.ContainerStatus{ + runningState("containerA"), + runningState("containerB"), + }, + }, + expected: v1.PodStatus{ + Phase: v1.PodRunning, + HostIP: "127.0.0.1", + QOSClass: v1.PodQOSBestEffort, + Conditions: []v1.PodCondition{ + {Type: v1.PodInitialized, Status: v1.ConditionTrue}, + {Type: v1.PodReady, Status: v1.ConditionTrue}, + {Type: v1.ContainersReady, Status: v1.ConditionTrue}, + {Type: v1.PodScheduled, Status: v1.ConditionTrue}, + }, + ContainerStatuses: []v1.ContainerStatus{ + ready(withID(runningStateWithStartedAt("containerA", time.Unix(1, 0).UTC()), "://c1")), + ready(withID(runningStateWithStartedAt("containerB", time.Unix(2, 0).UTC()), "://c2")), + }, + }, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */) + defer testKubelet.Cleanup() + kl := testKubelet.kubelet + kl.statusManager.SetPodStatus(test.pod, test.previousStatus) + for _, name := range test.unreadyContainer { + kl.readinessManager.Set(kubecontainer.BuildContainerID("", findContainerStatusByName(test.expected, name).ContainerID), results.Failure, test.pod) + } + actual := kl.generateAPIPodStatus(test.pod, test.currentStatus) + if !apiequality.Semantic.DeepEqual(test.expected, actual) { + t.Fatalf("Unexpected status: %s", diff.ObjectReflectDiff(actual, test.expected)) + } + }) + } +} + +func findContainerStatusByName(status v1.PodStatus, name string) *v1.ContainerStatus { + for i, c := range status.InitContainerStatuses { + if c.Name == name { + return &status.InitContainerStatuses[i] + } + } + for i, c := range status.ContainerStatuses { + if c.Name == name { + return &status.ContainerStatuses[i] + } + } + for i, c := range status.EphemeralContainerStatuses { + if c.Name == name { + return &status.EphemeralContainerStatuses[i] + } + } + return nil +} + func TestGetExec(t *testing.T) { const ( podName = "podFoo" From c2a6d07b8f069663784456d6a8020a64a831f781 Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Mon, 19 Jul 2021 17:55:18 -0400 Subject: [PATCH 2/3] kubelet: Avoid allocating multiple times during status Noticed while reviewing this code path. We can assume the temporary slice should be about the same size as it was previously. 
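As a standalone illustration of the pattern this commit applies (hypothetical names below, not the kubelet code itself): pre-sizing the destination slice with make gives append its full capacity up front, so the copy loop allocates once instead of repeatedly growing and re-copying the backing array.

    package main

    import "fmt"

    type containerStatus struct{ Name string }

    func collect(statuses map[string]*containerStatus) []containerStatus {
        // Allocate once with the known capacity; append only fills it in.
        out := make([]containerStatus, 0, len(statuses))
        for _, s := range statuses {
            out = append(out, *s)
        }
        return out
    }

    func main() {
        statuses := map[string]*containerStatus{
            "a": {Name: "a"},
            "b": {Name: "b"},
            "c": {Name: "c"},
        }
        got := collect(statuses)
        fmt.Println(len(got), cap(got)) // prints "3 3": one allocation, no regrowth
    }

The hunk below applies the same idea in convertToAPIContainerStatuses, where len(statuses) is known before the loop runs.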
---
 pkg/kubelet/kubelet_pods.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go
index 8d8069fed83..277502633f3 100644
--- a/pkg/kubelet/kubelet_pods.go
+++ b/pkg/kubelet/kubelet_pods.go
@@ -1782,7 +1782,7 @@ func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecon
 	if isInitContainer {
 		return kubetypes.SortStatusesOfInitContainers(pod, statuses)
 	}
-	var containerStatuses []v1.ContainerStatus
+	containerStatuses := make([]v1.ContainerStatus, 0, len(statuses))
 	for _, status := range statuses {
 		containerStatuses = append(containerStatuses, *status)
 	}

From d7ee024cc5d0bf09bb8b78941f5d4c6e3c56c676 Mon Sep 17 00:00:00 2001
From: Clayton Coleman
Date: Mon, 19 Jul 2021 17:56:22 -0400
Subject: [PATCH 3/3] kubelet: Make condition processing in one spot

The list of status conditions should be calculated all in one place;
having the calculation split between generateAPIPodStatus and
convertStatusToAPIStatus made review more complex. Readability only.
(A standalone sketch of the condition merge follows the final hunk
below.)
---
 pkg/kubelet/kubelet_pods.go | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go
index 277502633f3..136080c08f1 100644
--- a/pkg/kubelet/kubelet_pods.go
+++ b/pkg/kubelet/kubelet_pods.go
@@ -1421,21 +1421,26 @@ func (kl *Kubelet) generateAPIPodStatus(pod *v1.Pod, podStatus *kubecontainer.Po
 		}
 	}
 
-	spec := &pod.Spec
-
 	// ensure the probe managers have up to date status for containers
 	kl.probeManager.UpdatePodStatus(pod.UID, s)
-	s.Conditions = append(s.Conditions, status.GeneratePodInitializedCondition(spec, s.InitContainerStatuses, s.Phase))
-	s.Conditions = append(s.Conditions, status.GeneratePodReadyCondition(spec, s.Conditions, s.ContainerStatuses, s.Phase))
-	s.Conditions = append(s.Conditions, status.GenerateContainersReadyCondition(spec, s.ContainerStatuses, s.Phase))
-	// Status manager will take care of the LastTransitionTimestamp, either preserve
-	// the timestamp from apiserver, or set a new one. When kubelet sees the pod,
-	// `PodScheduled` condition must be true.
+
+	// preserve all conditions not owned by the kubelet
+	s.Conditions = make([]v1.PodCondition, 0, len(pod.Status.Conditions)+1)
+	for _, c := range pod.Status.Conditions {
+		if !kubetypes.PodConditionByKubelet(c.Type) {
+			s.Conditions = append(s.Conditions, c)
+		}
+	}
+	// set all Kubelet-owned conditions
+	s.Conditions = append(s.Conditions, status.GeneratePodInitializedCondition(&pod.Spec, s.InitContainerStatuses, s.Phase))
+	s.Conditions = append(s.Conditions, status.GeneratePodReadyCondition(&pod.Spec, s.Conditions, s.ContainerStatuses, s.Phase))
+	s.Conditions = append(s.Conditions, status.GenerateContainersReadyCondition(&pod.Spec, s.ContainerStatuses, s.Phase))
 	s.Conditions = append(s.Conditions, v1.PodCondition{
 		Type:   v1.PodScheduled,
 		Status: v1.ConditionTrue,
 	})
+
+	// set HostIP and initialize PodIP/PodIPs for host network pods
 	if kl.kubeClient != nil {
 		hostIPs, err := kl.getHostIPsAnyWay()
 		if err != nil {
@@ -1541,13 +1546,6 @@ func (kl *Kubelet) convertStatusToAPIStatus(pod *v1.Pod, podStatus *kubecontaine
 		)
 	}
 
-	// Preserves conditions not controlled by kubelet
-	for _, c := range pod.Status.Conditions {
-		if !kubetypes.PodConditionByKubelet(c.Type) {
-			apiPodStatus.Conditions = append(apiPodStatus.Conditions, c)
-		}
-	}
-
 	return &apiPodStatus
 }
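For reference, here is a small self-contained sketch of the condition handling that PATCH 3/3 consolidates (hypothetical types; ownedByKubelet stands in for kubetypes.PodConditionByKubelet, and the real code also feeds the partially built list into the ready-condition generators): carry over every condition the kubelet does not own, then append the kubelet-owned conditions computed during this sync.

    package main

    import "fmt"

    type condition struct {
        Type   string
        Status string
    }

    // ownedByKubelet reports whether the kubelet is the writer of this
    // condition type (a stand-in for kubetypes.PodConditionByKubelet).
    func ownedByKubelet(t string) bool {
        switch t {
        case "Initialized", "Ready", "ContainersReady", "PodScheduled":
            return true
        }
        return false
    }

    // mergeConditions keeps conditions set by other components and then
    // appends the conditions the kubelet computed for this sync.
    func mergeConditions(previous, computed []condition) []condition {
        merged := make([]condition, 0, len(previous)+len(computed))
        for _, c := range previous {
            if !ownedByKubelet(c.Type) {
                merged = append(merged, c)
            }
        }
        return append(merged, computed...)
    }

    func main() {
        previous := []condition{
            {Type: "Ready", Status: "True"},
            {Type: "example.com/custom-gate", Status: "True"}, // survives the merge
        }
        computed := []condition{
            {Type: "Initialized", Status: "True"},
            {Type: "Ready", Status: "False"},
            {Type: "ContainersReady", Status: "False"},
            {Type: "PodScheduled", Status: "True"},
        }
        fmt.Println(mergeConditions(previous, computed))
    }

Rebuilding the list in generateAPIPodStatus means a single method owns the final set and ordering of conditions, which is what makes this change a readability win.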