Merge pull request #102821 from ehashman/phase-fix

Ensure kubelet statuses can handle loss of container runtime state
This commit is contained in:
Kubernetes Prow Robot 2021-06-28 15:38:40 -07:00 committed by GitHub
commit 15d3c3a5e2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 121 additions and 13 deletions

View File

@ -1529,11 +1529,12 @@ func (kl *Kubelet) generateAPIPodStatus(pod *v1.Pod, podStatus *kubecontainer.Po
spec := &pod.Spec
allStatus := append(append([]v1.ContainerStatus{}, s.ContainerStatuses...), s.InitContainerStatuses...)
s.Phase = getPhase(spec, allStatus)
klog.V(4).InfoS("Got phase for pod", "pod", klog.KObj(pod), "phase", s.Phase)
// Check for illegal phase transition
if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded {
// API server shows terminal phase; transitions are not allowed
if s.Phase != pod.Status.Phase {
klog.ErrorS(nil, "Pod attempted illegal phase transition", "originalStatusPhase", pod.Status.Phase, "apiStatusPhase", s.Phase, "apiStatus", s)
klog.ErrorS(nil, "Pod attempted illegal phase transition", "pod", klog.KObj(pod), "originalStatusPhase", pod.Status.Phase, "apiStatusPhase", s.Phase, "apiStatus", s)
// Force back to phase from the API server
s.Phase = pod.Status.Phase
}
@ -1736,8 +1737,6 @@ func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecon
oldStatus, found := oldStatuses[container.Name]
if found {
if oldStatus.State.Terminated != nil {
// Do not update status on terminated init containers as
// they be removed at any time.
status = &oldStatus
} else {
// Apply some values from the old statuses as the default values.
@ -1760,7 +1759,7 @@ func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecon
continue
}
// if no container is found, then assuming it should be waiting seems plausible, but the status code requires
// that a previous termination be present. If we're offline long enough (or something removed the container?), then
// that a previous termination be present. If we're offline long enough or something removed the container, then
// the previous termination may not be present. This next code block ensures that if the container was previously running
// then when that container status disappears, we can infer that it terminated even if we don't know the status code.
// By setting the lasttermination state we are able to leave the container status waiting and present more accurate
@ -1779,21 +1778,17 @@ func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecon
continue
}
if pod.DeletionTimestamp == nil {
continue
}
// and if the pod itself is being deleted, then the CRI may have removed the container already and for whatever reason the kubelet missed the exit code
// (this seems not awesome). We know at this point that we will not be restarting the container.
// If we're here, we know the pod was previously running, but doesn't have a terminated status. We will check now to
// see if it's in a pending state.
status := statuses[container.Name]
// if the status we're about to write indicates the default, the Waiting status will force this pod back into Pending.
// That isn't true, we know the pod is going away.
// If the status we're about to write indicates the default, the Waiting status will force this pod back into Pending.
// That isn't true, we know the pod was previously running.
isDefaultWaitingStatus := status.State.Waiting != nil && status.State.Waiting.Reason == ContainerCreating
if hasInitContainers {
isDefaultWaitingStatus = status.State.Waiting != nil && status.State.Waiting.Reason == PodInitializing
}
if !isDefaultWaitingStatus {
// we the status was written, don't override
// the status was written, don't override
continue
}
if status.LastTerminationState.Terminated != nil {
@ -1809,6 +1804,12 @@ func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecon
Message: "The container could not be located when the pod was deleted. The container used to be Running",
ExitCode: 137,
}
// If the pod was not deleted, then it's been restarted. Increment restart count.
if pod.DeletionTimestamp == nil {
status.RestartCount += 1
}
statuses[container.Name] = status
}

View File

@ -1782,6 +1782,22 @@ func failedState(cName string) v1.ContainerStatus {
},
}
}
// waitingWithLastTerminationUnknown builds the expected ContainerStatus for a
// container whose runtime state has been lost: the container is reported as
// waiting in ContainerCreating, while its last termination is recorded as
// unknown with exit code 137, carrying the supplied restart count.
func waitingWithLastTerminationUnknown(cName string, restartCount int32) v1.ContainerStatus {
	// Current state: the default "creating" waiting state.
	waiting := v1.ContainerState{
		Waiting: &v1.ContainerStateWaiting{Reason: "ContainerCreating"},
	}
	// Last termination: synthesized because the runtime no longer knows
	// anything about the container.
	lastTermination := v1.ContainerState{
		Terminated: &v1.ContainerStateTerminated{
			Reason:   "ContainerStatusUnknown",
			Message:  "The container could not be located when the pod was deleted. The container used to be Running",
			ExitCode: 137,
		},
	}
	return v1.ContainerStatus{
		Name:                 cName,
		State:                waiting,
		LastTerminationState: lastTermination,
		RestartCount:         restartCount,
	}
}
func TestPodPhaseWithRestartAlways(t *testing.T) {
desiredState := v1.PodSpec{
@ -2306,6 +2322,97 @@ func TestPodPhaseWithRestartOnFailure(t *testing.T) {
// func TestPodPhaseWithRestartOnFailureInitContainers(t *testing.T) {
// }
// TestConvertToAPIContainerStatuses verifies that convertToAPIContainerStatuses
// synthesizes an "unknown termination" status when the container runtime has no
// record of containers that were previously reported as running: the restart
// count stays unchanged when the pod is being deleted, and is incremented when
// the pod is not being deleted (the container is presumed restarted).
func TestConvertToAPIContainerStatuses(t *testing.T) {
	desiredState := v1.PodSpec{
		NodeName: "machine",
		Containers: []v1.Container{
			{Name: "containerA"},
			{Name: "containerB"},
		},
		RestartPolicy: v1.RestartPolicyAlways,
	}
	now := metav1.Now()

	tests := []struct {
		name              string
		pod               *v1.Pod
		currentStatus     *kubecontainer.PodStatus
		previousStatus    []v1.ContainerStatus
		containers        []v1.Container
		hasInitContainers bool
		isInitContainer   bool
		expected          []v1.ContainerStatus
	}{
		{
			name: "no current status, with previous statuses and deletion",
			pod: &v1.Pod{
				Spec: desiredState,
				Status: v1.PodStatus{
					ContainerStatuses: []v1.ContainerStatus{
						runningState("containerA"),
						runningState("containerB"),
					},
				},
				ObjectMeta: metav1.ObjectMeta{Name: "my-pod", DeletionTimestamp: &now},
			},
			currentStatus: &kubecontainer.PodStatus{},
			previousStatus: []v1.ContainerStatus{
				runningState("containerA"),
				runningState("containerB"),
			},
			containers: desiredState.Containers,
			// no init containers
			// is not an init container
			expected: []v1.ContainerStatus{
				// Pod is being deleted: restart count is not incremented.
				waitingWithLastTerminationUnknown("containerA", 0),
				waitingWithLastTerminationUnknown("containerB", 0),
			},
		},
		{
			name: "no current status, with previous statuses and no deletion",
			pod: &v1.Pod{
				Spec: desiredState,
				Status: v1.PodStatus{
					ContainerStatuses: []v1.ContainerStatus{
						runningState("containerA"),
						runningState("containerB"),
					},
				},
			},
			currentStatus: &kubecontainer.PodStatus{},
			previousStatus: []v1.ContainerStatus{
				runningState("containerA"),
				runningState("containerB"),
			},
			containers: desiredState.Containers,
			// no init containers
			// is not an init container
			expected: []v1.ContainerStatus{
				// Pod is not being deleted: the container is presumed
				// restarted, so the restart count is incremented.
				waitingWithLastTerminationUnknown("containerA", 1),
				waitingWithLastTerminationUnknown("containerB", 1),
			},
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
			defer testKubelet.Cleanup()
			kl := testKubelet.kubelet
			containerStatuses := kl.convertToAPIContainerStatuses(
				test.pod,
				test.currentStatus,
				test.previousStatus,
				test.containers,
				test.hasInitContainers,
				test.isInitContainer,
			)
			// Compare the whole slice so a missing or extra status fails the
			// test instead of being silently skipped (per-index ranging over
			// the actual result would pass when it is shorter than expected,
			// and panic rather than fail when it is longer).
			assert.Equal(t, test.expected, containerStatuses, "[test %s]", test.name)
		})
	}
}
func TestGetExec(t *testing.T) {
const (
podName = "podFoo"