diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container.go b/pkg/kubelet/kuberuntime/kuberuntime_container.go
index dcbcc0589fc..5720576e07f 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_container.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_container.go
@@ -227,7 +227,7 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb
 		msg, handlerErr := m.runner.Run(kubeContainerID, pod, container, container.Lifecycle.PostStart)
 		if handlerErr != nil {
 			m.recordContainerEvent(pod, container, kubeContainerID.ID, v1.EventTypeWarning, events.FailedPostStartHook, msg)
-			if err := m.killContainer(pod, kubeContainerID, container.Name, "FailedPostStartHook", nil); err != nil {
+			if err := m.killContainer(pod, kubeContainerID, container.Name, "FailedPostStartHook", reasonFailedPostStartHook, nil); err != nil {
 				klog.ErrorS(fmt.Errorf("%s: %v", ErrPostStartHook, handlerErr), "Failed to kill container", "pod", klog.KObj(pod),
 					"podUID", pod.UID, "containerName", container.Name, "containerID", kubeContainerID.String())
 			}
@@ -596,7 +596,7 @@ func (m *kubeGenericRuntimeManager) restoreSpecsFromContainerLabels(containerID
 // killContainer kills a container through the following steps:
 // * Run the pre-stop lifecycle hooks (if applicable).
 // * Stop the container.
-func (m *kubeGenericRuntimeManager) killContainer(pod *v1.Pod, containerID kubecontainer.ContainerID, containerName string, message string, gracePeriodOverride *int64) error {
+func (m *kubeGenericRuntimeManager) killContainer(pod *v1.Pod, containerID kubecontainer.ContainerID, containerName string, message string, reason containerKillReason, gracePeriodOverride *int64) error {
 	var containerSpec *v1.Container
 	if pod != nil {
 		if containerSpec = kubecontainer.GetContainerSpec(pod, containerName); containerSpec == nil {
@@ -619,6 +619,19 @@ func (m *kubeGenericRuntimeManager) killContainer(pod *v1.Pod, containerID kubec
 		gracePeriod = *pod.DeletionGracePeriodSeconds
 	case pod.Spec.TerminationGracePeriodSeconds != nil:
 		gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
+
+		if utilfeature.DefaultFeatureGate.Enabled(features.ProbeTerminationGracePeriod) {
+			switch reason {
+			case reasonStartupProbe:
+				if containerSpec.StartupProbe != nil && containerSpec.StartupProbe.TerminationGracePeriodSeconds != nil {
+					gracePeriod = *containerSpec.StartupProbe.TerminationGracePeriodSeconds
+				}
+			case reasonLivenessProbe:
+				if containerSpec.LivenessProbe != nil && containerSpec.LivenessProbe.TerminationGracePeriodSeconds != nil {
+					gracePeriod = *containerSpec.LivenessProbe.TerminationGracePeriodSeconds
+				}
+			}
+		}
 	}
 
 	if len(message) == 0 {
@@ -672,7 +685,7 @@ func (m *kubeGenericRuntimeManager) killContainersWithSyncResult(pod *v1.Pod, ru
 			defer wg.Done()
 
 			killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, container.Name)
-			if err := m.killContainer(pod, container.ID, container.Name, "", gracePeriodOverride); err != nil {
+			if err := m.killContainer(pod, container.ID, container.Name, "", reasonUnknown, gracePeriodOverride); err != nil {
 				killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
 				klog.ErrorS(err, "Kill container failed", "pod", klog.KObj(pod), "podUID", pod.UID,
 					"containerName", container.Name, "containerID", container.ID)
diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_test.go b/pkg/kubelet/kuberuntime/kuberuntime_container_test.go
index c1c053348c1..51ad8c73c9a 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_container_test.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_container_test.go
@@ -120,7 +120,7 @@ func TestKillContainer(t *testing.T) {
 	}
 
 	for _, test := range tests {
-		err := m.killContainer(test.pod, test.containerID, test.containerName, test.reason, &test.gracePeriodOverride)
+		err := m.killContainer(test.pod, test.containerID, test.containerName, test.reason, "", &test.gracePeriodOverride)
 		if test.succeed != (err == nil) {
 			t.Errorf("%s: expected %v, got %v (%v)", test.caseName, test.succeed, (err == nil), err)
 		}
@@ -290,7 +290,7 @@ func TestLifeCycleHook(t *testing.T) {
 	// Configured and works as expected
 	t.Run("PreStop-CMDExec", func(t *testing.T) {
 		testPod.Spec.Containers[0].Lifecycle = cmdLifeCycle
-		m.killContainer(testPod, cID, "foo", "testKill", &gracePeriod)
+		m.killContainer(testPod, cID, "foo", "testKill", "", &gracePeriod)
 		if fakeRunner.Cmd[0] != cmdLifeCycle.PreStop.Exec.Command[0] {
 			t.Errorf("CMD Prestop hook was not invoked")
 		}
@@ -300,7 +300,7 @@ func TestLifeCycleHook(t *testing.T) {
 	t.Run("PreStop-HTTPGet", func(t *testing.T) {
 		defer func() { fakeHTTP.url = "" }()
 		testPod.Spec.Containers[0].Lifecycle = httpLifeCycle
-		m.killContainer(testPod, cID, "foo", "testKill", &gracePeriod)
+		m.killContainer(testPod, cID, "foo", "testKill", "", &gracePeriod)
 
 		if !strings.Contains(fakeHTTP.url, httpLifeCycle.PreStop.HTTPGet.Host) {
 			t.Errorf("HTTP Prestop hook was not invoked")
@@ -314,7 +314,7 @@ func TestLifeCycleHook(t *testing.T) {
 		testPod.DeletionGracePeriodSeconds = &gracePeriodLocal
 		testPod.Spec.TerminationGracePeriodSeconds = &gracePeriodLocal
 
-		m.killContainer(testPod, cID, "foo", "testKill", &gracePeriodLocal)
+		m.killContainer(testPod, cID, "foo", "testKill", "", &gracePeriodLocal)
 
 		if strings.Contains(fakeHTTP.url, httpLifeCycle.PreStop.HTTPGet.Host) {
 			t.Errorf("HTTP Should not execute when gracePeriod is 0")
diff --git a/pkg/kubelet/kuberuntime/kuberuntime_gc.go b/pkg/kubelet/kuberuntime/kuberuntime_gc.go
index 8c4f786db9b..b88eb97b72c 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_gc.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_gc.go
@@ -137,7 +137,7 @@ func (cgc *containerGC) removeOldestN(containers []containerGCInfo, toRemove int
 				ID:   containers[i].id,
 			}
 			message := "Container is in unknown state, try killing it before removal"
-			if err := cgc.manager.killContainer(nil, id, containers[i].name, message, nil); err != nil {
+			if err := cgc.manager.killContainer(nil, id, containers[i].name, message, reasonUnknown, nil); err != nil {
 				klog.Errorf("Failed to stop container %q: %v", containers[i].id, err)
 				continue
 			}
diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go
index 4071b904026..e82ba53d979 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go
@@ -403,6 +403,16 @@ func (m *kubeGenericRuntimeManager) GetPods(all bool) ([]*kubecontainer.Pod, err
 	return result, nil
 }
 
+// containerKillReason explains what killed a given container
+type containerKillReason string
+
+const (
+	reasonStartupProbe        containerKillReason = "StartupProbe"
+	reasonLivenessProbe       containerKillReason = "LivenessProbe"
+	reasonFailedPostStartHook containerKillReason = "FailedPostStartHook"
+	reasonUnknown             containerKillReason = "Unknown"
+)
+
 // containerToKillInfo contains necessary information to kill a container.
 type containerToKillInfo struct {
 	// The spec of the container.
@@ -411,6 +421,9 @@ type containerToKillInfo struct {
 	name string
 	// The message indicates why the container will be killed.
 	message string
+	// The reason is a clearer source of info on why a container will be killed
+	// TODO: replace message with reason?
+	reason containerKillReason
 }
 
 // podActions keeps information what to do for a pod.
@@ -582,6 +595,7 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku
 						container: next,
 						message: fmt.Sprintf("Init container is in %q state, try killing it before restart",
 							initLastStatus.State),
+						reason: reasonUnknown,
 					}
 				}
 				changes.NextInitContainerToStart = next
@@ -623,6 +637,7 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku
 						container: &pod.Spec.Containers[idx],
 						message: fmt.Sprintf("Container is in %q state, try killing it before restart",
 							containerStatus.State),
+						reason: reasonUnknown,
 					}
 				}
 			}
@@ -630,6 +645,7 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku
 		}
 		// The container is running, but kill the container if any of the following condition is met.
 		var message string
+		var reason containerKillReason
 		restart := shouldRestartOnFailure(pod)
 		if _, _, changed := containerChanged(&container, containerStatus); changed {
 			message = fmt.Sprintf("Container %s definition changed", container.Name)
@@ -639,9 +655,11 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku
 		} else if liveness, found := m.livenessManager.Get(containerStatus.ID); found && liveness == proberesults.Failure {
 			// If the container failed the liveness probe, we should kill it.
 			message = fmt.Sprintf("Container %s failed liveness probe", container.Name)
+			reason = reasonLivenessProbe
 		} else if startup, found := m.startupManager.Get(containerStatus.ID); found && startup == proberesults.Failure {
 			// If the container failed the startup probe, we should kill it.
 			message = fmt.Sprintf("Container %s failed startup probe", container.Name)
+			reason = reasonStartupProbe
 		} else {
 			// Keep the container.
 			keepCount++
@@ -660,6 +678,7 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku
 			name:      containerStatus.Name,
 			container: &pod.Spec.Containers[idx],
 			message:   message,
+			reason:    reason,
 		}
 		klog.V(2).InfoS("Message for Container of pod", "containerName", container.Name, "containerStatusID", containerStatus.ID, "pod", klog.KObj(pod), "containerMessage", message)
 	}
@@ -720,7 +739,7 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine
 		klog.V(3).InfoS("Killing unwanted container for pod", "containerName", containerInfo.name, "containerID", containerID, "pod", klog.KObj(pod))
 		killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, containerInfo.name)
 		result.AddSyncResult(killContainerResult)
-		if err := m.killContainer(pod, containerID, containerInfo.name, containerInfo.message, nil); err != nil {
+		if err := m.killContainer(pod, containerID, containerInfo.name, containerInfo.message, containerInfo.reason, nil); err != nil {
 			killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
 			klog.ErrorS(err, "killContainer for pod failed", "containerName", containerInfo.name, "containerID", containerID, "pod", klog.KObj(pod))
 			return
diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go b/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go
index 7af16beb85a..2859457614d 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go
@@ -1002,9 +1002,10 @@ func getKillMapWithInitContainers(pod *v1.Pod, status *kubecontainer.PodStatus,
 
 func verifyActions(t *testing.T, expected, actual *podActions, desc string) {
 	if actual.ContainersToKill != nil {
-		// Clear the message field since we don't need to verify the message.
+		// Clear the message and reason fields since we don't need to verify them.
 		for k, info := range actual.ContainersToKill {
 			info.message = ""
+			info.reason = ""
 			actual.ContainersToKill[k] = info
 		}
 	}
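Net effect of the gated block in killContainer: when a container is killed because its liveness or startup probe failed, and the ProbeTerminationGracePeriod feature gate is enabled, a terminationGracePeriodSeconds set on that probe overrides the pod-level value; every other kill reason keeps the pod-level (or deletion) grace period. The standalone sketch below is not part of this patch and is simplified (it skips the DeletionGracePeriodSeconds case and the gracePeriodOverride handling, and the function gracePeriodForKill plus the example values are made up for illustration); it only mirrors the selection logic using the k8s.io/api/core/v1 types, assuming an API version that already has the probe-level field (v0.21+).

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// gracePeriodForKill mirrors the feature-gated branch added to killContainer:
// start from the pod-level terminationGracePeriodSeconds (falling back to the
// kubelet's 2s minimum) and, when the kill was triggered by a liveness or
// startup probe, prefer that probe's terminationGracePeriodSeconds if set.
func gracePeriodForKill(pod *v1.Pod, c *v1.Container, reason string) int64 {
	gracePeriod := int64(2)
	if pod.Spec.TerminationGracePeriodSeconds != nil {
		gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
	}
	switch reason {
	case "StartupProbe":
		if c.StartupProbe != nil && c.StartupProbe.TerminationGracePeriodSeconds != nil {
			gracePeriod = *c.StartupProbe.TerminationGracePeriodSeconds
		}
	case "LivenessProbe":
		if c.LivenessProbe != nil && c.LivenessProbe.TerminationGracePeriodSeconds != nil {
			gracePeriod = *c.LivenessProbe.TerminationGracePeriodSeconds
		}
	}
	return gracePeriod
}

func main() {
	podGrace, probeGrace := int64(30), int64(5)
	container := v1.Container{
		Name:          "app",
		LivenessProbe: &v1.Probe{TerminationGracePeriodSeconds: &probeGrace},
	}
	pod := v1.Pod{Spec: v1.PodSpec{
		TerminationGracePeriodSeconds: &podGrace,
		Containers:                    []v1.Container{container},
	}}

	// A liveness-probe kill honors the probe-level override (prints 5)...
	fmt.Println(gracePeriodForKill(&pod, &container, "LivenessProbe"))
	// ...while any other kill reason keeps the pod-level value (prints 30).
	fmt.Println(gracePeriodForKill(&pod, &container, "Unknown"))
}

Running the sketch prints 5 then 30, which is the behavior the patch threads through killContainer via the new containerKillReason argument: the probe-level value only applies when the kill reason is the corresponding probe failure.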