Override terminationLivenessGracePeriod for probes

This commit is contained in:
Elana Hashman 2021-03-08 18:06:39 -08:00
parent c88512d34f
commit 9fb6e712ff
No known key found for this signature in database
GPG Key ID: D37F7B2A20B48FA0
5 changed files with 43 additions and 10 deletions

View File

@ -227,7 +227,7 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb
msg, handlerErr := m.runner.Run(kubeContainerID, pod, container, container.Lifecycle.PostStart)
if handlerErr != nil {
m.recordContainerEvent(pod, container, kubeContainerID.ID, v1.EventTypeWarning, events.FailedPostStartHook, msg)
if err := m.killContainer(pod, kubeContainerID, container.Name, "FailedPostStartHook", nil); err != nil {
if err := m.killContainer(pod, kubeContainerID, container.Name, "FailedPostStartHook", reasonFailedPostStartHook, nil); err != nil {
klog.ErrorS(fmt.Errorf("%s: %v", ErrPostStartHook, handlerErr), "Failed to kill container", "pod", klog.KObj(pod),
"podUID", pod.UID, "containerName", container.Name, "containerID", kubeContainerID.String())
}
@ -596,7 +596,7 @@ func (m *kubeGenericRuntimeManager) restoreSpecsFromContainerLabels(containerID
// killContainer kills a container through the following steps:
// * Run the pre-stop lifecycle hooks (if applicable).
// * Stop the container.
func (m *kubeGenericRuntimeManager) killContainer(pod *v1.Pod, containerID kubecontainer.ContainerID, containerName string, message string, gracePeriodOverride *int64) error {
func (m *kubeGenericRuntimeManager) killContainer(pod *v1.Pod, containerID kubecontainer.ContainerID, containerName string, message string, reason containerKillReason, gracePeriodOverride *int64) error {
var containerSpec *v1.Container
if pod != nil {
if containerSpec = kubecontainer.GetContainerSpec(pod, containerName); containerSpec == nil {
@ -619,6 +619,19 @@ func (m *kubeGenericRuntimeManager) killContainer(pod *v1.Pod, containerID kubec
gracePeriod = *pod.DeletionGracePeriodSeconds
case pod.Spec.TerminationGracePeriodSeconds != nil:
gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
if utilfeature.DefaultFeatureGate.Enabled(features.ProbeTerminationGracePeriod) {
switch reason {
case reasonStartupProbe:
if containerSpec.StartupProbe != nil && containerSpec.StartupProbe.TerminationGracePeriodSeconds != nil {
gracePeriod = *containerSpec.StartupProbe.TerminationGracePeriodSeconds
}
case reasonLivenessProbe:
if containerSpec.LivenessProbe != nil && containerSpec.LivenessProbe.TerminationGracePeriodSeconds != nil {
gracePeriod = *containerSpec.LivenessProbe.TerminationGracePeriodSeconds
}
}
}
}
if len(message) == 0 {
@ -672,7 +685,7 @@ func (m *kubeGenericRuntimeManager) killContainersWithSyncResult(pod *v1.Pod, ru
defer wg.Done()
killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, container.Name)
if err := m.killContainer(pod, container.ID, container.Name, "", gracePeriodOverride); err != nil {
if err := m.killContainer(pod, container.ID, container.Name, "", reasonUnknown, gracePeriodOverride); err != nil {
killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
klog.ErrorS(err, "Kill container failed", "pod", klog.KObj(pod), "podUID", pod.UID,
"containerName", container.Name, "containerID", container.ID)

View File

@ -120,7 +120,7 @@ func TestKillContainer(t *testing.T) {
}
for _, test := range tests {
err := m.killContainer(test.pod, test.containerID, test.containerName, test.reason, &test.gracePeriodOverride)
err := m.killContainer(test.pod, test.containerID, test.containerName, test.reason, "", &test.gracePeriodOverride)
if test.succeed != (err == nil) {
t.Errorf("%s: expected %v, got %v (%v)", test.caseName, test.succeed, (err == nil), err)
}
@ -290,7 +290,7 @@ func TestLifeCycleHook(t *testing.T) {
// Configured and works as expected
t.Run("PreStop-CMDExec", func(t *testing.T) {
testPod.Spec.Containers[0].Lifecycle = cmdLifeCycle
m.killContainer(testPod, cID, "foo", "testKill", &gracePeriod)
m.killContainer(testPod, cID, "foo", "testKill", "", &gracePeriod)
if fakeRunner.Cmd[0] != cmdLifeCycle.PreStop.Exec.Command[0] {
t.Errorf("CMD Prestop hook was not invoked")
}
@ -300,7 +300,7 @@ func TestLifeCycleHook(t *testing.T) {
t.Run("PreStop-HTTPGet", func(t *testing.T) {
defer func() { fakeHTTP.url = "" }()
testPod.Spec.Containers[0].Lifecycle = httpLifeCycle
m.killContainer(testPod, cID, "foo", "testKill", &gracePeriod)
m.killContainer(testPod, cID, "foo", "testKill", "", &gracePeriod)
if !strings.Contains(fakeHTTP.url, httpLifeCycle.PreStop.HTTPGet.Host) {
t.Errorf("HTTP Prestop hook was not invoked")
@ -314,7 +314,7 @@ func TestLifeCycleHook(t *testing.T) {
testPod.DeletionGracePeriodSeconds = &gracePeriodLocal
testPod.Spec.TerminationGracePeriodSeconds = &gracePeriodLocal
m.killContainer(testPod, cID, "foo", "testKill", &gracePeriodLocal)
m.killContainer(testPod, cID, "foo", "testKill", "", &gracePeriodLocal)
if strings.Contains(fakeHTTP.url, httpLifeCycle.PreStop.HTTPGet.Host) {
t.Errorf("HTTP Should not execute when gracePeriod is 0")

View File

@ -137,7 +137,7 @@ func (cgc *containerGC) removeOldestN(containers []containerGCInfo, toRemove int
ID: containers[i].id,
}
message := "Container is in unknown state, try killing it before removal"
if err := cgc.manager.killContainer(nil, id, containers[i].name, message, nil); err != nil {
if err := cgc.manager.killContainer(nil, id, containers[i].name, message, reasonUnknown, nil); err != nil {
klog.Errorf("Failed to stop container %q: %v", containers[i].id, err)
continue
}

View File

@ -403,6 +403,16 @@ func (m *kubeGenericRuntimeManager) GetPods(all bool) ([]*kubecontainer.Pod, err
return result, nil
}
// containerKillReason explains what killed a given container
type containerKillReason string
const (
reasonStartupProbe containerKillReason = "StartupProbe"
reasonLivenessProbe containerKillReason = "LivenessProbe"
reasonFailedPostStartHook containerKillReason = "FailedPostStartHook"
reasonUnknown containerKillReason = "Unknown"
)
// containerToKillInfo contains necessary information to kill a container.
type containerToKillInfo struct {
// The spec of the container.
@ -411,6 +421,9 @@ type containerToKillInfo struct {
name string
// The message indicates why the container will be killed.
message string
// The reason is a clearer source of info on why a container will be killed
// TODO: replace message with reason?
reason containerKillReason
}
// podActions keeps information what to do for a pod.
@ -582,6 +595,7 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku
container: next,
message: fmt.Sprintf("Init container is in %q state, try killing it before restart",
initLastStatus.State),
reason: reasonUnknown,
}
}
changes.NextInitContainerToStart = next
@ -623,6 +637,7 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku
container: &pod.Spec.Containers[idx],
message: fmt.Sprintf("Container is in %q state, try killing it before restart",
containerStatus.State),
reason: reasonUnknown,
}
}
}
@ -630,6 +645,7 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku
}
// The container is running, but kill the container if any of the following condition is met.
var message string
var reason containerKillReason
restart := shouldRestartOnFailure(pod)
if _, _, changed := containerChanged(&container, containerStatus); changed {
message = fmt.Sprintf("Container %s definition changed", container.Name)
@ -639,9 +655,11 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku
} else if liveness, found := m.livenessManager.Get(containerStatus.ID); found && liveness == proberesults.Failure {
// If the container failed the liveness probe, we should kill it.
message = fmt.Sprintf("Container %s failed liveness probe", container.Name)
reason = reasonLivenessProbe
} else if startup, found := m.startupManager.Get(containerStatus.ID); found && startup == proberesults.Failure {
// If the container failed the startup probe, we should kill it.
message = fmt.Sprintf("Container %s failed startup probe", container.Name)
reason = reasonStartupProbe
} else {
// Keep the container.
keepCount++
@ -660,6 +678,7 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku
name: containerStatus.Name,
container: &pod.Spec.Containers[idx],
message: message,
reason: reason,
}
klog.V(2).InfoS("Message for Container of pod", "containerName", container.Name, "containerStatusID", containerStatus.ID, "pod", klog.KObj(pod), "containerMessage", message)
}
@ -720,7 +739,7 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine
klog.V(3).InfoS("Killing unwanted container for pod", "containerName", containerInfo.name, "containerID", containerID, "pod", klog.KObj(pod))
killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, containerInfo.name)
result.AddSyncResult(killContainerResult)
if err := m.killContainer(pod, containerID, containerInfo.name, containerInfo.message, nil); err != nil {
if err := m.killContainer(pod, containerID, containerInfo.name, containerInfo.message, containerInfo.reason, nil); err != nil {
killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
klog.ErrorS(err, "killContainer for pod failed", "containerName", containerInfo.name, "containerID", containerID, "pod", klog.KObj(pod))
return

View File

@ -1002,9 +1002,10 @@ func getKillMapWithInitContainers(pod *v1.Pod, status *kubecontainer.PodStatus,
func verifyActions(t *testing.T, expected, actual *podActions, desc string) {
if actual.ContainersToKill != nil {
// Clear the message field since we don't need to verify the message.
// Clear the message and reason fields since we don't need to verify them.
for k, info := range actual.ContainersToKill {
info.message = ""
info.reason = ""
actual.ContainersToKill[k] = info
}
}