kubelet: Never restart containers in deleting pods

When constructing the API status of a pod, if the pod is marked for
deletion, no containers should be started. Previously, if a container
inside a terminating pod failed to start due to a container runtime
error (which populates the reasonCache), the entry would remain in
the cache (it is only updated by syncPod for non-terminating pods)
and the delete action on the pod would be delayed until the entry
was evicted by activity from other pods.

This dramatically reduces the time the Kubelet waits before deleting
terminating pods that encountered a container runtime error.
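
For context, the lingering entry behaves like a fixed-capacity LRU. Below is a minimal, self-contained sketch of that failure mode; it is a toy stand-in, not the kubelet's actual reasonCache implementation, and the key format and capacity are illustrative only:

```go
package main

import (
	"container/list"
	"fmt"
)

// reasonLRU is a toy fixed-capacity cache standing in for the kubelet's
// reasonCache: an entry is only dropped when newer entries evict it.
type reasonLRU struct {
	capacity int
	order    *list.List               // front = most recently added
	items    map[string]*list.Element // key -> element storing the key
}

func newReasonLRU(capacity int) *reasonLRU {
	return &reasonLRU{
		capacity: capacity,
		order:    list.New(),
		items:    make(map[string]*list.Element),
	}
}

func (c *reasonLRU) add(key string) {
	if el, ok := c.items[key]; ok {
		c.order.MoveToFront(el)
		return
	}
	c.items[key] = c.order.PushFront(key)
	if c.order.Len() > c.capacity {
		oldest := c.order.Back()
		c.order.Remove(oldest)
		delete(c.items, oldest.Value.(string))
	}
}

func (c *reasonLRU) has(key string) bool {
	_, ok := c.items[key]
	return ok
}

func main() {
	cache := newReasonLRU(2)
	cache.add("terminating-pod/app") // start failure recorded for a terminating pod

	// syncPod skips terminating pods, so nothing clears the entry; it only
	// disappears once churn from other pods evicts it.
	cache.add("pod-b/app")
	cache.add("pod-c/app")
	fmt.Println(cache.has("terminating-pod/app")) // false: evicted only now
}
```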
Clayton Coleman 2020-02-24 19:34:49 -05:00
parent 2364c10e2e
commit 8722c834e5
2 changed files with 28 additions and 0 deletions

pkg/kubelet/container/helpers.go

@@ -61,6 +61,10 @@ type RuntimeHelper interface {
 // ShouldContainerBeRestarted checks whether a container needs to be restarted.
 // TODO(yifan): Think about how to refactor this.
 func ShouldContainerBeRestarted(container *v1.Container, pod *v1.Pod, podStatus *PodStatus) bool {
+	// Once a pod has been marked deleted, it should not be restarted
+	if pod.DeletionTimestamp != nil {
+		return false
+	}
 	// Get latest container status.
 	status := podStatus.FindContainerStatusByName(container.Name)
 	// If the container was never started before, we should start it.
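
The guard runs before the restart-policy checks later in this function, so the deletion check wins unconditionally. A hypothetical caller-side sketch (assuming the conventional kubecontainer import alias and an already-populated *PodStatus; the helper name is illustrative):

```go
import (
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)

// neverRestartWhenDeleting is a hypothetical helper showing the new
// short-circuit: once a pod carries a DeletionTimestamp, the answer is
// false for every container, even under RestartPolicyAlways.
func neverRestartWhenDeleting(pod *v1.Pod, status *kubecontainer.PodStatus) bool {
	pod.DeletionTimestamp = &metav1.Time{Time: time.Now()}
	pod.Spec.RestartPolicy = v1.RestartPolicyAlways
	// Always false after this commit, regardless of restart policy.
	return kubecontainer.ShouldContainerBeRestarted(&pod.Spec.Containers[0], pod, status)
}
```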

pkg/kubelet/container/helpers_test.go

@@ -19,6 +19,7 @@ package container
 import (
 	"reflect"
 	"testing"
+	"time"
 
 	"github.com/google/go-cmp/cmp"
 	"github.com/stretchr/testify/assert"
@@ -449,6 +450,8 @@ func TestShouldContainerBeRestarted(t *testing.T) {
 		v1.RestartPolicyOnFailure,
 		v1.RestartPolicyAlways,
 	}
+
+	// test policies
 	expected := map[string][]bool{
 		"no-history": {true, true, true},
 		"alive":      {false, false, false},
@@ -467,6 +470,27 @@ func TestShouldContainerBeRestarted(t *testing.T) {
 			}
 		}
 	}
+
+	// test deleted pod
+	pod.DeletionTimestamp = &metav1.Time{Time: time.Now()}
+	expected = map[string][]bool{
+		"no-history": {false, false, false},
+		"alive":      {false, false, false},
+		"succeed":    {false, false, false},
+		"failed":     {false, false, false},
+		"unknown":    {false, false, false},
+	}
+	for _, c := range pod.Spec.Containers {
+		for i, policy := range policies {
+			pod.Spec.RestartPolicy = policy
+			e := expected[c.Name][i]
+			r := ShouldContainerBeRestarted(&c, pod, podStatus)
+			if r != e {
+				t.Errorf("Restart for container %q with restart policy %q expected %t, got %t",
+					c.Name, policy, e, r)
+			}
+		}
+	}
 }
 
 func TestHasPrivilegedContainer(t *testing.T) {
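
These table-driven cases run with the package's standard test invocation, e.g. go test ./pkg/kubelet/container/ -run TestShouldContainerBeRestarted.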