From eb4641d65129af621f8aafae75844562da718c5c Mon Sep 17 00:00:00 2001 From: Tim Allclair Date: Thu, 8 May 2025 13:07:46 -0700 Subject: [PATCH] Add container Resources to the backoff key --- pkg/kubelet/kubelet.go | 15 ---- pkg/kubelet/kubelet_test.go | 20 ------ pkg/kubelet/kuberuntime/helpers.go | 22 ++++-- pkg/kubelet/kuberuntime/helpers_test.go | 91 ++++++++++++++++++++----- 4 files changed, 91 insertions(+), 57 deletions(-) diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index aa10c740dce..161e04fbdd9 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -51,7 +51,6 @@ import ( "k8s.io/utils/ptr" v1 "k8s.io/api/core/v1" - apiequality "k8s.io/apimachinery/pkg/api/equality" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/labels" @@ -3003,20 +3002,6 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine // added back as needed in the defer block, but this prevents old errors from being preserved. kl.statusManager.ClearPodResizeInProgressCondition(pod.UID) - for i, container := range pod.Spec.Containers { - if !apiequality.Semantic.DeepEqual(container.Resources, podFromAllocation.Spec.Containers[i].Resources) { - key := kuberuntime.GetBackoffKey(pod, &container) - kl.crashLoopBackOff.Reset(key) - } - } - for i, container := range pod.Spec.InitContainers { - if podutil.IsRestartableInitContainer(&container) { - if !apiequality.Semantic.DeepEqual(container.Resources, podFromAllocation.Spec.InitContainers[i].Resources) { - key := kuberuntime.GetBackoffKey(pod, &container) - kl.crashLoopBackOff.Reset(key) - } - } - } return pod, nil } diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index bc3e12cb75f..89d8b1b1543 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -2917,7 +2917,6 @@ func TestHandlePodResourcesResize(t *testing.T) { expectedAllocatedReqs v1.ResourceList expectedAllocatedLims v1.ResourceList expectedResize []*v1.PodCondition - expectBackoffReset bool annotations map[string]string }{ { @@ -2925,7 +2924,6 @@ func TestHandlePodResourcesResize(t *testing.T) { originalRequests: v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M}, newRequests: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem500M}, expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem500M}, - expectBackoffReset: true, expectedResize: []*v1.PodCondition{ { @@ -2939,7 +2937,6 @@ func TestHandlePodResourcesResize(t *testing.T) { originalRequests: v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M}, newRequests: v1.ResourceList{v1.ResourceCPU: cpu1500m, v1.ResourceMemory: mem500M}, expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: cpu1500m, v1.ResourceMemory: mem500M}, - expectBackoffReset: true, expectedResize: []*v1.PodCondition{ { @@ -2953,7 +2950,6 @@ func TestHandlePodResourcesResize(t *testing.T) { originalRequests: v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M}, newRequests: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem1500M}, expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem1500M}, - expectBackoffReset: true, expectedResize: []*v1.PodCondition{ { @@ -3064,7 +3060,6 @@ func TestHandlePodResourcesResize(t *testing.T) { originalRequests: v1.ResourceList{v1.ResourceCPU: cpu2m}, newRequests: v1.ResourceList{v1.ResourceCPU: cpu1000m}, expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: cpu1000m}, - expectBackoffReset: true, expectedResize: []*v1.PodCondition{ { @@ -3078,7 +3073,6 @@ func TestHandlePodResourcesResize(t *testing.T) { originalRequests: v1.ResourceList{v1.ResourceCPU: cpu1000m}, newRequests: v1.ResourceList{v1.ResourceCPU: cpu2m}, expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: cpu2m}, - expectBackoffReset: true, expectedResize: []*v1.PodCondition{ { @@ -3095,7 +3089,6 @@ func TestHandlePodResourcesResize(t *testing.T) { newLimits: v1.ResourceList{v1.ResourceCPU: resource.MustParse("20m")}, expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: resource.MustParse("10m")}, expectedAllocatedLims: v1.ResourceList{v1.ResourceCPU: resource.MustParse("20m")}, - expectBackoffReset: true, expectedResize: []*v1.PodCondition{ { @@ -3112,7 +3105,6 @@ func TestHandlePodResourcesResize(t *testing.T) { newLimits: v1.ResourceList{v1.ResourceCPU: resource.MustParse("10m")}, expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: resource.MustParse("10m")}, expectedAllocatedLims: v1.ResourceList{v1.ResourceCPU: resource.MustParse("10m")}, - expectBackoffReset: true, expectedResize: []*v1.PodCondition{ { @@ -3185,11 +3177,6 @@ func TestHandlePodResourcesResize(t *testing.T) { setContainerStatus(podStatus, &c, i+len(originalPod.Spec.InitContainers)) } - now := kubelet.clock.Now() - // Put the container in backoff so we can confirm backoff is reset. - backoffKey := kuberuntime.GetBackoffKey(originalPod, originalCtr) - kubelet.crashLoopBackOff.Next(backoffKey, now) - updatedPod, err := kubelet.handlePodResourcesResize(newPod, podStatus) require.NoError(t, err) @@ -3218,13 +3205,6 @@ func TestHandlePodResourcesResize(t *testing.T) { resizeStatus[i].Message = tt.expectedResize[i].Message } assert.Equal(t, tt.expectedResize, resizeStatus) - - isInBackoff := kubelet.crashLoopBackOff.IsInBackOffSince(backoffKey, now) - if tt.expectBackoffReset { - assert.False(t, isInBackoff, "container backoff should be reset") - } else { - assert.True(t, isInBackoff, "container backoff should not be reset") - } }) } } diff --git a/pkg/kubelet/kuberuntime/helpers.go b/pkg/kubelet/kuberuntime/helpers.go index ad1e7bb8d18..2d82ac0e8e6 100644 --- a/pkg/kubelet/kuberuntime/helpers.go +++ b/pkg/kubelet/kuberuntime/helpers.go @@ -20,6 +20,7 @@ import ( "context" "errors" "fmt" + "hash/fnv" "path/filepath" "strconv" "strings" @@ -175,12 +176,23 @@ func isInitContainerFailed(status *kubecontainer.Status) bool { return false } -// GetBackoffKey generates a key (string) to uniquely identify a -// (pod, container) tuple. The key should include the content of the -// container, so that any change to the container generates a new key. +// GetBackoffKey generates a key (string) to uniquely identify a (pod, container) tuple for tracking +// container backoff. The key should include any content of the container that is tied to the +// backoff, so that any change generates a new key. func GetBackoffKey(pod *v1.Pod, container *v1.Container) string { - hash := strconv.FormatUint(kubecontainer.HashContainer(container), 16) - return fmt.Sprintf("%s_%s_%s_%s_%s", pod.Name, pod.Namespace, string(pod.UID), container.Name, hash) + // Include stable identifiers (name, namespace, uid) as well as any + // fields that should reset the backoff when changed. + key := []string{ + pod.Name, + pod.Namespace, + string(pod.UID), + container.Name, + container.Image, + container.Resources.String(), + } + hash := fnv.New64a() + hash.Write([]byte(strings.Join(key, "/"))) + return strconv.FormatUint(hash.Sum64(), 16) } // logPathDelimiter is the delimiter used in the log path. diff --git a/pkg/kubelet/kuberuntime/helpers_test.go b/pkg/kubelet/kuberuntime/helpers_test.go index cd53e4fd6da..569e3ab5ac4 100644 --- a/pkg/kubelet/kuberuntime/helpers_test.go +++ b/pkg/kubelet/kuberuntime/helpers_test.go @@ -23,6 +23,7 @@ import ( "github.com/stretchr/testify/assert" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" utilfeature "k8s.io/apiserver/pkg/util/feature" @@ -107,27 +108,83 @@ func TestIsInitContainerFailed(t *testing.T) { } } -func TestStableKey(t *testing.T) { - container := &v1.Container{ - Name: "test_container", - Image: "foo/image:v1", - } - pod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test_pod", - Namespace: "test_pod_namespace", - UID: "test_pod_uid", +func TestGetBackoffKey(t *testing.T) { + testSpecs := map[string]v1.PodSpec{ + "empty resources": { + Containers: []v1.Container{{ + Name: "test_container", + Image: "foo/image:v1", + }}, }, - Spec: v1.PodSpec{ - Containers: []v1.Container{*container}, + "with resources": { + Containers: []v1.Container{{ + Name: "test_container", + Image: "foo/image:v1", + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("100m"), + v1.ResourceMemory: resource.MustParse("100Mi"), + }, + Limits: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("200m"), + v1.ResourceMemory: resource.MustParse("200Mi"), + }, + }, + }}, }, } - oldKey := GetBackoffKey(pod, container) - // Updating the container image should change the key. - container.Image = "foo/image:v2" - newKey := GetBackoffKey(pod, container) - assert.NotEqual(t, oldKey, newKey) + for name, spec := range testSpecs { + t.Run(name, func(t *testing.T) { + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test_pod", + Namespace: "test_pod_namespace", + UID: "test_pod_uid", + }, + Spec: spec, + } + secondContainer := v1.Container{ + Name: "second_container", + Image: "registry.k8s.io/pause", + } + pod.Spec.Containers = append(pod.Spec.Containers, secondContainer) + originalKey := GetBackoffKey(pod, &pod.Spec.Containers[0]) + + podCopy := pod.DeepCopy() + podCopy.Spec.ActiveDeadlineSeconds = int64Ptr(1) + assert.Equal(t, originalKey, GetBackoffKey(podCopy, &podCopy.Spec.Containers[0]), + "Unrelated change should not change the key") + + podCopy = pod.DeepCopy() + assert.NotEqual(t, originalKey, GetBackoffKey(podCopy, &podCopy.Spec.Containers[1]), + "Different container change should change the key") + + podCopy = pod.DeepCopy() + podCopy.Name = "other-pod" + assert.NotEqual(t, originalKey, GetBackoffKey(podCopy, &podCopy.Spec.Containers[0]), + "Different pod name should change the key") + + podCopy = pod.DeepCopy() + podCopy.Namespace = "other-namespace" + assert.NotEqual(t, originalKey, GetBackoffKey(podCopy, &podCopy.Spec.Containers[0]), + "Different pod namespace should change the key") + + podCopy = pod.DeepCopy() + podCopy.Spec.Containers[0].Image = "foo/image:v2" + assert.NotEqual(t, originalKey, GetBackoffKey(podCopy, &podCopy.Spec.Containers[0]), + "Updating the container image should change the key") + + podCopy = pod.DeepCopy() + c := &podCopy.Spec.Containers[0] + if c.Resources.Requests == nil { + c.Resources.Requests = v1.ResourceList{} + } + c.Resources.Requests[v1.ResourceCPU] = resource.MustParse("200m") + assert.NotEqual(t, originalKey, GetBackoffKey(podCopy, &podCopy.Spec.Containers[0]), + "Updating the resources should change the key") + }) + } } func TestToKubeContainer(t *testing.T) {