Add container Resources to the backoff key

Tim Allclair
2025-05-08 13:07:46 -07:00
parent 8b81a3d883
commit eb4641d651
4 changed files with 91 additions and 57 deletions
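In short: before this commit, handlePodResourcesResize explicitly reset a container's crash-loop backoff whenever its resources changed. Now the backoff key itself incorporates the container's image and resources, so an in-place resize yields a brand-new key with no accumulated backoff. A minimal standalone sketch of the keying idea (backoffKey and the literal strings below are illustrative stand-ins, not the kubelet's exact inputs):

package main

import (
	"fmt"
	"hash/fnv"
	"strconv"
	"strings"
)

// backoffKey mirrors the shape of the new GetBackoffKey: join the pod/container
// identity with every field that should invalidate backoff, then hash with FNV-64a.
func backoffKey(parts ...string) string {
	h := fnv.New64a()
	h.Write([]byte(strings.Join(parts, "/")))
	return strconv.FormatUint(h.Sum64(), 16)
}

func main() {
	before := backoffKey("test_pod", "ns", "uid", "ctr", "foo/image:v1", "cpu:500m")
	after := backoffKey("test_pod", "ns", "uid", "ctr", "foo/image:v1", "cpu:1000m")
	fmt.Println(before == after) // false: resizing resources yields a fresh key
}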

View File

@@ -51,7 +51,6 @@ import (
 	"k8s.io/utils/ptr"

 	v1 "k8s.io/api/core/v1"
-	apiequality "k8s.io/apimachinery/pkg/api/equality"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/fields"
 	"k8s.io/apimachinery/pkg/labels"
@@ -3003,20 +3002,6 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine
 	// added back as needed in the defer block, but this prevents old errors from being preserved.
 	kl.statusManager.ClearPodResizeInProgressCondition(pod.UID)
-	for i, container := range pod.Spec.Containers {
-		if !apiequality.Semantic.DeepEqual(container.Resources, podFromAllocation.Spec.Containers[i].Resources) {
-			key := kuberuntime.GetBackoffKey(pod, &container)
-			kl.crashLoopBackOff.Reset(key)
-		}
-	}
-	for i, container := range pod.Spec.InitContainers {
-		if podutil.IsRestartableInitContainer(&container) {
-			if !apiequality.Semantic.DeepEqual(container.Resources, podFromAllocation.Spec.InitContainers[i].Resources) {
-				key := kuberuntime.GetBackoffKey(pod, &container)
-				kl.crashLoopBackOff.Reset(key)
-			}
-		}
-	}
 	return pod, nil
 }
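
Why the deleted loops are safe to drop: kl.crashLoopBackOff is a client-go flowcontrol.Backoff keyed by the string from kuberuntime.GetBackoffKey. Once resources are part of that key, a resized container consults a key with no history, which is exactly what the explicit Reset used to achieve. A small sketch of that property (the key strings are stand-ins for real GetBackoffKey output; the stale entry is eventually dropped by the Backoff's periodic GC):

package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/flowcontrol"
)

func main() {
	// Same Backoff type the kubelet uses for crash-loop backoff.
	backoff := flowcontrol.NewBackOff(10*time.Second, 5*time.Minute)
	now := time.Now()

	oldKey := "uid/ctr/cpu=500m" // stand-in for the pre-resize backoff key
	backoff.Next(oldKey, now)    // container crashed: backoff begins

	newKey := "uid/ctr/cpu=1000m" // stand-in for the post-resize key

	fmt.Println(backoff.IsInBackOffSince(oldKey, now)) // true: old entry still backing off
	fmt.Println(backoff.IsInBackOffSince(newKey, now)) // false: new key starts clean
}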

View File

@@ -2917,7 +2917,6 @@ func TestHandlePodResourcesResize(t *testing.T) {
 		expectedAllocatedReqs v1.ResourceList
 		expectedAllocatedLims v1.ResourceList
 		expectedResize        []*v1.PodCondition
-		expectBackoffReset    bool
 		annotations           map[string]string
 	}{
 		{
@@ -2925,7 +2924,6 @@ func TestHandlePodResourcesResize(t *testing.T) {
 			originalRequests:      v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M},
 			newRequests:           v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem500M},
 			expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem500M},
-			expectBackoffReset:    true,
 			expectedResize: []*v1.PodCondition{
 				{
@@ -2939,7 +2937,6 @@ func TestHandlePodResourcesResize(t *testing.T) {
 			originalRequests:      v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M},
 			newRequests:           v1.ResourceList{v1.ResourceCPU: cpu1500m, v1.ResourceMemory: mem500M},
 			expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: cpu1500m, v1.ResourceMemory: mem500M},
-			expectBackoffReset:    true,
 			expectedResize: []*v1.PodCondition{
 				{
@@ -2953,7 +2950,6 @@ func TestHandlePodResourcesResize(t *testing.T) {
 			originalRequests:      v1.ResourceList{v1.ResourceCPU: cpu1000m, v1.ResourceMemory: mem1000M},
 			newRequests:           v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem1500M},
 			expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: cpu500m, v1.ResourceMemory: mem1500M},
-			expectBackoffReset:    true,
 			expectedResize: []*v1.PodCondition{
 				{
@@ -3064,7 +3060,6 @@ func TestHandlePodResourcesResize(t *testing.T) {
 			originalRequests:      v1.ResourceList{v1.ResourceCPU: cpu2m},
 			newRequests:           v1.ResourceList{v1.ResourceCPU: cpu1000m},
 			expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: cpu1000m},
-			expectBackoffReset:    true,
 			expectedResize: []*v1.PodCondition{
 				{
@@ -3078,7 +3073,6 @@ func TestHandlePodResourcesResize(t *testing.T) {
 			originalRequests:      v1.ResourceList{v1.ResourceCPU: cpu1000m},
 			newRequests:           v1.ResourceList{v1.ResourceCPU: cpu2m},
 			expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: cpu2m},
-			expectBackoffReset:    true,
 			expectedResize: []*v1.PodCondition{
 				{
@@ -3095,7 +3089,6 @@ func TestHandlePodResourcesResize(t *testing.T) {
 			newLimits:             v1.ResourceList{v1.ResourceCPU: resource.MustParse("20m")},
 			expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: resource.MustParse("10m")},
 			expectedAllocatedLims: v1.ResourceList{v1.ResourceCPU: resource.MustParse("20m")},
-			expectBackoffReset:    true,
 			expectedResize: []*v1.PodCondition{
 				{
@@ -3112,7 +3105,6 @@ func TestHandlePodResourcesResize(t *testing.T) {
 			newLimits:             v1.ResourceList{v1.ResourceCPU: resource.MustParse("10m")},
 			expectedAllocatedReqs: v1.ResourceList{v1.ResourceCPU: resource.MustParse("10m")},
 			expectedAllocatedLims: v1.ResourceList{v1.ResourceCPU: resource.MustParse("10m")},
-			expectBackoffReset:    true,
 			expectedResize: []*v1.PodCondition{
 				{
@@ -3185,11 +3177,6 @@ func TestHandlePodResourcesResize(t *testing.T) {
 				setContainerStatus(podStatus, &c, i+len(originalPod.Spec.InitContainers))
 			}
-			now := kubelet.clock.Now()
-
-			// Put the container in backoff so we can confirm backoff is reset.
-			backoffKey := kuberuntime.GetBackoffKey(originalPod, originalCtr)
-			kubelet.crashLoopBackOff.Next(backoffKey, now)
 			updatedPod, err := kubelet.handlePodResourcesResize(newPod, podStatus)
 			require.NoError(t, err)
@@ -3218,13 +3205,6 @@ func TestHandlePodResourcesResize(t *testing.T) {
 					resizeStatus[i].Message = tt.expectedResize[i].Message
 				}
 			}
 			assert.Equal(t, tt.expectedResize, resizeStatus)
-
-			isInBackoff := kubelet.crashLoopBackOff.IsInBackOffSince(backoffKey, now)
-			if tt.expectBackoffReset {
-				assert.False(t, isInBackoff, "container backoff should be reset")
-			} else {
-				assert.True(t, isInBackoff, "container backoff should not be reset")
-			}
 		})
 	}
 }

View File

@@ -20,6 +20,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"hash/fnv"
 	"path/filepath"
 	"strconv"
 	"strings"
@@ -175,12 +176,23 @@ func isInitContainerFailed(status *kubecontainer.Status) bool {
 	return false
 }

-// GetBackoffKey generates a key (string) to uniquely identify a
-// (pod, container) tuple. The key should include the content of the
-// container, so that any change to the container generates a new key.
+// GetBackoffKey generates a key (string) to uniquely identify a (pod, container) tuple for tracking
+// container backoff. The key should include any content of the container that is tied to the
+// backoff, so that any change generates a new key.
 func GetBackoffKey(pod *v1.Pod, container *v1.Container) string {
-	hash := strconv.FormatUint(kubecontainer.HashContainer(container), 16)
-	return fmt.Sprintf("%s_%s_%s_%s_%s", pod.Name, pod.Namespace, string(pod.UID), container.Name, hash)
+	// Include stable identifiers (name, namespace, uid) as well as any
+	// fields that should reset the backoff when changed.
+	key := []string{
+		pod.Name,
+		pod.Namespace,
+		string(pod.UID),
+		container.Name,
+		container.Image,
+		container.Resources.String(),
+	}
+	hash := fnv.New64a()
+	hash.Write([]byte(strings.Join(key, "/")))
+	return strconv.FormatUint(hash.Sum64(), 16)
 }

 // logPathDelimiter is the delimiter used in the log path.
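
A subtlety in hashing container.Resources.String(): the key is only stable if equal ResourceRequirements always render to the same string. The String() methods on these API types are generated (gogo-protobuf style) and sort map keys before printing, so equal values should hash identically regardless of map insertion order. A quick standalone check of that assumption:

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	a := v1.ResourceRequirements{Requests: v1.ResourceList{
		v1.ResourceCPU:    resource.MustParse("100m"),
		v1.ResourceMemory: resource.MustParse("100Mi"),
	}}
	b := v1.ResourceRequirements{Requests: v1.ResourceList{
		v1.ResourceMemory: resource.MustParse("100Mi"),
		v1.ResourceCPU:    resource.MustParse("100m"),
	}}
	// If the generated String() sorts map keys, insertion order doesn't matter.
	fmt.Println(a.String() == b.String()) // expected: true
}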

View File

@@ -23,6 +23,7 @@ import (
 	"github.com/stretchr/testify/assert"

 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	utilfeature "k8s.io/apiserver/pkg/util/feature"
@@ -107,27 +108,83 @@ func TestIsInitContainerFailed(t *testing.T) {
 	}
 }

-func TestStableKey(t *testing.T) {
-	container := &v1.Container{
-		Name:  "test_container",
-		Image: "foo/image:v1",
-	}
-	pod := &v1.Pod{
-		ObjectMeta: metav1.ObjectMeta{
-			Name:      "test_pod",
-			Namespace: "test_pod_namespace",
-			UID:       "test_pod_uid",
-		},
-		Spec: v1.PodSpec{
-			Containers: []v1.Container{*container},
-		},
-	}
-	oldKey := GetBackoffKey(pod, container)
-	// Updating the container image should change the key.
-	container.Image = "foo/image:v2"
-	newKey := GetBackoffKey(pod, container)
-	assert.NotEqual(t, oldKey, newKey)
+func TestGetBackoffKey(t *testing.T) {
+	testSpecs := map[string]v1.PodSpec{
+		"empty resources": {
+			Containers: []v1.Container{{
+				Name:  "test_container",
+				Image: "foo/image:v1",
+			}},
+		},
+		"with resources": {
+			Containers: []v1.Container{{
+				Name:  "test_container",
+				Image: "foo/image:v1",
+				Resources: v1.ResourceRequirements{
+					Requests: v1.ResourceList{
+						v1.ResourceCPU:    resource.MustParse("100m"),
+						v1.ResourceMemory: resource.MustParse("100Mi"),
+					},
+					Limits: v1.ResourceList{
+						v1.ResourceCPU:    resource.MustParse("200m"),
+						v1.ResourceMemory: resource.MustParse("200Mi"),
+					},
+				},
+			}},
+		},
+	}
+	for name, spec := range testSpecs {
+		t.Run(name, func(t *testing.T) {
+			pod := &v1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test_pod",
+					Namespace: "test_pod_namespace",
+					UID:       "test_pod_uid",
+				},
+				Spec: spec,
+			}
+			secondContainer := v1.Container{
+				Name:  "second_container",
+				Image: "registry.k8s.io/pause",
+			}
+			pod.Spec.Containers = append(pod.Spec.Containers, secondContainer)
+
+			originalKey := GetBackoffKey(pod, &pod.Spec.Containers[0])
+
+			podCopy := pod.DeepCopy()
+			podCopy.Spec.ActiveDeadlineSeconds = int64Ptr(1)
+			assert.Equal(t, originalKey, GetBackoffKey(podCopy, &podCopy.Spec.Containers[0]),
+				"Unrelated change should not change the key")
+
+			podCopy = pod.DeepCopy()
+			assert.NotEqual(t, originalKey, GetBackoffKey(podCopy, &podCopy.Spec.Containers[1]),
+				"Different container change should change the key")
+
+			podCopy = pod.DeepCopy()
+			podCopy.Name = "other-pod"
+			assert.NotEqual(t, originalKey, GetBackoffKey(podCopy, &podCopy.Spec.Containers[0]),
+				"Different pod name should change the key")
+
+			podCopy = pod.DeepCopy()
+			podCopy.Namespace = "other-namespace"
+			assert.NotEqual(t, originalKey, GetBackoffKey(podCopy, &podCopy.Spec.Containers[0]),
+				"Different pod namespace should change the key")
+
+			podCopy = pod.DeepCopy()
+			podCopy.Spec.Containers[0].Image = "foo/image:v2"
+			assert.NotEqual(t, originalKey, GetBackoffKey(podCopy, &podCopy.Spec.Containers[0]),
+				"Updating the container image should change the key")
+
+			podCopy = pod.DeepCopy()
+			c := &podCopy.Spec.Containers[0]
+			if c.Resources.Requests == nil {
+				c.Resources.Requests = v1.ResourceList{}
+			}
+			c.Resources.Requests[v1.ResourceCPU] = resource.MustParse("200m")
+			assert.NotEqual(t, originalKey, GetBackoffKey(podCopy, &podCopy.Spec.Containers[0]),
+				"Updating the resources should change the key")
+		})
+	}
 }

 func TestToKubeContainer(t *testing.T) {