Merge pull request #130711 from lauralorenz/crashloopbackoff-featuregate-reducedefaultcrashloopbackoffdelay

KEP-4603: Add the ReduceDefaultCrashLoopBackOffDecay feature gate, the feature itself, and unit and e2e tests
commit 396c3ebe4c
Kubernetes Prow Robot authored 2025-03-13 14:55:47 -07:00, committed by GitHub
8 changed files with 204 additions and 32 deletions

View File

@ -565,6 +565,13 @@ const (
// Allows recursive read-only mounts.
RecursiveReadOnlyMounts featuregate.Feature = "RecursiveReadOnlyMounts"
// owner: @lauralorenz
// kep: https://kep.k8s.io/4603
//
// Enables support for a lower internal cluster-wide backoff maximum for restarting
// containers (aka containers in CrashLoopBackOff)
ReduceDefaultCrashLoopBackOffDecay featuregate.Feature = "ReduceDefaultCrashLoopBackOffDecay"
// owner: @adrianmoisey
// kep: https://kep.k8s.io/4427
//
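As an aside, here is a minimal sketch (an editor's addition, not part of this diff) of how the new gate is consulted and how a test flips it; it uses the same helpers this PR relies on further down (utilfeature.DefaultFeatureGate, featuregatetesting.SetFeatureGateDuringTest), and the test name is hypothetical.

package kubelet_test

import (
	"testing"

	utilfeature "k8s.io/apiserver/pkg/util/feature"
	featuregatetesting "k8s.io/component-base/featuregate/testing"
	"k8s.io/kubernetes/pkg/features"
)

func TestReducedDefaultsGate(t *testing.T) {
	// Flip the alpha gate for this test only; it is restored automatically when the test ends.
	featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.ReduceDefaultCrashLoopBackOffDecay, true)

	// Production code paths check the gate through the global registry.
	if !utilfeature.DefaultFeatureGate.Enabled(features.ReduceDefaultCrashLoopBackOffDecay) {
		t.Fatal("expected ReduceDefaultCrashLoopBackOffDecay to be enabled")
	}
}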

View File

@ -625,6 +625,10 @@ var defaultVersionedKubernetesFeatureGates = map[featuregate.Feature]featuregate
{Version: version.MustParse("1.33"), Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.36
},
ReduceDefaultCrashLoopBackOffDecay: {
{Version: version.MustParse("1.33"), Default: false, PreRelease: featuregate.Alpha},
},
RelaxedDNSSearchValidation: {
{Version: version.MustParse("1.32"), Default: false, PreRelease: featuregate.Alpha},
{Version: version.MustParse("1.33"), Default: true, PreRelease: featuregate.Beta},

View File

@ -154,8 +154,19 @@ const (
// DefaultContainerLogsDir is the location of container logs.
DefaultContainerLogsDir = "/var/log/containers"
// MaxContainerBackOff is the max backoff period for container restarts, exported for the e2e test
MaxContainerBackOff = v1beta1.MaxContainerBackOff
// MaxCrashLoopBackOff is the max backoff period for container restarts, exported for the e2e test
MaxCrashLoopBackOff = v1beta1.MaxContainerBackOff
// reducedMaxCrashLoopBackOff is the default max backoff period for container restarts when the alpha feature
// gate ReduceDefaultCrashLoopBackOffDecay is enabled
reducedMaxCrashLoopBackOff = 60 * time.Second
// Initial period for the exponential backoff for container restarts.
initialCrashLoopBackOff = time.Second * 10
// reducedInitialCrashLoopBackOff is the default initial backoff period for container restarts when the alpha feature
// gate ReduceDefaultCrashLoopBackOffDecay is enabled
reducedInitialCrashLoopBackOff = 1 * time.Second
// MaxImageBackOff is the max backoff period for image pulls, exported for the e2e test
MaxImageBackOff = 300 * time.Second
@ -205,9 +216,6 @@ const (
// error.
backOffPeriod = time.Second * 10
// Initial period for the exponential backoff for container restarts.
containerBackOffPeriod = time.Second * 10
// Initial period for the exponential backoff for image pulls.
imageBackOffPeriod = time.Second * 10
@ -322,6 +330,27 @@ type Dependencies struct {
useLegacyCadvisorStats bool
}
// newCrashLoopBackOff returns the maximum and initial backoff periods the
// kubelet should use for container restarts, depending on which alpha feature
// gates are enabled and on the kubelet configuration.
func newCrashLoopBackOff(kubeCfg *kubeletconfiginternal.KubeletConfiguration) (time.Duration, time.Duration) {
boMax := MaxCrashLoopBackOff
boInitial := initialCrashLoopBackOff
if utilfeature.DefaultFeatureGate.Enabled(features.ReduceDefaultCrashLoopBackOffDecay) {
boMax = reducedMaxCrashLoopBackOff
boInitial = reducedInitialCrashLoopBackOff
}
if utilfeature.DefaultFeatureGate.Enabled(features.KubeletCrashLoopBackOffMax) {
// operator-invoked configuration always has precedence if valid
boMax = kubeCfg.CrashLoopBackOff.MaxContainerRestartPeriod.Duration
if boMax < boInitial {
boInitial = boMax
}
}
return boMax, boInitial
}
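To make the two default curves concrete, here is a small self-contained sketch (an editor's illustration, not code from this change) that computes the doubling delay sequence of a backoff capped at a maximum, using the legacy defaults (10s initial, capped at v1beta1.MaxContainerBackOff, i.e. 5 minutes) and the reduced defaults (1s initial, capped at 60s) from the constants above.

package main

import (
	"fmt"
	"time"
)

// curve returns the first n restart delays for an exponential backoff that
// starts at initial and doubles until it reaches the maxDelay cap.
func curve(initial, maxDelay time.Duration, n int) []time.Duration {
	ds := make([]time.Duration, 0, n)
	d := initial
	for i := 0; i < n; i++ {
		ds = append(ds, d)
		d *= 2
		if d > maxDelay {
			d = maxDelay
		}
	}
	return ds
}

func main() {
	fmt.Println("legacy: ", curve(10*time.Second, 300*time.Second, 8)) // [10s 20s 40s 1m20s 2m40s 5m0s 5m0s 5m0s]
	fmt.Println("reduced:", curve(1*time.Second, 60*time.Second, 8))   // [1s 2s 4s 8s 16s 32s 1m0s 1m0s]
}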
// makePodSourceConfig creates a config.PodConfig from the given
// KubeletConfiguration or returns an error.
func makePodSourceConfig(kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeDeps *Dependencies, nodeName types.NodeName, nodeHasSynced func() bool) (*config.PodConfig, error) {
@ -939,16 +968,10 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
kubeDeps.Recorder,
volumepathhandler.NewBlockVolumePathHandler())
boMax := MaxContainerBackOff
base := containerBackOffPeriod
if utilfeature.DefaultFeatureGate.Enabled(features.KubeletCrashLoopBackOffMax) {
boMax = kubeCfg.CrashLoopBackOff.MaxContainerRestartPeriod.Duration
if boMax < containerBackOffPeriod {
base = boMax
}
}
klet.backOff = flowcontrol.NewBackOff(base, boMax)
klet.backOff.HasExpiredFunc = func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
boMax, base := newCrashLoopBackOff(kubeCfg)
klet.crashLoopBackOff = flowcontrol.NewBackOff(base, boMax)
klet.crashLoopBackOff.HasExpiredFunc = func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
return eventTime.Sub(lastUpdate) > 600*time.Second
}
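A short sketch (an editor's illustration, assuming client-go's flowcontrol semantics in which an entry reported expired is re-initialized on the next Next call) of what the 600s HasExpiredFunc above buys: a container that has stayed up for more than ten minutes starts over at the initial delay instead of its accumulated one. The fake clock is the same helper the kubelet unit tests use; the key is hypothetical.

package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/flowcontrol"
	testingclock "k8s.io/utils/clock/testing"
)

func main() {
	backoff := flowcontrol.NewBackOff(1*time.Second, 60*time.Second)
	fakeClock := testingclock.NewFakeClock(time.Now())
	backoff.Clock = fakeClock
	backoff.HasExpiredFunc = func(eventTime, lastUpdate time.Time, _ time.Duration) bool {
		return eventTime.Sub(lastUpdate) > 600*time.Second
	}

	key := "mypod_myns_poduid_mycontainer" // hypothetical stable key
	backoff.Next(key, fakeClock.Now())     // first crash: delay starts at 1s
	backoff.Next(key, fakeClock.Now())     // second crash right away: doubled to 2s
	fmt.Println(backoff.Get(key))          // 2s

	fakeClock.Step(11 * time.Minute)   // container has then run cleanly for > 600s
	backoff.Next(key, fakeClock.Now()) // entry is treated as expired, so the delay resets
	fmt.Println(backoff.Get(key))      // 1s again
}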
@ -1349,7 +1372,7 @@ type Kubelet struct {
syncLoopMonitor atomic.Value
// Container restart Backoff
backOff *flowcontrol.Backoff
crashLoopBackOff *flowcontrol.Backoff
// Information about the ports which are opened by daemons on Node running this Kubelet server.
daemonEndpoints *v1.NodeDaemonEndpoints
@ -2039,7 +2062,7 @@ func (kl *Kubelet) SyncPod(ctx context.Context, updateType kubetypes.SyncPodType
// Use WithoutCancel instead of a new context.TODO() to propagate trace context
// Call the container runtime's SyncPod callback
sctx := context.WithoutCancel(ctx)
result := kl.containerRuntime.SyncPod(sctx, pod, podStatus, pullSecrets, kl.backOff)
result := kl.containerRuntime.SyncPod(sctx, pod, podStatus, pullSecrets, kl.crashLoopBackOff)
kl.reasonCache.Update(pod.UID, result)
if err := result.Error(); err != nil {
// Do not return error if the only failures were pods in backoff
@ -2928,14 +2951,14 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine
for i, container := range pod.Spec.Containers {
if !apiequality.Semantic.DeepEqual(container.Resources, allocatedPod.Spec.Containers[i].Resources) {
key := kuberuntime.GetStableKey(pod, &container)
kl.backOff.Reset(key)
kl.crashLoopBackOff.Reset(key)
}
}
for i, container := range pod.Spec.InitContainers {
if podutil.IsRestartableInitContainer(&container) {
if !apiequality.Semantic.DeepEqual(container.Resources, allocatedPod.Spec.InitContainers[i].Resources) {
key := kuberuntime.GetStableKey(pod, &container)
kl.backOff.Reset(key)
kl.crashLoopBackOff.Reset(key)
}
}
}

View File

@ -1383,7 +1383,7 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
}
// Cleanup any backoff entries.
kl.backOff.GC()
kl.crashLoopBackOff.GC()
return nil
}

View File

@ -55,6 +55,7 @@ import (
"k8s.io/client-go/kubernetes/fake"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/flowcontrol"
"k8s.io/component-base/featuregate"
featuregatetesting "k8s.io/component-base/featuregate/testing"
"k8s.io/component-base/metrics/testutil"
internalapi "k8s.io/cri-api/pkg/apis"
@ -338,8 +339,8 @@ func newTestKubeletWithImageList(
kubelet.containerGC = containerGC
fakeClock := testingclock.NewFakeClock(time.Now())
kubelet.backOff = flowcontrol.NewBackOff(time.Second, time.Minute)
kubelet.backOff.Clock = fakeClock
kubelet.crashLoopBackOff = flowcontrol.NewBackOff(time.Second, time.Minute)
kubelet.crashLoopBackOff.Clock = fakeClock
kubelet.resyncInterval = 10 * time.Second
kubelet.workQueue = queue.NewBasicWorkQueue(fakeClock)
// Relist period does not affect the tests.
@ -2900,7 +2901,7 @@ func TestHandlePodResourcesResize(t *testing.T) {
now := kubelet.clock.Now()
// Put the container in backoff so we can confirm backoff is reset.
backoffKey := kuberuntime.GetStableKey(originalPod, originalCtr)
kubelet.backOff.Next(backoffKey, now)
kubelet.crashLoopBackOff.Next(backoffKey, now)
updatedPod, err := kubelet.handlePodResourcesResize(newPod, podStatus)
require.NoError(t, err)
@ -2922,7 +2923,7 @@ func TestHandlePodResourcesResize(t *testing.T) {
resizeStatus := kubelet.statusManager.GetPodResizeStatus(newPod.UID)
assert.Equal(t, tt.expectedResize, resizeStatus)
isInBackoff := kubelet.backOff.IsInBackOffSince(backoffKey, now)
isInBackoff := kubelet.crashLoopBackOff.IsInBackOffSince(backoffKey, now)
if tt.expectBackoffReset {
assert.False(t, isInBackoff, "container backoff should be reset")
} else {
@ -3408,7 +3409,7 @@ func TestSyncPodSpans(t *testing.T) {
kubelet.os,
kubelet,
nil,
kubelet.backOff,
kubelet.crashLoopBackOff,
kubeCfg.SerializeImagePulls,
kubeCfg.MaxParallelImagePulls,
float32(kubeCfg.RegistryPullQPS),
@ -3938,3 +3939,86 @@ func TestIsPodResizeInProgress(t *testing.T) {
})
}
}
func TestCrashLoopBackOffConfiguration(t *testing.T) {
testCases := []struct {
name string
featureGates []featuregate.Feature
nodeDecay metav1.Duration
expectedInitial time.Duration
expectedMax time.Duration
}{
{
name: "Prior behavior",
expectedMax: time.Duration(300 * time.Second),
expectedInitial: time.Duration(10 * time.Second),
},
{
name: "New default only",
featureGates: []featuregate.Feature{features.ReduceDefaultCrashLoopBackOffDecay},
expectedMax: time.Duration(60 * time.Second),
expectedInitial: time.Duration(1 * time.Second),
},
{
name: "Faster per node config; only node config configured",
featureGates: []featuregate.Feature{features.KubeletCrashLoopBackOffMax},
nodeDecay: metav1.Duration{Duration: 2 * time.Second},
expectedMax: time.Duration(2 * time.Second),
expectedInitial: time.Duration(2 * time.Second),
},
{
name: "Faster per node config; new default and node config configured",
featureGates: []featuregate.Feature{features.KubeletCrashLoopBackOffMax, features.ReduceDefaultCrashLoopBackOffDecay},
nodeDecay: metav1.Duration{Duration: 2 * time.Second},
expectedMax: time.Duration(2 * time.Second),
expectedInitial: time.Duration(1 * time.Second),
},
{
name: "Slower per node config; new default and node config configured, set A",
featureGates: []featuregate.Feature{features.KubeletCrashLoopBackOffMax, features.ReduceDefaultCrashLoopBackOffDecay},
nodeDecay: metav1.Duration{Duration: 10 * time.Second},
expectedMax: time.Duration(10 * time.Second),
expectedInitial: time.Duration(1 * time.Second),
},
{
name: "Slower per node config; new default and node config configured, set B",
featureGates: []featuregate.Feature{features.KubeletCrashLoopBackOffMax, features.ReduceDefaultCrashLoopBackOffDecay},
nodeDecay: metav1.Duration{Duration: 300 * time.Second},
expectedMax: time.Duration(300 * time.Second),
expectedInitial: time.Duration(1 * time.Second),
},
{
name: "Slower per node config; only node config configured, set A",
featureGates: []featuregate.Feature{features.KubeletCrashLoopBackOffMax},
nodeDecay: metav1.Duration{Duration: 11 * time.Second},
expectedMax: time.Duration(11 * time.Second),
expectedInitial: time.Duration(10 * time.Second),
},
{
name: "Slower per node config; only node config configured, set B",
featureGates: []featuregate.Feature{features.KubeletCrashLoopBackOffMax},
nodeDecay: metav1.Duration{Duration: 300 * time.Second},
expectedMax: time.Duration(300 * time.Second),
expectedInitial: time.Duration(10 * time.Second),
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
kubeCfg := &kubeletconfiginternal.KubeletConfiguration{}
for _, f := range tc.featureGates {
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, f, true)
}
if tc.nodeDecay.Duration > 0 {
kubeCfg.CrashLoopBackOff.MaxContainerRestartPeriod = &tc.nodeDecay
}
resultMax, resultInitial := newCrashLoopBackOff(kubeCfg)
assert.Equalf(t, tc.expectedMax, resultMax, "wrong max calculated, want: %v, got %v", tc.expectedMax, resultMax)
assert.Equalf(t, tc.expectedInitial, resultInitial, "wrong base calculated, want: %v, got %v", tc.expectedInitial, resultInitial)
})
}
}

View File

@ -64,7 +64,7 @@ import (
const (
buildBackOffDuration = time.Minute
syncLoopFrequency = 10 * time.Second
maxBackOffTolerance = time.Duration(1.3 * float64(kubelet.MaxContainerBackOff))
maxBackOffTolerance = time.Duration(1.3 * float64(kubelet.MaxCrashLoopBackOff))
podRetryPeriod = 1 * time.Second
)
@ -739,7 +739,7 @@ var _ = SIGDescribe("Pods", func() {
})
podClient.CreateSync(ctx, pod)
time.Sleep(2 * kubelet.MaxContainerBackOff) // it takes slightly more than 2*x to get to a back-off of x
time.Sleep(2 * kubelet.MaxCrashLoopBackOff) // it takes slightly more than 2*x to get to a back-off of x
// wait for a delay == capped delay of MaxCrashLoopBackOff
ginkgo.By("getting restart delay when capped")
@ -753,13 +753,13 @@ var _ = SIGDescribe("Pods", func() {
framework.Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
}
if delay1 < kubelet.MaxContainerBackOff {
if delay1 < kubelet.MaxCrashLoopBackOff {
continue
}
}
if (delay1 < kubelet.MaxContainerBackOff) || (delay1 > maxBackOffTolerance) {
framework.Failf("expected %s back-off got=%s in delay1", kubelet.MaxContainerBackOff, delay1)
if (delay1 < kubelet.MaxCrashLoopBackOff) || (delay1 > maxBackOffTolerance) {
framework.Failf("expected %s back-off got=%s in delay1", kubelet.MaxCrashLoopBackOff, delay1)
}
ginkgo.By("getting restart delay after a capped delay")
@ -768,8 +768,8 @@ var _ = SIGDescribe("Pods", func() {
framework.Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
}
if delay2 < kubelet.MaxContainerBackOff || delay2 > maxBackOffTolerance { // syncloop cumulative drift
framework.Failf("expected %s back-off got=%s on delay2", kubelet.MaxContainerBackOff, delay2)
if delay2 < kubelet.MaxCrashLoopBackOff || delay2 > maxBackOffTolerance { // syncloop cumulative drift
framework.Failf("expected %s back-off got=%s on delay2", kubelet.MaxCrashLoopBackOff, delay2)
}
})

View File

@ -85,6 +85,54 @@ var _ = SIGDescribe("Container Restart", feature.CriProxy, framework.WithSerial(
doTest(ctx, f, 3, containerName, 13)
})
})
ginkgo.Context("Reduced default container restart backs off as expected", func() {
tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
initialConfig.FeatureGates = map[string]bool{"ReduceDefaultCrashLoopBackOffDecay": true}
})
ginkgo.BeforeEach(func() {
if err := resetCRIProxyInjector(e2eCriProxy); err != nil {
ginkgo.Skip("Skip the test since the CRI Proxy is undefined.")
}
})
ginkgo.AfterEach(func() {
err := resetCRIProxyInjector(e2eCriProxy)
framework.ExpectNoError(err)
})
ginkgo.It("Reduced default restart backs off.", func(ctx context.Context) {
// 0s, 0s, 10s, 30s, 60s, 90s, 120s, 150s, 180s, 210s, 240s, 270s, 300s
doTest(ctx, f, 3, containerName, 13)
})
})
ginkgo.Context("Lower node config container restart takes precedence", func() {
tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
initialConfig.FeatureGates = map[string]bool{"ReduceDefaultCrashLoopBackOffDecay": true}
initialConfig.CrashLoopBackOff.MaxContainerRestartPeriod = &metav1.Duration{Duration: time.Duration(1 * time.Second)}
initialConfig.FeatureGates = map[string]bool{"KubeletCrashLoopBackOffMax": true}
})
ginkgo.BeforeEach(func() {
if err := resetCRIProxyInjector(e2eCriProxy); err != nil {
ginkgo.Skip("Skip the test since the CRI Proxy is undefined.")
}
})
ginkgo.AfterEach(func() {
err := resetCRIProxyInjector(e2eCriProxy)
framework.ExpectNoError(err)
})
ginkgo.It("Reduced default restart backs off.", func(ctx context.Context) {
// 0s, 0s, 1s, 2s, 3s, 4s, 5s, 6s, 7s, and so on
doTest(ctx, f, 3, containerName, 298)
})
})
})
func doTest(ctx context.Context, f *framework.Framework, targetRestarts int, containerName string, maxRestarts int) {

View File

@ -1060,6 +1060,12 @@
lockToDefault: true
preRelease: GA
version: "1.33"
- name: ReduceDefaultCrashLoopBackOffDecay
versionedSpecs:
- default: false
lockToDefault: false
preRelease: Alpha
version: "1.33"
- name: RelaxedDNSSearchValidation
versionedSpecs:
- default: false