diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go
index 46c9a756017..5b9e4bdc79d 100644
--- a/pkg/features/kube_features.go
+++ b/pkg/features/kube_features.go
@@ -565,6 +565,13 @@ const (
     // Allows recursive read-only mounts.
     RecursiveReadOnlyMounts featuregate.Feature = "RecursiveReadOnlyMounts"

+    // owner: @lauralorenz
+    // kep: https://kep.k8s.io/4603
+    //
+    // Enables support for a lower internal cluster-wide backoff maximum for restarting
+    // containers (aka containers in CrashLoopBackOff)
+    ReduceDefaultCrashLoopBackOffDecay featuregate.Feature = "ReduceDefaultCrashLoopBackOffDecay"
+
     // owner: @adrianmoisey
     // kep: https://kep.k8s.io/4427
     //
diff --git a/pkg/features/versioned_kube_features.go b/pkg/features/versioned_kube_features.go
index 36ce02c8480..49be4d7f7a0 100644
--- a/pkg/features/versioned_kube_features.go
+++ b/pkg/features/versioned_kube_features.go
@@ -622,6 +622,10 @@ var defaultVersionedKubernetesFeatureGates = map[featuregate.Feature]featuregate
         {Version: version.MustParse("1.33"), Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.36
     },

+    ReduceDefaultCrashLoopBackOffDecay: {
+        {Version: version.MustParse("1.33"), Default: false, PreRelease: featuregate.Alpha},
+    },
+
     RelaxedDNSSearchValidation: {
         {Version: version.MustParse("1.32"), Default: false, PreRelease: featuregate.Alpha},
         {Version: version.MustParse("1.33"), Default: true, PreRelease: featuregate.Beta},
diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go
index 4a182c9a8ec..2c4a4b5c018 100644
--- a/pkg/kubelet/kubelet.go
+++ b/pkg/kubelet/kubelet.go
@@ -153,8 +153,19 @@ const (
     // DefaultContainerLogsDir is the location of container logs.
     DefaultContainerLogsDir = "/var/log/containers"

-    // MaxContainerBackOff is the max backoff period for container restarts, exported for the e2e test
-    MaxContainerBackOff = v1beta1.MaxContainerBackOff
+    // MaxCrashLoopBackOff is the max backoff period for container restarts, exported for the e2e test
+    MaxCrashLoopBackOff = v1beta1.MaxContainerBackOff
+
+    // reducedMaxCrashLoopBackOff is the default max backoff period for container restarts when the alpha feature
+    // gate ReduceDefaultCrashLoopBackOffDecay is enabled
+    reducedMaxCrashLoopBackOff = 60 * time.Second
+
+    // Initial period for the exponential backoff for container restarts.
+    initialCrashLoopBackOff = time.Second * 10
+
+    // reducedInitialCrashLoopBackOff is the default initial backoff period for container restarts when the alpha feature
+    // gate ReduceDefaultCrashLoopBackOffDecay is enabled
+    reducedInitialCrashLoopBackOff = 1 * time.Second

     // MaxImageBackOff is the max backoff period for image pulls, exported for the e2e test
     MaxImageBackOff = 300 * time.Second
@@ -204,9 +215,6 @@ const (
     // error.
     backOffPeriod = time.Second * 10

-    // Initial period for the exponential backoff for container restarts.
-    containerBackOffPeriod = time.Second * 10
-
     // Initial period for the exponential backoff for image pulls.
     imageBackOffPeriod = time.Second * 10
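For orientation (not part of the patch): the constants above define two default decay curves. Below is a minimal sketch of how they compare, assuming the restart delay roughly doubles on each crash and is clamped at the configured maximum, which is how client-go's flowcontrol.Backoff behaves.

```go
package main

import (
	"fmt"
	"time"
)

// delays lists successive crash-loop delays, assuming the backoff doubles on
// every restart and is clamped at the configured maximum.
func delays(initial, max time.Duration, n int) []time.Duration {
	out := make([]time.Duration, 0, n)
	d := initial
	for i := 0; i < n; i++ {
		out = append(out, d)
		d *= 2
		if d > max {
			d = max
		}
	}
	return out
}

func main() {
	// Legacy defaults: 10s initial, 300s cap (MaxCrashLoopBackOff).
	fmt.Println(delays(10*time.Second, 300*time.Second, 8))
	// ReduceDefaultCrashLoopBackOffDecay defaults: 1s initial, 60s cap.
	fmt.Println(delays(1*time.Second, 60*time.Second, 8))
}
```

With the legacy defaults the delay hits the 5m cap on the sixth restart; with the reduced defaults it hits the 1m cap on the seventh.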
@@ -320,6 +328,27 @@ type Dependencies struct {
     useLegacyCadvisorStats bool
 }

+// newCrashLoopBackOff returns the max and initial backoff periods the kubelet
+// should use for container restarts, depending on which alpha feature gates
+// and kubelet configuration are set.
+func newCrashLoopBackOff(kubeCfg *kubeletconfiginternal.KubeletConfiguration) (time.Duration, time.Duration) {
+    boMax := MaxCrashLoopBackOff
+    boInitial := initialCrashLoopBackOff
+    if utilfeature.DefaultFeatureGate.Enabled(features.ReduceDefaultCrashLoopBackOffDecay) {
+        boMax = reducedMaxCrashLoopBackOff
+        boInitial = reducedInitialCrashLoopBackOff
+    }
+
+    if utilfeature.DefaultFeatureGate.Enabled(features.KubeletCrashLoopBackOffMax) {
+        // operator-invoked configuration always has precedence if valid
+        boMax = kubeCfg.CrashLoopBackOff.MaxContainerRestartPeriod.Duration
+        if boMax < boInitial {
+            boInitial = boMax
+        }
+    }
+    return boMax, boInitial
+}
+
 // makePodSourceConfig creates a config.PodConfig from the given
 // KubeletConfiguration or returns an error.
 func makePodSourceConfig(kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeDeps *Dependencies, nodeName types.NodeName, nodeHasSynced func() bool) (*config.PodConfig, error) {
@@ -936,16 +965,10 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
         kubeDeps.Recorder,
         volumepathhandler.NewBlockVolumePathHandler())

-    boMax := MaxContainerBackOff
-    base := containerBackOffPeriod
-    if utilfeature.DefaultFeatureGate.Enabled(features.KubeletCrashLoopBackOffMax) {
-        boMax = kubeCfg.CrashLoopBackOff.MaxContainerRestartPeriod.Duration
-        if boMax < containerBackOffPeriod {
-            base = boMax
-        }
-    }
-    klet.backOff = flowcontrol.NewBackOff(base, boMax)
-    klet.backOff.HasExpiredFunc = func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
+    boMax, base := newCrashLoopBackOff(kubeCfg)
+
+    klet.crashLoopBackOff = flowcontrol.NewBackOff(base, boMax)
+    klet.crashLoopBackOff.HasExpiredFunc = func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
         return eventTime.Sub(lastUpdate) > 600*time.Second
     }

@@ -1346,7 +1369,7 @@ type Kubelet struct {
     syncLoopMonitor atomic.Value

     // Container restart Backoff
-    backOff *flowcontrol.Backoff
+    crashLoopBackOff *flowcontrol.Backoff

     // Information about the ports which are opened by daemons on Node running this Kubelet server.
     daemonEndpoints *v1.NodeDaemonEndpoints
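A self-contained sketch (values hard-coded, key format purely illustrative) of the flowcontrol.Backoff wiring that NewMainKubelet now sets up, including the HasExpiredFunc that lets a container which stayed up for more than 10 minutes start again from the initial delay:

```go
package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/flowcontrol"
)

func main() {
	// Mirror of the construction above, hard-coded to the reduced defaults.
	backoff := flowcontrol.NewBackOff(1*time.Second, 60*time.Second)
	// Entries not bumped for 10 minutes are treated as expired, so a container
	// that stayed up that long starts from the initial delay again.
	backoff.HasExpiredFunc = func(eventTime, lastUpdate time.Time, maxDuration time.Duration) bool {
		return eventTime.Sub(lastUpdate) > 600*time.Second
	}

	key := "pod-uid_container-name" // stand-in for a kuberuntime.GetStableKey result
	for i := 0; i < 4; i++ {
		backoff.Next(key, backoff.Clock.Now())
		fmt.Println(backoff.Get(key)) // 1s, 2s, 4s, 8s
	}
}
```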
@@ -2033,7 +2056,7 @@ func (kl *Kubelet) SyncPod(ctx context.Context, updateType kubetypes.SyncPodType
     // Use WithoutCancel instead of a new context.TODO() to propagate trace context
     // Call the container runtime's SyncPod callback
     sctx := context.WithoutCancel(ctx)
-    result := kl.containerRuntime.SyncPod(sctx, pod, podStatus, pullSecrets, kl.backOff)
+    result := kl.containerRuntime.SyncPod(sctx, pod, podStatus, pullSecrets, kl.crashLoopBackOff)
     kl.reasonCache.Update(pod.UID, result)
     if err := result.Error(); err != nil {
         // Do not return error if the only failures were pods in backoff
@@ -2922,14 +2945,14 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine
     for i, container := range pod.Spec.Containers {
         if !apiequality.Semantic.DeepEqual(container.Resources, allocatedPod.Spec.Containers[i].Resources) {
             key := kuberuntime.GetStableKey(pod, &container)
-            kl.backOff.Reset(key)
+            kl.crashLoopBackOff.Reset(key)
         }
     }
     for i, container := range pod.Spec.InitContainers {
         if podutil.IsRestartableInitContainer(&container) {
             if !apiequality.Semantic.DeepEqual(container.Resources, allocatedPod.Spec.InitContainers[i].Resources) {
                 key := kuberuntime.GetStableKey(pod, &container)
-                kl.backOff.Reset(key)
+                kl.crashLoopBackOff.Reset(key)
             }
         }
     }
diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go
index 429507c9159..08bbccaeb66 100644
--- a/pkg/kubelet/kubelet_pods.go
+++ b/pkg/kubelet/kubelet_pods.go
@@ -1379,7 +1379,7 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
     }

     // Cleanup any backoff entries.
-    kl.backOff.GC()
+    kl.crashLoopBackOff.GC()
     return nil
 }

diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go
index d8c59dd3a6d..7cd9268869e 100644
--- a/pkg/kubelet/kubelet_test.go
+++ b/pkg/kubelet/kubelet_test.go
@@ -55,6 +55,7 @@ import (
     "k8s.io/client-go/kubernetes/fake"
     "k8s.io/client-go/tools/record"
     "k8s.io/client-go/util/flowcontrol"
+    "k8s.io/component-base/featuregate"
     featuregatetesting "k8s.io/component-base/featuregate/testing"
     "k8s.io/component-base/metrics/testutil"
     internalapi "k8s.io/cri-api/pkg/apis"
@@ -337,8 +338,8 @@ func newTestKubeletWithImageList(
     kubelet.containerGC = containerGC

     fakeClock := testingclock.NewFakeClock(time.Now())
-    kubelet.backOff = flowcontrol.NewBackOff(time.Second, time.Minute)
-    kubelet.backOff.Clock = fakeClock
+    kubelet.crashLoopBackOff = flowcontrol.NewBackOff(time.Second, time.Minute)
+    kubelet.crashLoopBackOff.Clock = fakeClock
     kubelet.resyncInterval = 10 * time.Second
     kubelet.workQueue = queue.NewBasicWorkQueue(fakeClock)
     // Relist period does not affect the tests.
@@ -2895,7 +2896,7 @@ func TestHandlePodResourcesResize(t *testing.T) {
             now := kubelet.clock.Now()
             // Put the container in backoff so we can confirm backoff is reset.
             backoffKey := kuberuntime.GetStableKey(originalPod, originalCtr)
-            kubelet.backOff.Next(backoffKey, now)
+            kubelet.crashLoopBackOff.Next(backoffKey, now)

             updatedPod, err := kubelet.handlePodResourcesResize(newPod, podStatus)
             require.NoError(t, err)
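The rename makes the remaining call sites easier to read: entries are keyed per container, reset when a resize changes a container's resources, and garbage-collected during pod cleanup. A minimal sketch of that lifecycle (the key string here is only a stand-in for what kuberuntime.GetStableKey produces):

```go
package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/flowcontrol"
)

func main() {
	backoff := flowcontrol.NewBackOff(10*time.Second, 300*time.Second)
	key := "uid_app" // illustrative only

	now := backoff.Clock.Now()
	backoff.Next(key, now)                          // container crashed: start/extend its backoff window
	fmt.Println(backoff.IsInBackOffSince(key, now)) // true

	// What handlePodResourcesResize does when a container's resources change:
	// clear the entry so the resized container is restarted promptly.
	backoff.Reset(key)
	fmt.Println(backoff.IsInBackOffSince(key, now)) // false

	// What HandlePodCleanups does periodically: drop entries whose backoff
	// window has already elapsed so the map does not grow without bound.
	backoff.GC()
}
```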
@@ -2917,7 +2918,7 @@ func TestHandlePodResourcesResize(t *testing.T) {
             resizeStatus := kubelet.statusManager.GetPodResizeStatus(newPod.UID)
             assert.Equal(t, tt.expectedResize, resizeStatus)

-            isInBackoff := kubelet.backOff.IsInBackOffSince(backoffKey, now)
+            isInBackoff := kubelet.crashLoopBackOff.IsInBackOffSince(backoffKey, now)
             if tt.expectBackoffReset {
                 assert.False(t, isInBackoff, "container backoff should be reset")
             } else {
@@ -3403,7 +3404,7 @@ func TestSyncPodSpans(t *testing.T) {
         kubelet.os,
         kubelet,
         nil,
-        kubelet.backOff,
+        kubelet.crashLoopBackOff,
         kubeCfg.SerializeImagePulls,
         kubeCfg.MaxParallelImagePulls,
         float32(kubeCfg.RegistryPullQPS),
@@ -3933,3 +3934,86 @@ func TestIsPodResizeInProgress(t *testing.T) {
         })
     }
 }
+
+func TestCrashLoopBackOffConfiguration(t *testing.T) {
+    testCases := []struct {
+        name            string
+        featureGates    []featuregate.Feature
+        nodeDecay       metav1.Duration
+        expectedInitial time.Duration
+        expectedMax     time.Duration
+    }{
+        {
+            name:            "Prior behavior",
+            expectedMax:     time.Duration(300 * time.Second),
+            expectedInitial: time.Duration(10 * time.Second),
+        },
+        {
+            name:            "New default only",
+            featureGates:    []featuregate.Feature{features.ReduceDefaultCrashLoopBackOffDecay},
+            expectedMax:     time.Duration(60 * time.Second),
+            expectedInitial: time.Duration(1 * time.Second),
+        },
+        {
+            name:            "Faster per node config; only node config configured",
+            featureGates:    []featuregate.Feature{features.KubeletCrashLoopBackOffMax},
+            nodeDecay:       metav1.Duration{Duration: 2 * time.Second},
+            expectedMax:     time.Duration(2 * time.Second),
+            expectedInitial: time.Duration(2 * time.Second),
+        },
+        {
+            name:            "Faster per node config; new default and node config configured",
+            featureGates:    []featuregate.Feature{features.KubeletCrashLoopBackOffMax, features.ReduceDefaultCrashLoopBackOffDecay},
+            nodeDecay:       metav1.Duration{Duration: 2 * time.Second},
+            expectedMax:     time.Duration(2 * time.Second),
+            expectedInitial: time.Duration(1 * time.Second),
+        },
+        {
+            name:            "Slower per node config; new default and node config configured, set A",
+            featureGates:    []featuregate.Feature{features.KubeletCrashLoopBackOffMax, features.ReduceDefaultCrashLoopBackOffDecay},
+            nodeDecay:       metav1.Duration{Duration: 10 * time.Second},
+            expectedMax:     time.Duration(10 * time.Second),
+            expectedInitial: time.Duration(1 * time.Second),
+        },
+        {
+            name:            "Slower per node config; new default and node config configured, set B",
+            featureGates:    []featuregate.Feature{features.KubeletCrashLoopBackOffMax, features.ReduceDefaultCrashLoopBackOffDecay},
+            nodeDecay:       metav1.Duration{Duration: 300 * time.Second},
+            expectedMax:     time.Duration(300 * time.Second),
+            expectedInitial: time.Duration(1 * time.Second),
+        },
+        {
+            name:            "Slower per node config; only node config configured, set A",
+            featureGates:    []featuregate.Feature{features.KubeletCrashLoopBackOffMax},
+            nodeDecay:       metav1.Duration{Duration: 11 * time.Second},
+            expectedMax:     time.Duration(11 * time.Second),
+            expectedInitial: time.Duration(10 * time.Second),
+        },
+        {
+            name:            "Slower per node config; only node config configured, set B",
+            featureGates:    []featuregate.Feature{features.KubeletCrashLoopBackOffMax},
+            nodeDecay:       metav1.Duration{Duration: 300 * time.Second},
+            expectedMax:     time.Duration(300 * time.Second),
+            expectedInitial: time.Duration(10 * time.Second),
+        },
+    }
+
+    for _, tc := range testCases {
+        t.Run(tc.name, func(t *testing.T) {
+            kubeCfg := &kubeletconfiginternal.KubeletConfiguration{}
+
+            for _, f := range tc.featureGates {
+                featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, f, true)
+            }
+            if tc.nodeDecay.Duration > 0 {
+                kubeCfg.CrashLoopBackOff.MaxContainerRestartPeriod = &tc.nodeDecay
+            }
+
+            resultMax, resultInitial := newCrashLoopBackOff(kubeCfg)
+
+            assert.Equalf(t, tc.expectedMax, resultMax, "wrong max calculated, want: %v, got %v", tc.expectedMax, resultMax)
+            assert.Equalf(t, tc.expectedInitial, resultInitial, "wrong base calculated, want: %v, got %v", tc.expectedInitial, resultInitial)
+        })
+    }
+}
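A pure-function paraphrase (a sketch, not the patch's code) of the precedence rules that TestCrashLoopBackOffConfiguration pins down: gate-dependent defaults first, then a configured per-node maximum wins, with the initial delay clamped so it never exceeds the maximum.

```go
package main

import (
	"fmt"
	"time"
)

// crashLoopBounds restates newCrashLoopBackOff's precedence in isolation:
// reducedDefaults stands for ReduceDefaultCrashLoopBackOffDecay, nodeMax for
// KubeletCrashLoopBackOffMax plus CrashLoopBackOff.MaxContainerRestartPeriod.
func crashLoopBounds(reducedDefaults bool, nodeMax *time.Duration) (max, initial time.Duration) {
	max, initial = 300*time.Second, 10*time.Second
	if reducedDefaults {
		max, initial = 60*time.Second, 1*time.Second
	}
	if nodeMax != nil {
		max = *nodeMax
		if max < initial {
			initial = max
		}
	}
	return max, initial
}

func main() {
	two := 2 * time.Second
	fmt.Println(crashLoopBounds(false, nil))  // 5m0s 10s ("Prior behavior")
	fmt.Println(crashLoopBounds(true, nil))   // 1m0s 1s  ("New default only")
	fmt.Println(crashLoopBounds(false, &two)) // 2s 2s    ("Faster per node config; only node config configured")
	fmt.Println(crashLoopBounds(true, &two))  // 2s 1s    ("Faster per node config; new default and node config configured")
}
```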
diff --git a/test/e2e/common/node/pods.go b/test/e2e/common/node/pods.go
index 31eed98d661..bea2c02b910 100644
--- a/test/e2e/common/node/pods.go
+++ b/test/e2e/common/node/pods.go
@@ -64,7 +64,7 @@ import (
 const (
     buildBackOffDuration = time.Minute
     syncLoopFrequency    = 10 * time.Second
-    maxBackOffTolerance  = time.Duration(1.3 * float64(kubelet.MaxContainerBackOff))
+    maxBackOffTolerance  = time.Duration(1.3 * float64(kubelet.MaxCrashLoopBackOff))
     podRetryPeriod       = 1 * time.Second
 )

@@ -739,7 +739,7 @@ var _ = SIGDescribe("Pods", func() {
         })
         podClient.CreateSync(ctx, pod)

-        time.Sleep(2 * kubelet.MaxContainerBackOff) // it takes slightly more than 2*x to get to a back-off of x
+        time.Sleep(2 * kubelet.MaxCrashLoopBackOff) // it takes slightly more than 2*x to get to a back-off of x

         // wait for a delay == capped delay of MaxContainerBackOff
         ginkgo.By("getting restart delay when capped")
@@ -753,13 +753,13 @@ var _ = SIGDescribe("Pods", func() {
                 framework.Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
             }

-            if delay1 < kubelet.MaxContainerBackOff {
+            if delay1 < kubelet.MaxCrashLoopBackOff {
                 continue
             }
         }

-        if (delay1 < kubelet.MaxContainerBackOff) || (delay1 > maxBackOffTolerance) {
-            framework.Failf("expected %s back-off got=%s in delay1", kubelet.MaxContainerBackOff, delay1)
+        if (delay1 < kubelet.MaxCrashLoopBackOff) || (delay1 > maxBackOffTolerance) {
+            framework.Failf("expected %s back-off got=%s in delay1", kubelet.MaxCrashLoopBackOff, delay1)
         }

         ginkgo.By("getting restart delay after a capped delay")
@@ -768,8 +768,8 @@ var _ = SIGDescribe("Pods", func() {
             framework.Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
         }

-        if delay2 < kubelet.MaxContainerBackOff || delay2 > maxBackOffTolerance { // syncloop cumulative drift
-            framework.Failf("expected %s back-off got=%s on delay2", kubelet.MaxContainerBackOff, delay2)
+        if delay2 < kubelet.MaxCrashLoopBackOff || delay2 > maxBackOffTolerance { // syncloop cumulative drift
+            framework.Failf("expected %s back-off got=%s on delay2", kubelet.MaxCrashLoopBackOff, delay2)
         }
     })
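The arithmetic behind the e2e constants above, shown standalone (the 300s value mirrors kubelet.MaxCrashLoopBackOff, i.e. v1beta1.MaxContainerBackOff):

```go
package main

import (
	"fmt"
	"time"
)

// maxCrashLoopBackOff mirrors kubelet.MaxCrashLoopBackOff (300s) so this
// illustration stays self-contained.
const maxCrashLoopBackOff = 300 * time.Second

func main() {
	// The test accepts a capped delay d only if
	// maxCrashLoopBackOff <= d <= 1.3*maxCrashLoopBackOff (sync-loop drift headroom).
	tolerance := time.Duration(1.3 * float64(maxCrashLoopBackOff))
	fmt.Println(tolerance) // 6m30s

	// It sleeps 2*maxCrashLoopBackOff first because, with doubling delays, the
	// wall-clock time to reach a back-off of x is a bit more than 2*x:
	total := time.Duration(0)
	for d := 10 * time.Second; d < maxCrashLoopBackOff; d *= 2 {
		total += d // 10+20+40+80+160 = 310s
	}
	total += maxCrashLoopBackOff // first capped delay
	fmt.Println(total)           // 10m10s, slightly more than 2*300s
}
```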
diff --git a/test/e2e_node/container_restart_test.go b/test/e2e_node/container_restart_test.go
index 086ce2d7d4c..7ed4992043d 100644
--- a/test/e2e_node/container_restart_test.go
+++ b/test/e2e_node/container_restart_test.go
@@ -89,6 +89,54 @@ var _ = SIGDescribe("Container Restart", feature.CriProxy, framework.WithSerial(
             doTest(ctx, f, 3, containerName, 13)
         })
     })
+
+    ginkgo.Context("Reduced default container restart backs off as expected", func() {
+
+        tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
+            initialConfig.FeatureGates = map[string]bool{"ReduceDefaultCrashLoopBackOffDecay": true}
+        })
+
+        ginkgo.BeforeEach(func() {
+            if err := resetCRIProxyInjector(e2eCriProxy); err != nil {
+                ginkgo.Skip("Skip the test since the CRI Proxy is undefined.")
+            }
+        })
+
+        ginkgo.AfterEach(func() {
+            err := resetCRIProxyInjector(e2eCriProxy)
+            framework.ExpectNoError(err)
+        })
+
+        ginkgo.It("Reduced default restart backs off.", func(ctx context.Context) {
+            // 0s, 0s, 10s, 30s, 60s, 90s, 120s, 150s, 180s, 210s, 240s, 270s, 300s
+            doTest(ctx, f, 3, containerName, 13)
+        })
+    })
+
+    ginkgo.Context("Lower node config container restart takes precedence", func() {
+
+        tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
+            initialConfig.FeatureGates = map[string]bool{"ReduceDefaultCrashLoopBackOffDecay": true, "KubeletCrashLoopBackOffMax": true}
+            initialConfig.CrashLoopBackOff.MaxContainerRestartPeriod = &metav1.Duration{Duration: time.Duration(1 * time.Second)}
+        })
+
+        ginkgo.BeforeEach(func() {
+            if err := resetCRIProxyInjector(e2eCriProxy); err != nil {
+                ginkgo.Skip("Skip the test since the CRI Proxy is undefined.")
+            }
+        })
+
+        ginkgo.AfterEach(func() {
+            err := resetCRIProxyInjector(e2eCriProxy)
+            framework.ExpectNoError(err)
+        })
+
+        ginkgo.It("Lower node config restart backs off.", func(ctx context.Context) {
+            // 0s, 0s, 1s, 2s, 3s, 4s, 5s, 6s, 7s, and so on
+            doTest(ctx, f, 3, containerName, 298)
+        })
+    })
 })

 func doTest(ctx context.Context, f *framework.Framework, targetRestarts int, containerName string, maxRestarts int) {
diff --git a/test/featuregates_linter/test_data/versioned_feature_list.yaml b/test/featuregates_linter/test_data/versioned_feature_list.yaml
index fa9db33fc57..f9d029c55bf 100644
--- a/test/featuregates_linter/test_data/versioned_feature_list.yaml
+++ b/test/featuregates_linter/test_data/versioned_feature_list.yaml
@@ -1048,6 +1048,12 @@
     lockToDefault: true
     preRelease: GA
     version: "1.33"
+- name: ReduceDefaultCrashLoopBackOffDecay
+  versionedSpecs:
+  - default: false
+    lockToDefault: false
+    preRelease: Alpha
+    version: "1.33"
 - name: RelaxedDNSSearchValidation
   versionedSpecs:
   - default: false
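To see why the two new node e2e contexts allow such different restart budgets, here is a rough sketch that assumes a doubling, clamped backoff and ignores crash duration and sync-loop timing (so it does not reproduce the schedules in the comments above exactly):

```go
package main

import (
	"fmt"
	"time"
)

// cumulative approximates the wall-clock time at which the n-th restart
// happens for a doubling backoff with the given initial delay and cap.
func cumulative(initial, max time.Duration, n int) time.Duration {
	var total time.Duration
	d := initial
	for i := 0; i < n; i++ {
		total += d
		d *= 2
		if d > max {
			d = max
		}
	}
	return total
}

func main() {
	// Reduced defaults (1s initial, 60s cap): the 13th restart is already
	// roughly eight minutes out.
	fmt.Println(cumulative(1*time.Second, 60*time.Second, 13)) // 8m3s
	// A 1s node-level cap keeps every delay at 1s, so hundreds of restarts
	// fit in the same window, hence the maxRestarts of 298 above.
	fmt.Println(cumulative(1*time.Second, 1*time.Second, 298)) // 4m58s
}
```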