Add featuregate, feature and unit and e2e tests
Signed-off-by: lauralorenz <lauralorenz@google.com>
Signed-off-by: Laura Lorenz <lauralorenz@google.com>

parent 2261137135
commit 256a576651
@@ -565,6 +565,13 @@ const (
 	// Allows recursive read-only mounts.
 	RecursiveReadOnlyMounts featuregate.Feature = "RecursiveReadOnlyMounts"
 
+	// owner: @lauralorenz
+	// kep: https://kep.k8s.io/4603
+	//
+	// Enables support for a lower internal cluster-wide backoff maximum for restarting
+	// containers (aka containers in CrashLoopBackOff)
+	ReduceDefaultCrashLoopBackOffDecay featuregate.Feature = "ReduceDefaultCrashLoopBackOffDecay"
+
 	// owner: @adrianmoisey
 	// kep: https://kep.k8s.io/4427
 	//
@@ -622,6 +622,10 @@ var defaultVersionedKubernetesFeatureGates = map[featuregate.Feature]featuregate
 		{Version: version.MustParse("1.33"), Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.36
 	},
 
+	ReduceDefaultCrashLoopBackOffDecay: {
+		{Version: version.MustParse("1.33"), Default: false, PreRelease: featuregate.Alpha},
+	},
+
 	RelaxedDNSSearchValidation: {
 		{Version: version.MustParse("1.32"), Default: false, PreRelease: featuregate.Alpha},
 		{Version: version.MustParse("1.33"), Default: true, PreRelease: featuregate.Beta},
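For context, components consult the new gate through the standard feature-gate helper; a minimal sketch (not part of the diff), using the same upstream package paths this commit relies on:

    import (
    	utilfeature "k8s.io/apiserver/pkg/util/feature"

    	"k8s.io/kubernetes/pkg/features"
    )

    // reducedDecayEnabled reports whether the lower CrashLoopBackOff defaults
    // apply. Illustrative helper; the kubelet performs the same check inline
    // in newCrashLoopBackOff below.
    func reducedDecayEnabled() bool {
    	return utilfeature.DefaultFeatureGate.Enabled(features.ReduceDefaultCrashLoopBackOffDecay)
    }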
@@ -153,8 +153,19 @@ const (
 	// DefaultContainerLogsDir is the location of container logs.
 	DefaultContainerLogsDir = "/var/log/containers"
 
-	// MaxContainerBackOff is the max backoff period for container restarts, exported for the e2e test
-	MaxContainerBackOff = v1beta1.MaxContainerBackOff
+	// MaxCrashLoopBackOff is the max backoff period for container restarts, exported for the e2e test
+	MaxCrashLoopBackOff = v1beta1.MaxContainerBackOff
+
+	// reducedMaxCrashLoopBackOff is the default max backoff period for container restarts when the alpha feature
+	// gate ReduceDefaultCrashLoopBackOffDecay is enabled
+	reducedMaxCrashLoopBackOff = 60 * time.Second
+
+	// Initial period for the exponential backoff for container restarts.
+	initialCrashLoopBackOff = time.Second * 10
+
+	// reducedInitialCrashLoopBackOff is the default initial backoff period for container restarts when the alpha feature
+	// gate ReduceDefaultCrashLoopBackOffDecay is enabled
+	reducedInitialCrashLoopBackOff = 1 * time.Second
 
 	// MaxImageBackOff is the max backoff period for image pulls, exported for the e2e test
 	MaxImageBackOff = 300 * time.Second

@@ -204,9 +215,6 @@ const (
 	// error.
 	backOffPeriod = time.Second * 10
 
-	// Initial period for the exponential backoff for container restarts.
-	containerBackOffPeriod = time.Second * 10
-
 	// Initial period for the exponential backoff for image pulls.
 	imageBackOffPeriod = time.Second * 10
@@ -320,6 +328,27 @@ type Dependencies struct {
 	useLegacyCadvisorStats bool
 }
 
+// newCrashLoopBackOff configures the backoff maximum to be used
+// by kubelet for container restarts depending on the alpha gates
+// and kubelet configuration set
+func newCrashLoopBackOff(kubeCfg *kubeletconfiginternal.KubeletConfiguration) (time.Duration, time.Duration) {
+	boMax := MaxCrashLoopBackOff
+	boInitial := initialCrashLoopBackOff
+	if utilfeature.DefaultFeatureGate.Enabled(features.ReduceDefaultCrashLoopBackOffDecay) {
+		boMax = reducedMaxCrashLoopBackOff
+		boInitial = reducedInitialCrashLoopBackOff
+	}
+
+	if utilfeature.DefaultFeatureGate.Enabled(features.KubeletCrashLoopBackOffMax) {
+		// operator-invoked configuration always has precedence if valid
+		boMax = kubeCfg.CrashLoopBackOff.MaxContainerRestartPeriod.Duration
+		if boMax < boInitial {
+			boInitial = boMax
+		}
+	}
+	return boMax, boInitial
+}
+
 // makePodSourceConfig creates a config.PodConfig from the given
 // KubeletConfiguration or returns an error.
 func makePodSourceConfig(kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeDeps *Dependencies, nodeName types.NodeName, nodeHasSynced func() bool) (*config.PodConfig, error) {
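To make the precedence concrete, here is how the two gates combine; the values come from the constants above and are mirrored by the unit test added later in this commit (illustrative sketch, not part of the diff):

    // Both gates off:                              max = 300s, initial = 10s
    // ReduceDefaultCrashLoopBackOffDecay only:     max = 60s,  initial = 1s
    // KubeletCrashLoopBackOffMax, node max = 2s:   max = 2s,   initial = 2s (clamped)
    // Both gates on, node max = 2s:                max = 2s,   initial = 1s
    boMax, boInitial := newCrashLoopBackOff(kubeCfg)
    klet.crashLoopBackOff = flowcontrol.NewBackOff(boInitial, boMax)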
@@ -936,16 +965,10 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
 		kubeDeps.Recorder,
 		volumepathhandler.NewBlockVolumePathHandler())
 
-	boMax := MaxContainerBackOff
-	base := containerBackOffPeriod
-	if utilfeature.DefaultFeatureGate.Enabled(features.KubeletCrashLoopBackOffMax) {
-		boMax = kubeCfg.CrashLoopBackOff.MaxContainerRestartPeriod.Duration
-		if boMax < containerBackOffPeriod {
-			base = boMax
-		}
-	}
-	klet.backOff = flowcontrol.NewBackOff(base, boMax)
-	klet.backOff.HasExpiredFunc = func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
+	boMax, base := newCrashLoopBackOff(kubeCfg)
+
+	klet.crashLoopBackOff = flowcontrol.NewBackOff(base, boMax)
+	klet.crashLoopBackOff.HasExpiredFunc = func(eventTime time.Time, lastUpdate time.Time, maxDuration time.Duration) bool {
 		return eventTime.Sub(lastUpdate) > 600*time.Second
 	}
@@ -1346,7 +1369,7 @@ type Kubelet struct {
 	syncLoopMonitor atomic.Value
 
 	// Container restart Backoff
-	backOff *flowcontrol.Backoff
+	crashLoopBackOff *flowcontrol.Backoff
 
 	// Information about the ports which are opened by daemons on Node running this Kubelet server.
 	daemonEndpoints *v1.NodeDaemonEndpoints
@@ -2033,7 +2056,7 @@ func (kl *Kubelet) SyncPod(ctx context.Context, updateType kubetypes.SyncPodType
 	// Use WithoutCancel instead of a new context.TODO() to propagate trace context
 	// Call the container runtime's SyncPod callback
 	sctx := context.WithoutCancel(ctx)
-	result := kl.containerRuntime.SyncPod(sctx, pod, podStatus, pullSecrets, kl.backOff)
+	result := kl.containerRuntime.SyncPod(sctx, pod, podStatus, pullSecrets, kl.crashLoopBackOff)
 	kl.reasonCache.Update(pod.UID, result)
 	if err := result.Error(); err != nil {
 		// Do not return error if the only failures were pods in backoff
@@ -2922,14 +2945,14 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine
 	for i, container := range pod.Spec.Containers {
 		if !apiequality.Semantic.DeepEqual(container.Resources, allocatedPod.Spec.Containers[i].Resources) {
 			key := kuberuntime.GetStableKey(pod, &container)
-			kl.backOff.Reset(key)
+			kl.crashLoopBackOff.Reset(key)
 		}
 	}
 	for i, container := range pod.Spec.InitContainers {
 		if podutil.IsRestartableInitContainer(&container) {
 			if !apiequality.Semantic.DeepEqual(container.Resources, allocatedPod.Spec.InitContainers[i].Resources) {
 				key := kuberuntime.GetStableKey(pod, &container)
-				kl.backOff.Reset(key)
+				kl.crashLoopBackOff.Reset(key)
 			}
 		}
 	}
@@ -1379,7 +1379,7 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
 	}
 
 	// Cleanup any backoff entries.
-	kl.backOff.GC()
+	kl.crashLoopBackOff.GC()
 	return nil
 }
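Taken together, the renamed field follows the usual flowcontrol.Backoff lifecycle. A brief sketch of the calls this commit touches (all methods are from k8s.io/client-go/util/flowcontrol, keyed by the stable container key seen above):

    key := kuberuntime.GetStableKey(pod, &container)

    // Failed restart: advance the exponential delay for this container.
    kl.crashLoopBackOff.Next(key, kl.clock.Now())

    // Resources resized: clear the entry so the container may restart promptly.
    kl.crashLoopBackOff.Reset(key)

    // Pod cleanup: drop entries the HasExpiredFunc installed in NewMainKubelet
    // considers stale (no update for 600s).
    kl.crashLoopBackOff.GC()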
@@ -55,6 +55,7 @@ import (
 	"k8s.io/client-go/kubernetes/fake"
 	"k8s.io/client-go/tools/record"
 	"k8s.io/client-go/util/flowcontrol"
+	"k8s.io/component-base/featuregate"
 	featuregatetesting "k8s.io/component-base/featuregate/testing"
 	"k8s.io/component-base/metrics/testutil"
 	internalapi "k8s.io/cri-api/pkg/apis"
@@ -337,8 +338,8 @@ func newTestKubeletWithImageList(
 	kubelet.containerGC = containerGC
 
 	fakeClock := testingclock.NewFakeClock(time.Now())
-	kubelet.backOff = flowcontrol.NewBackOff(time.Second, time.Minute)
-	kubelet.backOff.Clock = fakeClock
+	kubelet.crashLoopBackOff = flowcontrol.NewBackOff(time.Second, time.Minute)
+	kubelet.crashLoopBackOff.Clock = fakeClock
 	kubelet.resyncInterval = 10 * time.Second
 	kubelet.workQueue = queue.NewBasicWorkQueue(fakeClock)
 	// Relist period does not affect the tests.
@@ -2895,7 +2896,7 @@ func TestHandlePodResourcesResize(t *testing.T) {
 	now := kubelet.clock.Now()
 	// Put the container in backoff so we can confirm backoff is reset.
 	backoffKey := kuberuntime.GetStableKey(originalPod, originalCtr)
-	kubelet.backOff.Next(backoffKey, now)
+	kubelet.crashLoopBackOff.Next(backoffKey, now)
 
 	updatedPod, err := kubelet.handlePodResourcesResize(newPod, podStatus)
 	require.NoError(t, err)
@@ -2917,7 +2918,7 @@ func TestHandlePodResourcesResize(t *testing.T) {
 	resizeStatus := kubelet.statusManager.GetPodResizeStatus(newPod.UID)
 	assert.Equal(t, tt.expectedResize, resizeStatus)
 
-	isInBackoff := kubelet.backOff.IsInBackOffSince(backoffKey, now)
+	isInBackoff := kubelet.crashLoopBackOff.IsInBackOffSince(backoffKey, now)
 	if tt.expectBackoffReset {
 		assert.False(t, isInBackoff, "container backoff should be reset")
 	} else {
@@ -3403,7 +3404,7 @@ func TestSyncPodSpans(t *testing.T) {
 		kubelet.os,
 		kubelet,
 		nil,
-		kubelet.backOff,
+		kubelet.crashLoopBackOff,
 		kubeCfg.SerializeImagePulls,
 		kubeCfg.MaxParallelImagePulls,
 		float32(kubeCfg.RegistryPullQPS),
@@ -3933,3 +3934,86 @@ func TestIsPodResizeInProgress(t *testing.T) {
 		})
 	}
 }
+
+func TestCrashLoopBackOffConfiguration(t *testing.T) {
+	testCases := []struct {
+		name            string
+		featureGates    []featuregate.Feature
+		nodeDecay       metav1.Duration
+		expectedInitial time.Duration
+		expectedMax     time.Duration
+	}{
+		{
+			name:            "Prior behavior",
+			expectedMax:     time.Duration(300 * time.Second),
+			expectedInitial: time.Duration(10 * time.Second),
+		},
+		{
+			name:            "New default only",
+			featureGates:    []featuregate.Feature{features.ReduceDefaultCrashLoopBackOffDecay},
+			expectedMax:     time.Duration(60 * time.Second),
+			expectedInitial: time.Duration(1 * time.Second),
+		},
+		{
+			name:            "Faster per node config; only node config configured",
+			featureGates:    []featuregate.Feature{features.KubeletCrashLoopBackOffMax},
+			nodeDecay:       metav1.Duration{Duration: 2 * time.Second},
+			expectedMax:     time.Duration(2 * time.Second),
+			expectedInitial: time.Duration(2 * time.Second),
+		},
+		{
+			name:            "Faster per node config; new default and node config configured",
+			featureGates:    []featuregate.Feature{features.KubeletCrashLoopBackOffMax, features.ReduceDefaultCrashLoopBackOffDecay},
+			nodeDecay:       metav1.Duration{Duration: 2 * time.Second},
+			expectedMax:     time.Duration(2 * time.Second),
+			expectedInitial: time.Duration(1 * time.Second),
+		},
+		{
+			name:            "Slower per node config; new default and node config configured, set A",
+			featureGates:    []featuregate.Feature{features.KubeletCrashLoopBackOffMax, features.ReduceDefaultCrashLoopBackOffDecay},
+			nodeDecay:       metav1.Duration{Duration: 10 * time.Second},
+			expectedMax:     time.Duration(10 * time.Second),
+			expectedInitial: time.Duration(1 * time.Second),
+		},
+		{
+			name:            "Slower per node config; new default and node config configured, set B",
+			featureGates:    []featuregate.Feature{features.KubeletCrashLoopBackOffMax, features.ReduceDefaultCrashLoopBackOffDecay},
+			nodeDecay:       metav1.Duration{Duration: 300 * time.Second},
+			expectedMax:     time.Duration(300 * time.Second),
+			expectedInitial: time.Duration(1 * time.Second),
+		},
+		{
+			name:            "Slower per node config; only node config configured, set A",
+			featureGates:    []featuregate.Feature{features.KubeletCrashLoopBackOffMax},
+			nodeDecay:       metav1.Duration{Duration: 11 * time.Second},
+			expectedMax:     time.Duration(11 * time.Second),
+			expectedInitial: time.Duration(10 * time.Second),
+		},
+		{
+			name:            "Slower per node config; only node config configured, set B",
+			featureGates:    []featuregate.Feature{features.KubeletCrashLoopBackOffMax},
+			nodeDecay:       metav1.Duration{Duration: 300 * time.Second},
+			expectedMax:     time.Duration(300 * time.Second),
+			expectedInitial: time.Duration(10 * time.Second),
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			kubeCfg := &kubeletconfiginternal.KubeletConfiguration{}
+
+			for _, f := range tc.featureGates {
+				featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, f, true)
+			}
+			if tc.nodeDecay.Duration > 0 {
+				kubeCfg.CrashLoopBackOff.MaxContainerRestartPeriod = &tc.nodeDecay
+			}
+
+			resultMax, resultInitial := newCrashLoopBackOff(kubeCfg)
+
+			assert.Equalf(t, tc.expectedMax, resultMax, "wrong max calculated, want: %v, got %v", tc.expectedMax, resultMax)
+			assert.Equalf(t, tc.expectedInitial, resultInitial, "wrong base calculated, want: %v, got %v", tc.expectedInitial, resultInitial)
+		})
+	}
+}
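To run just this table-driven test locally, something like the following should work (package path assumed from the kubelet sources this diff modifies):

    go test ./pkg/kubelet/ -run TestCrashLoopBackOffConfiguration -v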
@@ -64,7 +64,7 @@ import (
 const (
 	buildBackOffDuration = time.Minute
 	syncLoopFrequency    = 10 * time.Second
-	maxBackOffTolerance  = time.Duration(1.3 * float64(kubelet.MaxContainerBackOff))
+	maxBackOffTolerance  = time.Duration(1.3 * float64(kubelet.MaxCrashLoopBackOff))
 	podRetryPeriod       = 1 * time.Second
 )
@@ -739,7 +739,7 @@ var _ = SIGDescribe("Pods", func() {
 		})
 
 		podClient.CreateSync(ctx, pod)
-		time.Sleep(2 * kubelet.MaxContainerBackOff) // it takes slightly more than 2*x to get to a back-off of x
+		time.Sleep(2 * kubelet.MaxCrashLoopBackOff) // it takes slightly more than 2*x to get to a back-off of x
 
 		// wait for a delay == capped delay of MaxContainerBackOff
 		ginkgo.By("getting restart delay when capped")
@@ -753,13 +753,13 @@ var _ = SIGDescribe("Pods", func() {
 			framework.Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
 		}
 
-		if delay1 < kubelet.MaxContainerBackOff {
+		if delay1 < kubelet.MaxCrashLoopBackOff {
 			continue
 		}
 	}
 
-	if (delay1 < kubelet.MaxContainerBackOff) || (delay1 > maxBackOffTolerance) {
-		framework.Failf("expected %s back-off got=%s in delay1", kubelet.MaxContainerBackOff, delay1)
+	if (delay1 < kubelet.MaxCrashLoopBackOff) || (delay1 > maxBackOffTolerance) {
+		framework.Failf("expected %s back-off got=%s in delay1", kubelet.MaxCrashLoopBackOff, delay1)
 	}
 
 	ginkgo.By("getting restart delay after a capped delay")
@@ -768,8 +768,8 @@ var _ = SIGDescribe("Pods", func() {
 		framework.Failf("timed out waiting for container restart in pod=%s/%s", podName, containerName)
 	}
 
-	if delay2 < kubelet.MaxContainerBackOff || delay2 > maxBackOffTolerance { // syncloop cumulative drift
-		framework.Failf("expected %s back-off got=%s on delay2", kubelet.MaxContainerBackOff, delay2)
+	if delay2 < kubelet.MaxCrashLoopBackOff || delay2 > maxBackOffTolerance { // syncloop cumulative drift
+		framework.Failf("expected %s back-off got=%s on delay2", kubelet.MaxCrashLoopBackOff, delay2)
 	}
 
 })
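For reference, with kubelet.MaxCrashLoopBackOff still bound to the 300s default, the tolerance constant above works out the same as before the rename (sketch of the arithmetic):

    maxBackOffTolerance = time.Duration(1.3 * float64(300 * time.Second)) // = 390s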
@@ -89,6 +89,54 @@ var _ = SIGDescribe("Container Restart", feature.CriProxy, framework.WithSerial(
 			doTest(ctx, f, 3, containerName, 13)
 		})
 	})
 
+	ginkgo.Context("Reduced default container restart backs off as expected", func() {
+
+		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
+			initialConfig.FeatureGates = map[string]bool{"ReduceDefaultCrashLoopBackOffDecay": true}
+		})
+
+		ginkgo.BeforeEach(func() {
+			if err := resetCRIProxyInjector(e2eCriProxy); err != nil {
+				ginkgo.Skip("Skip the test since the CRI Proxy is undefined.")
+			}
+		})
+
+		ginkgo.AfterEach(func() {
+			err := resetCRIProxyInjector(e2eCriProxy)
+			framework.ExpectNoError(err)
+		})
+
+		ginkgo.It("Reduced default restart backs off.", func(ctx context.Context) {
+			// 0s, 0s, 10s, 30s, 60s, 90s, 120s, 150s, 180s, 210s, 240s, 270s, 300s
+			doTest(ctx, f, 3, containerName, 13)
+		})
+	})
+
+	ginkgo.Context("Lower node config container restart takes precedence", func() {
+
+		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
+			// Enable both gates in a single assignment; two successive
+			// assignments to initialConfig.FeatureGates would overwrite
+			// each other and silently drop the first gate.
+			initialConfig.FeatureGates = map[string]bool{
+				"ReduceDefaultCrashLoopBackOffDecay": true,
+				"KubeletCrashLoopBackOffMax":         true,
+			}
+			initialConfig.CrashLoopBackOff.MaxContainerRestartPeriod = &metav1.Duration{Duration: time.Duration(1 * time.Second)}
+		})
+
+		ginkgo.BeforeEach(func() {
+			if err := resetCRIProxyInjector(e2eCriProxy); err != nil {
+				ginkgo.Skip("Skip the test since the CRI Proxy is undefined.")
+			}
+		})
+
+		ginkgo.AfterEach(func() {
+			err := resetCRIProxyInjector(e2eCriProxy)
+			framework.ExpectNoError(err)
+		})
+
+		ginkgo.It("Lower node config restart backs off.", func(ctx context.Context) {
+			// 0s, 0s, 1s, 2s, 3s, 4s, 5s, 6s, 7s, and so on
+			doTest(ctx, f, 3, containerName, 298)
+		})
+	})
 })
 
 func doTest(ctx context.Context, f *framework.Framework, targetRestarts int, containerName string, maxRestarts int) {
@@ -1048,6 +1048,12 @@
     lockToDefault: true
     preRelease: GA
     version: "1.33"
+- name: ReduceDefaultCrashLoopBackOffDecay
+  versionedSpecs:
+  - default: false
+    lockToDefault: false
+    preRelease: Alpha
+    version: "1.33"
 - name: RelaxedDNSSearchValidation
   versionedSpecs:
   - default: false