From d82f6069708ff25ecff48464bee82bc570b55641 Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Wed, 17 Nov 2021 11:46:51 +0800 Subject: [PATCH 1/5] Add field for KubeletConfiguration and Regenerate --- api/api-rules/violation_exceptions.list | 1 + pkg/features/kube_features.go | 7 ++++ pkg/kubelet/apis/config/types.go | 17 +++++++++ .../config/v1beta1/zz_generated.conversion.go | 34 +++++++++++++++++ .../apis/config/zz_generated.deepcopy.go | 21 +++++++++++ .../k8s.io/kubelet/config/v1beta1/types.go | 37 +++++++++++++++++++ .../config/v1beta1/zz_generated.deepcopy.go | 21 +++++++++++ 7 files changed, 138 insertions(+) diff --git a/api/api-rules/violation_exceptions.list b/api/api-rules/violation_exceptions.list index d8ea748b82e..b1279ab3e0a 100644 --- a/api/api-rules/violation_exceptions.list +++ b/api/api-rules/violation_exceptions.list @@ -381,6 +381,7 @@ API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfi API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,EnforceNodeAllocatable API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,RegisterWithTaints API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,ReservedMemory +API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,ShutdownGracePeriodByPodPriority API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,TLSCipherSuites API rule violation: list_type_missing,k8s.io/metrics/pkg/apis/metrics/v1alpha1,PodMetrics,Containers API rule violation: list_type_missing,k8s.io/metrics/pkg/apis/metrics/v1beta1,PodMetrics,Containers diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index a857abcf155..3f130c0196c 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -585,6 +585,12 @@ const ( // Adds support for kubelet to detect node shutdown and gracefully terminate pods prior to the node being shutdown. GracefulNodeShutdown featuregate.Feature = "GracefulNodeShutdown" + // owner: @wzshiming + // alpha: v1.23 + // + // Make the kubelet use shutdown configuration based on pod priority values for graceful shutdown. 
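+	// The per-priority grace periods come from the kubelet's ShutdownGracePeriodByPodPriority
+	// configuration, which must not be combined with ShutdownGracePeriod or
+	// ShutdownGracePeriodCriticalPods.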
+ GracefulNodeShutdownBasedOnPodPriority featuregate.Feature = "GracefulNodeShutdownBasedOnPodPriority" + // owner: @andrewsykim @uablrek // kep: http://kep.k8s.io/1864 // alpha: v1.20 @@ -920,6 +926,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS ExecProbeTimeout: {Default: true, PreRelease: featuregate.GA}, // lock to default and remove after v1.22 based on KEP #1972 update KubeletCredentialProviders: {Default: false, PreRelease: featuregate.Alpha}, GracefulNodeShutdown: {Default: true, PreRelease: featuregate.Beta}, + GracefulNodeShutdownBasedOnPodPriority: {Default: false, PreRelease: featuregate.Alpha}, ServiceLBNodePortControl: {Default: true, PreRelease: featuregate.Beta}, MixedProtocolLBService: {Default: false, PreRelease: featuregate.Alpha}, VolumeCapacityPriority: {Default: false, PreRelease: featuregate.Alpha}, diff --git a/pkg/kubelet/apis/config/types.go b/pkg/kubelet/apis/config/types.go index 4855b3e2a04..bf6557b70a3 100644 --- a/pkg/kubelet/apis/config/types.go +++ b/pkg/kubelet/apis/config/types.go @@ -397,6 +397,15 @@ type KubeletConfiguration struct { // +featureGate=GracefulNodeShutdown // +optional ShutdownGracePeriodCriticalPods metav1.Duration + // ShutdownGracePeriodByPodPriority specifies the shutdown grace period for Pods based + // on their associated priority class value. + // When a shutdown request is received, the Kubelet will initiate shutdown on all pods + // running on the node with a grace period that depends on the priority of the pod, + // and then wait for all pods to exit. + // Each entry in the array represents the graceful shutdown time a pod with a priority + // class value that lies in the range of that value and the next higher entry in the + // list when the node is shutting down. + ShutdownGracePeriodByPodPriority []ShutdownGracePeriodByPodPriority // ReservedMemory specifies a comma-separated list of memory reservations for NUMA nodes. // The parameter makes sense only in the context of the memory manager feature. The memory manager will not allocate reserved memory for container workloads. // For example, if you have a NUMA0 with 10Gi of memory and the ReservedMemory was specified to reserve 1Gi of memory at NUMA0, @@ -595,6 +604,14 @@ type MemoryReservation struct { Limits v1.ResourceList } +// ShutdownGracePeriodByPodPriority specifies the shutdown grace period for Pods based on their associated priority class value +type ShutdownGracePeriodByPodPriority struct { + // priority is the priority value associated with the shutdown grace period + Priority int32 + // shutdownGracePeriodSeconds is the shutdown grace period in seconds + ShutdownGracePeriodSeconds int64 +} + type MemorySwapConfiguration struct { // swapBehavior configures swap memory available to container workloads. 
May be one of // "", "LimitedSwap": workload combined memory and swap usage cannot exceed pod memory limit diff --git a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go index 7c4dbe4203d..95659af0de0 100644 --- a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go +++ b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go @@ -140,6 +140,16 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddGeneratedConversionFunc((*v1beta1.ShutdownGracePeriodByPodPriority)(nil), (*config.ShutdownGracePeriodByPodPriority)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1beta1_ShutdownGracePeriodByPodPriority_To_config_ShutdownGracePeriodByPodPriority(a.(*v1beta1.ShutdownGracePeriodByPodPriority), b.(*config.ShutdownGracePeriodByPodPriority), scope) + }); err != nil { + return err + } + if err := s.AddGeneratedConversionFunc((*config.ShutdownGracePeriodByPodPriority)(nil), (*v1beta1.ShutdownGracePeriodByPodPriority)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_config_ShutdownGracePeriodByPodPriority_To_v1beta1_ShutdownGracePeriodByPodPriority(a.(*config.ShutdownGracePeriodByPodPriority), b.(*v1beta1.ShutdownGracePeriodByPodPriority), scope) + }); err != nil { + return err + } return nil } @@ -381,6 +391,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in } out.ShutdownGracePeriod = in.ShutdownGracePeriod out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods + out.ShutdownGracePeriodByPodPriority = *(*[]config.ShutdownGracePeriodByPodPriority)(unsafe.Pointer(&in.ShutdownGracePeriodByPodPriority)) out.ReservedMemory = *(*[]config.MemoryReservation)(unsafe.Pointer(&in.ReservedMemory)) if err := v1.Convert_Pointer_bool_To_bool(&in.EnableProfilingHandler, &out.EnableProfilingHandler, s); err != nil { return err @@ -556,6 +567,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in } out.ShutdownGracePeriod = in.ShutdownGracePeriod out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods + out.ShutdownGracePeriodByPodPriority = *(*[]v1beta1.ShutdownGracePeriodByPodPriority)(unsafe.Pointer(&in.ShutdownGracePeriodByPodPriority)) out.ReservedMemory = *(*[]v1beta1.MemoryReservation)(unsafe.Pointer(&in.ReservedMemory)) if err := v1.Convert_bool_To_Pointer_bool(&in.EnableProfilingHandler, &out.EnableProfilingHandler, s); err != nil { return err @@ -708,3 +720,25 @@ func autoConvert_config_SerializedNodeConfigSource_To_v1beta1_SerializedNodeConf func Convert_config_SerializedNodeConfigSource_To_v1beta1_SerializedNodeConfigSource(in *config.SerializedNodeConfigSource, out *v1beta1.SerializedNodeConfigSource, s conversion.Scope) error { return autoConvert_config_SerializedNodeConfigSource_To_v1beta1_SerializedNodeConfigSource(in, out, s) } + +func autoConvert_v1beta1_ShutdownGracePeriodByPodPriority_To_config_ShutdownGracePeriodByPodPriority(in *v1beta1.ShutdownGracePeriodByPodPriority, out *config.ShutdownGracePeriodByPodPriority, s conversion.Scope) error { + out.Priority = in.Priority + out.ShutdownGracePeriodSeconds = in.ShutdownGracePeriodSeconds + return nil +} + +// Convert_v1beta1_ShutdownGracePeriodByPodPriority_To_config_ShutdownGracePeriodByPodPriority is an autogenerated conversion function. 
+func Convert_v1beta1_ShutdownGracePeriodByPodPriority_To_config_ShutdownGracePeriodByPodPriority(in *v1beta1.ShutdownGracePeriodByPodPriority, out *config.ShutdownGracePeriodByPodPriority, s conversion.Scope) error { + return autoConvert_v1beta1_ShutdownGracePeriodByPodPriority_To_config_ShutdownGracePeriodByPodPriority(in, out, s) +} + +func autoConvert_config_ShutdownGracePeriodByPodPriority_To_v1beta1_ShutdownGracePeriodByPodPriority(in *config.ShutdownGracePeriodByPodPriority, out *v1beta1.ShutdownGracePeriodByPodPriority, s conversion.Scope) error { + out.Priority = in.Priority + out.ShutdownGracePeriodSeconds = in.ShutdownGracePeriodSeconds + return nil +} + +// Convert_config_ShutdownGracePeriodByPodPriority_To_v1beta1_ShutdownGracePeriodByPodPriority is an autogenerated conversion function. +func Convert_config_ShutdownGracePeriodByPodPriority_To_v1beta1_ShutdownGracePeriodByPodPriority(in *config.ShutdownGracePeriodByPodPriority, out *v1beta1.ShutdownGracePeriodByPodPriority, s conversion.Scope) error { + return autoConvert_config_ShutdownGracePeriodByPodPriority_To_v1beta1_ShutdownGracePeriodByPodPriority(in, out, s) +} diff --git a/pkg/kubelet/apis/config/zz_generated.deepcopy.go b/pkg/kubelet/apis/config/zz_generated.deepcopy.go index f483437bf2d..99c3c339335 100644 --- a/pkg/kubelet/apis/config/zz_generated.deepcopy.go +++ b/pkg/kubelet/apis/config/zz_generated.deepcopy.go @@ -283,6 +283,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { in.Logging.DeepCopyInto(&out.Logging) out.ShutdownGracePeriod = in.ShutdownGracePeriod out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods + if in.ShutdownGracePeriodByPodPriority != nil { + in, out := &in.ShutdownGracePeriodByPodPriority, &out.ShutdownGracePeriodByPodPriority + *out = make([]ShutdownGracePeriodByPodPriority, len(*in)) + copy(*out, *in) + } if in.ReservedMemory != nil { in, out := &in.ReservedMemory, &out.ReservedMemory *out = make([]MemoryReservation, len(*in)) @@ -438,3 +443,19 @@ func (in *SerializedNodeConfigSource) DeepCopyObject() runtime.Object { } return nil } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ShutdownGracePeriodByPodPriority) DeepCopyInto(out *ShutdownGracePeriodByPodPriority) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ShutdownGracePeriodByPodPriority. +func (in *ShutdownGracePeriodByPodPriority) DeepCopy() *ShutdownGracePeriodByPodPriority { + if in == nil { + return nil + } + out := new(ShutdownGracePeriodByPodPriority) + in.DeepCopyInto(out) + return out +} diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/types.go b/staging/src/k8s.io/kubelet/config/v1beta1/types.go index 30eb22f7b77..223a88576c6 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/types.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/types.go @@ -989,6 +989,35 @@ type KubeletConfiguration struct { // +featureGate=GracefulNodeShutdown // +optional ShutdownGracePeriodCriticalPods metav1.Duration `json:"shutdownGracePeriodCriticalPods,omitempty"` + // shutdownGracePeriodByPodPriority specifies the shutdown grace period for Pods based + // on their associated priority class value. + // When a shutdown request is received, the Kubelet will initiate shutdown on all pods + // running on the node with a grace period that depends on the priority of the pod, + // and then wait for all pods to exit. 
+ // Each entry in the array represents the graceful shutdown time a pod with a priority + // class value that lies in the range of that value and the next higher entry in the + // list when the node is shutting down. + // For example, to allow critical pods 10s to shutdown, priority>=10000 pods 20s to + // shutdown, and all remaining pods 30s to shutdown. + // + // shutdownGracePeriodByPodPriority: + // - priority: 2000000000 + // shutdownGracePeriodSeconds: 10 + // - priority: 10000 + // shutdownGracePeriodSeconds: 20 + // - priority: 0 + // shutdownGracePeriodSeconds: 30 + // + // The time the Kubelet will wait before exiting will at most be the maximum of all + // shutdownGracePeriodSeconds for each priority class range represented on the node. + // When all pods have exited or reached their grace periods, the Kubelet will release + // the shutdown inhibit lock. + // Requires the GracefulNodeShutdown feature gate to be enabled. + // This configuration must be empty if either ShutdownGracePeriod or ShutdownGracePeriodCriticalPods is set. + // Default: nil + // +featureGate=GracefulNodeShutdownBasedOnPodPriority + // +optional + ShutdownGracePeriodByPodPriority []ShutdownGracePeriodByPodPriority `json:"shutdownGracePeriodByPodPriority,omitempty"` // reservedMemory specifies a comma-separated list of memory reservations for NUMA nodes. // The parameter makes sense only in the context of the memory manager feature. // The memory manager will not allocate reserved memory for container workloads. @@ -1136,6 +1165,14 @@ type MemoryReservation struct { Limits v1.ResourceList `json:"limits"` } +// ShutdownGracePeriodByPodPriority specifies the shutdown grace period for Pods based on their associated priority class value +type ShutdownGracePeriodByPodPriority struct { + // priority is the priority value associated with the shutdown grace period + Priority int32 `json:"priority"` + // shutdownGracePeriodSeconds is the shutdown grace period in seconds + ShutdownGracePeriodSeconds int64 `json:"shutdownGracePeriodSeconds"` +} + type MemorySwapConfiguration struct { // swapBehavior configures swap memory available to container workloads. May be one of // "", "LimitedSwap": workload combined memory and swap usage cannot exceed pod memory limit diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go b/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go index 114cba363f4..777c0588505 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go @@ -318,6 +318,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { } out.ShutdownGracePeriod = in.ShutdownGracePeriod out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods + if in.ShutdownGracePeriodByPodPriority != nil { + in, out := &in.ShutdownGracePeriodByPodPriority, &out.ShutdownGracePeriodByPodPriority + *out = make([]ShutdownGracePeriodByPodPriority, len(*in)) + copy(*out, *in) + } if in.ReservedMemory != nil { in, out := &in.ReservedMemory, &out.ReservedMemory *out = make([]MemoryReservation, len(*in)) @@ -498,3 +503,19 @@ func (in *SerializedNodeConfigSource) DeepCopyObject() runtime.Object { } return nil } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ShutdownGracePeriodByPodPriority) DeepCopyInto(out *ShutdownGracePeriodByPodPriority) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ShutdownGracePeriodByPodPriority. +func (in *ShutdownGracePeriodByPodPriority) DeepCopy() *ShutdownGracePeriodByPodPriority { + if in == nil { + return nil + } + out := new(ShutdownGracePeriodByPodPriority) + in.DeepCopyInto(out) + return out +} From 545313bdc7393cefb9ffcc8fda3a054a20c6a8d4 Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Wed, 27 Oct 2021 11:11:14 +0800 Subject: [PATCH 2/5] Implement graceful shutdown based on Pod priority --- pkg/kubelet/apis/config/helpers_test.go | 2 + pkg/kubelet/kubelet.go | 17 +- .../nodeshutdown/nodeshutdown_manager.go | 20 +- .../nodeshutdown_manager_linux.go | 214 ++++++++++++----- .../nodeshutdown_manager_linux_test.go | 221 ++++++++++++++++-- 5 files changed, 384 insertions(+), 90 deletions(-) diff --git a/pkg/kubelet/apis/config/helpers_test.go b/pkg/kubelet/apis/config/helpers_test.go index 29361abd450..1a7725c984c 100644 --- a/pkg/kubelet/apis/config/helpers_test.go +++ b/pkg/kubelet/apis/config/helpers_test.go @@ -267,6 +267,8 @@ var ( "SeccompDefault", "SerializeImagePulls", "ShowHiddenMetricsForVersion", + "ShutdownGracePeriodByPodPriority[*].Priority", + "ShutdownGracePeriodByPodPriority[*].ShutdownGracePeriodSeconds", "StreamingConnectionIdleTimeout.Duration", "SyncFrequency.Duration", "SystemCgroups", diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index cbb79413977..2013c871a60 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -867,14 +867,15 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, // setup node shutdown manager shutdownManager, shutdownAdmitHandler := nodeshutdown.NewManager(&nodeshutdown.Config{ - ProbeManager: klet.probeManager, - Recorder: kubeDeps.Recorder, - NodeRef: nodeRef, - GetPodsFunc: klet.GetActivePods, - KillPodFunc: killPodNow(klet.podWorkers, kubeDeps.Recorder), - SyncNodeStatusFunc: klet.syncNodeStatus, - ShutdownGracePeriodRequested: kubeCfg.ShutdownGracePeriod.Duration, - ShutdownGracePeriodCriticalPods: kubeCfg.ShutdownGracePeriodCriticalPods.Duration, + ProbeManager: klet.probeManager, + Recorder: kubeDeps.Recorder, + NodeRef: nodeRef, + GetPodsFunc: klet.GetActivePods, + KillPodFunc: killPodNow(klet.podWorkers, kubeDeps.Recorder), + SyncNodeStatusFunc: klet.syncNodeStatus, + ShutdownGracePeriodRequested: kubeCfg.ShutdownGracePeriod.Duration, + ShutdownGracePeriodCriticalPods: kubeCfg.ShutdownGracePeriodCriticalPods.Duration, + ShutdownGracePeriodByPodPriority: kubeCfg.ShutdownGracePeriodByPodPriority, }) klet.shutdownManager = shutdownManager klet.admitHandlers.AddPodAdmitHandler(shutdownAdmitHandler) diff --git a/pkg/kubelet/nodeshutdown/nodeshutdown_manager.go b/pkg/kubelet/nodeshutdown/nodeshutdown_manager.go index 4467ef773b5..4e4cfbfe5e9 100644 --- a/pkg/kubelet/nodeshutdown/nodeshutdown_manager.go +++ b/pkg/kubelet/nodeshutdown/nodeshutdown_manager.go @@ -21,6 +21,7 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/client-go/tools/record" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" "k8s.io/kubernetes/pkg/kubelet/eviction" "k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/prober" @@ -36,15 +37,16 @@ type Manager interface { // Config represents Manager configuration type Config struct { - ProbeManager prober.Manager - Recorder record.EventRecorder - NodeRef *v1.ObjectReference - GetPodsFunc 
eviction.ActivePodsFunc - KillPodFunc eviction.KillPodFunc - SyncNodeStatusFunc func() - ShutdownGracePeriodRequested time.Duration - ShutdownGracePeriodCriticalPods time.Duration - Clock clock.Clock + ProbeManager prober.Manager + Recorder record.EventRecorder + NodeRef *v1.ObjectReference + GetPodsFunc eviction.ActivePodsFunc + KillPodFunc eviction.KillPodFunc + SyncNodeStatusFunc func() + ShutdownGracePeriodRequested time.Duration + ShutdownGracePeriodCriticalPods time.Duration + ShutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority + Clock clock.Clock } // managerStub is a fake node shutdown managerImpl . diff --git a/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go index bc161387f95..bfbb841a1fa 100644 --- a/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go +++ b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go @@ -22,6 +22,7 @@ package nodeshutdown import ( "fmt" + "sort" "sync" "time" @@ -29,13 +30,14 @@ import ( utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" + "k8s.io/kubernetes/pkg/apis/scheduling" "k8s.io/kubernetes/pkg/features" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" kubeletevents "k8s.io/kubernetes/pkg/kubelet/events" "k8s.io/kubernetes/pkg/kubelet/eviction" "k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/nodeshutdown/systemd" "k8s.io/kubernetes/pkg/kubelet/prober" - kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/utils/clock" ) @@ -66,8 +68,7 @@ type managerImpl struct { nodeRef *v1.ObjectReference probeManager prober.Manager - shutdownGracePeriodRequested time.Duration - shutdownGracePeriodCriticalPods time.Duration + shutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority getPods eviction.ActivePodsFunc killPodFunc eviction.KillPodFunc @@ -84,28 +85,46 @@ type managerImpl struct { // NewManager returns a new node shutdown manager. 
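+// If the GracefulNodeShutdownBasedOnPodPriority feature gate is disabled, or no
+// per-priority configuration is provided, the legacy ShutdownGracePeriod and
+// ShutdownGracePeriodCriticalPods settings are migrated into an equivalent
+// two-entry priority table (see migrateConfig below); if the resulting
+// configuration is empty, a no-op stub manager is returned instead.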
func NewManager(conf *Config) (Manager, lifecycle.PodAdmitHandler) { - if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdown) || - (conf.ShutdownGracePeriodRequested == 0 && conf.ShutdownGracePeriodCriticalPods == 0) { + if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdown) { m := managerStub{} return m, m } + + shutdownGracePeriodByPodPriority := conf.ShutdownGracePeriodByPodPriority + // Migration from the original configuration + if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority) || + len(shutdownGracePeriodByPodPriority) == 0 { + shutdownGracePeriodByPodPriority = migrateConfig(conf.ShutdownGracePeriodRequested, conf.ShutdownGracePeriodCriticalPods) + } + + // Disable if the configuration is empty + if len(shutdownGracePeriodByPodPriority) == 0 { + m := managerStub{} + return m, m + } + + // Sort by priority from low to high + sort.Slice(shutdownGracePeriodByPodPriority, func(i, j int) bool { + return shutdownGracePeriodByPodPriority[i].Priority < shutdownGracePeriodByPodPriority[j].Priority + }) + if conf.Clock == nil { conf.Clock = clock.RealClock{} } manager := &managerImpl{ - probeManager: conf.ProbeManager, - recorder: conf.Recorder, - nodeRef: conf.NodeRef, - getPods: conf.GetPodsFunc, - killPodFunc: conf.KillPodFunc, - syncNodeStatus: conf.SyncNodeStatusFunc, - shutdownGracePeriodRequested: conf.ShutdownGracePeriodRequested, - shutdownGracePeriodCriticalPods: conf.ShutdownGracePeriodCriticalPods, - clock: conf.Clock, + probeManager: conf.ProbeManager, + recorder: conf.Recorder, + nodeRef: conf.NodeRef, + getPods: conf.GetPodsFunc, + killPodFunc: conf.KillPodFunc, + syncNodeStatus: conf.SyncNodeStatusFunc, + shutdownGracePeriodByPodPriority: shutdownGracePeriodByPodPriority, + clock: conf.Clock, } klog.InfoS("Creating node shutdown manager", "shutdownGracePeriodRequested", conf.ShutdownGracePeriodRequested, "shutdownGracePeriodCriticalPods", conf.ShutdownGracePeriodCriticalPods, + "shutdownGracePeriodByPodPriority", shutdownGracePeriodByPodPriority, ) return manager, manager } @@ -159,9 +178,9 @@ func (m *managerImpl) start() (chan struct{}, error) { return nil, err } - // If the logind's InhibitDelayMaxUSec as configured in (logind.conf) is less than shutdownGracePeriodRequested, attempt to update the value to shutdownGracePeriodRequested. - if m.shutdownGracePeriodRequested > currentInhibitDelay { - err := m.dbusCon.OverrideInhibitDelay(m.shutdownGracePeriodRequested) + // If the logind's InhibitDelayMaxUSec as configured in (logind.conf) is less than periodRequested, attempt to update the value to periodRequested. 
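+	// periodRequested() is the sum of all configured per-priority grace periods,
+	// i.e. the worst-case total time the kubelet may hold the shutdown inhibit lock
+	// while draining each priority group in turn.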
+ if periodRequested := m.periodRequested(); periodRequested > currentInhibitDelay { + err := m.dbusCon.OverrideInhibitDelay(periodRequested) if err != nil { return nil, fmt.Errorf("unable to override inhibit delay by shutdown manager: %v", err) } @@ -177,8 +196,8 @@ func (m *managerImpl) start() (chan struct{}, error) { return nil, err } - if m.shutdownGracePeriodRequested > updatedInhibitDelay { - return nil, fmt.Errorf("node shutdown manager was unable to update logind InhibitDelayMaxSec to %v (ShutdownGracePeriod), current value of InhibitDelayMaxSec (%v) is less than requested ShutdownGracePeriod", m.shutdownGracePeriodRequested, updatedInhibitDelay) + if periodRequested > updatedInhibitDelay { + return nil, fmt.Errorf("node shutdown manager was unable to update logind InhibitDelayMaxSec to %v (ShutdownGracePeriod), current value of InhibitDelayMaxSec (%v) is less than requested ShutdownGracePeriod", periodRequested, updatedInhibitDelay) } } @@ -270,54 +289,54 @@ func (m *managerImpl) processShutdownEvent() error { klog.V(1).InfoS("Shutdown manager processing shutdown event") activePods := m.getPods() - nonCriticalPodGracePeriod := m.shutdownGracePeriodRequested - m.shutdownGracePeriodCriticalPods + groups := groupByPriority(m.shutdownGracePeriodByPodPriority, activePods) + for _, group := range groups { + // If there are no pods in a particular range, + // then do not wait for pods in that priority range. + if len(group.Pods) == 0 { + continue + } - var wg sync.WaitGroup - wg.Add(len(activePods)) - for _, pod := range activePods { - go func(pod *v1.Pod) { - defer wg.Done() + var wg sync.WaitGroup + wg.Add(len(group.Pods)) + for _, pod := range group.Pods { + go func(pod *v1.Pod, group podShutdownGroup) { + defer wg.Done() - var gracePeriodOverride int64 - if kubelettypes.IsCriticalPod(pod) { - gracePeriodOverride = int64(m.shutdownGracePeriodCriticalPods.Seconds()) - m.clock.Sleep(nonCriticalPodGracePeriod) - } else { - gracePeriodOverride = int64(nonCriticalPodGracePeriod.Seconds()) - } + gracePeriodOverride := group.ShutdownGracePeriodSeconds - // Stop probes for the pod - m.probeManager.RemovePod(pod) + // Stop probes for the pod + m.probeManager.RemovePod(pod) - // If the pod's spec specifies a termination gracePeriod which is less than the gracePeriodOverride calculated, use the pod spec termination gracePeriod. - if pod.Spec.TerminationGracePeriodSeconds != nil && *pod.Spec.TerminationGracePeriodSeconds <= gracePeriodOverride { - gracePeriodOverride = *pod.Spec.TerminationGracePeriodSeconds - } + // If the pod's spec specifies a termination gracePeriod which is less than the gracePeriodOverride calculated, use the pod spec termination gracePeriod. 
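+				// In effect the grace period applied is min(group ShutdownGracePeriodSeconds,
+				// pod terminationGracePeriodSeconds) whenever the pod specifies one.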
+ if pod.Spec.TerminationGracePeriodSeconds != nil && *pod.Spec.TerminationGracePeriodSeconds <= gracePeriodOverride { + gracePeriodOverride = *pod.Spec.TerminationGracePeriodSeconds + } - klog.V(1).InfoS("Shutdown manager killing pod with gracePeriod", "pod", klog.KObj(pod), "gracePeriod", gracePeriodOverride) - if err := m.killPodFunc(pod, false, &gracePeriodOverride, func(status *v1.PodStatus) { - status.Message = nodeShutdownMessage - status.Reason = nodeShutdownReason - }); err != nil { - klog.V(1).InfoS("Shutdown manager failed killing pod", "pod", klog.KObj(pod), "err", err) - } else { - klog.V(1).InfoS("Shutdown manager finished killing pod", "pod", klog.KObj(pod)) - } - }(pod) - } + klog.V(1).InfoS("Shutdown manager killing pod with gracePeriod", "pod", klog.KObj(pod), "gracePeriod", gracePeriodOverride) - c := make(chan struct{}) - go func() { - defer close(c) - wg.Wait() - }() + if err := m.killPodFunc(pod, false, &gracePeriodOverride, func(status *v1.PodStatus) { + status.Message = nodeShutdownMessage + status.Reason = nodeShutdownReason + }); err != nil { + klog.V(1).InfoS("Shutdown manager failed killing pod", "pod", klog.KObj(pod), "err", err) + } else { + klog.V(1).InfoS("Shutdown manager finished killing pod", "pod", klog.KObj(pod)) + } + }(pod, group) + } - // We want to ensure that inhibitLock is released, so only wait up to the shutdownGracePeriodRequested timeout. - select { - case <-c: - break - case <-time.After(m.shutdownGracePeriodRequested): - klog.V(1).InfoS("Shutdown manager pod killing time out", "gracePeriod", m.shutdownGracePeriodRequested) + c := make(chan struct{}) + go func() { + defer close(c) + wg.Wait() + }() + + select { + case <-c: + case <-time.After(time.Duration(group.ShutdownGracePeriodSeconds) * time.Second): + klog.V(1).InfoS("Shutdown manager pod killing time out", "gracePeriod", group.ShutdownGracePeriodSeconds, "priority", group.Priority) + } } m.dbusCon.ReleaseInhibitLock(m.inhibitLock) @@ -325,3 +344,78 @@ func (m *managerImpl) processShutdownEvent() error { return nil } + +func (m *managerImpl) periodRequested() time.Duration { + var sum int64 + for _, period := range m.shutdownGracePeriodByPodPriority { + sum += period.ShutdownGracePeriodSeconds + } + return time.Duration(sum) * time.Second +} + +func migrateConfig(shutdownGracePeriodRequested, shutdownGracePeriodCriticalPods time.Duration) []kubeletconfig.ShutdownGracePeriodByPodPriority { + if shutdownGracePeriodRequested == 0 { + return nil + } + defaultPriority := shutdownGracePeriodRequested - shutdownGracePeriodCriticalPods + if defaultPriority < 0 { + return nil + } + criticalPriority := shutdownGracePeriodRequested - defaultPriority + if criticalPriority < 0 { + return nil + } + return []kubeletconfig.ShutdownGracePeriodByPodPriority{ + { + Priority: scheduling.DefaultPriorityWhenNoDefaultClassExists, + ShutdownGracePeriodSeconds: int64(defaultPriority / time.Second), + }, + { + Priority: scheduling.SystemCriticalPriority, + ShutdownGracePeriodSeconds: int64(criticalPriority / time.Second), + }, + } +} + +func groupByPriority(shutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority, pods []*v1.Pod) []podShutdownGroup { + groups := make([]podShutdownGroup, 0, len(shutdownGracePeriodByPodPriority)) + for _, period := range shutdownGracePeriodByPodPriority { + groups = append(groups, podShutdownGroup{ + ShutdownGracePeriodByPodPriority: period, + }) + } + + for _, pod := range pods { + var priority int32 + if pod.Spec.Priority != nil { + priority = 
*pod.Spec.Priority + } + + // Find the group index according to the priority. + index := sort.Search(len(groups), func(i int) bool { + return groups[i].Priority >= priority + }) + + // 1. Those higher than the highest priority default to the highest priority + // 2. Those lower than the lowest priority default to the lowest priority + // 3. Those boundary priority default to the lower priority + // if priority of pod is: + // groups[index-1].Priority <= pod priority < groups[index].Priority + // in which case we want to pick lower one (i.e index-1) + if index == len(groups) { + index = len(groups) - 1 + } else if index < 0 { + index = 0 + } else if index > 0 && groups[index].Priority > priority { + index-- + } + + groups[index].Pods = append(groups[index].Pods, pod) + } + return groups +} + +type podShutdownGroup struct { + kubeletconfig.ShutdownGracePeriodByPodPriority + Pods []*v1.Pod +} diff --git a/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go index dc0fd9bf10f..028364c2367 100644 --- a/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go +++ b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go @@ -35,6 +35,7 @@ import ( featuregatetesting "k8s.io/component-base/featuregate/testing" "k8s.io/kubernetes/pkg/apis/scheduling" pkgfeatures "k8s.io/kubernetes/pkg/features" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" "k8s.io/kubernetes/pkg/kubelet/nodeshutdown/systemd" probetest "k8s.io/kubernetes/pkg/kubelet/prober/testing" testingclock "k8s.io/utils/clock/testing" @@ -81,12 +82,7 @@ func (f *fakeDbus) OverrideInhibitDelay(inhibitDelayMax time.Duration) error { return nil } -func makePod(name string, criticalPod bool, terminationGracePeriod *int64) *v1.Pod { - var priority int32 - if criticalPod { - priority = scheduling.SystemCriticalPriority - } - +func makePod(name string, priority int32, terminationGracePeriod *int64) *v1.Pod { return &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: name, @@ -104,15 +100,15 @@ func TestManager(t *testing.T) { defer func() { systemDbus = systemDbusTmp }() - normalPodNoGracePeriod := makePod("normal-pod-nil-grace-period", false /* criticalPod */, nil /* terminationGracePeriod */) - criticalPodNoGracePeriod := makePod("critical-pod-nil-grace-period", true /* criticalPod */, nil /* terminationGracePeriod */) + normalPodNoGracePeriod := makePod("normal-pod-nil-grace-period", scheduling.DefaultPriorityWhenNoDefaultClassExists, nil /* terminationGracePeriod */) + criticalPodNoGracePeriod := makePod("critical-pod-nil-grace-period", scheduling.SystemCriticalPriority, nil /* terminationGracePeriod */) shortGracePeriod := int64(2) - normalPodGracePeriod := makePod("normal-pod-grace-period", false /* criticalPod */, &shortGracePeriod /* terminationGracePeriod */) - criticalPodGracePeriod := makePod("critical-pod-grace-period", true /* criticalPod */, &shortGracePeriod /* terminationGracePeriod */) + normalPodGracePeriod := makePod("normal-pod-grace-period", scheduling.DefaultPriorityWhenNoDefaultClassExists, &shortGracePeriod /* terminationGracePeriod */) + criticalPodGracePeriod := makePod("critical-pod-grace-period", scheduling.SystemCriticalPriority, &shortGracePeriod /* terminationGracePeriod */) longGracePeriod := int64(1000) - normalPodLongGracePeriod := makePod("normal-pod-long-grace-period", false /* criticalPod */, &longGracePeriod /* terminationGracePeriod */) + normalPodLongGracePeriod := makePod("normal-pod-long-grace-period", 
scheduling.DefaultPriorityWhenNoDefaultClassExists, &longGracePeriod /* terminationGracePeriod */) var tests = []struct { desc string @@ -256,7 +252,9 @@ func TestManager(t *testing.T) { lock.Unlock() if tc.expectedError != nil { - if !strings.Contains(err.Error(), tc.expectedError.Error()) { + if err == nil { + t.Errorf("unexpected error message. Got: want %s", tc.expectedError.Error()) + } else if !strings.Contains(err.Error(), tc.expectedError.Error()) { t.Errorf("unexpected error message. Got: %s want %s", err.Error(), tc.expectedError.Error()) } } else { @@ -266,7 +264,11 @@ func TestManager(t *testing.T) { assert.Equal(t, manager.Admit(nil).Admit, true) // Send fake shutdown event - fakeShutdownChan <- true + select { + case fakeShutdownChan <- true: + case <-time.After(1 * time.Second): + t.Fatal() + } // Wait for all the pods to be killed killedPodsToGracePeriods := map[string]int64{} @@ -413,3 +415,196 @@ func TestRestart(t *testing.T) { shutdownChanMut.Unlock() } } + +func Test_migrateConfig(t *testing.T) { + type shutdownConfig struct { + shutdownGracePeriodRequested time.Duration + shutdownGracePeriodCriticalPods time.Duration + } + tests := []struct { + name string + args shutdownConfig + want []kubeletconfig.ShutdownGracePeriodByPodPriority + }{ + { + name: "both shutdownGracePeriodRequested and shutdownGracePeriodCriticalPods", + args: shutdownConfig{ + shutdownGracePeriodRequested: 300 * time.Second, + shutdownGracePeriodCriticalPods: 120 * time.Second, + }, + want: []kubeletconfig.ShutdownGracePeriodByPodPriority{ + { + Priority: scheduling.DefaultPriorityWhenNoDefaultClassExists, + ShutdownGracePeriodSeconds: 180, + }, + { + Priority: scheduling.SystemCriticalPriority, + ShutdownGracePeriodSeconds: 120, + }, + }, + }, + { + name: "only shutdownGracePeriodRequested", + args: shutdownConfig{ + shutdownGracePeriodRequested: 100 * time.Second, + shutdownGracePeriodCriticalPods: 0 * time.Second, + }, + want: []kubeletconfig.ShutdownGracePeriodByPodPriority{ + { + Priority: scheduling.DefaultPriorityWhenNoDefaultClassExists, + ShutdownGracePeriodSeconds: 100, + }, + { + Priority: scheduling.SystemCriticalPriority, + ShutdownGracePeriodSeconds: 0, + }, + }, + }, + { + name: "empty configuration", + args: shutdownConfig{ + shutdownGracePeriodRequested: 0 * time.Second, + shutdownGracePeriodCriticalPods: 0 * time.Second, + }, + want: nil, + }, + { + name: "wrong configuration", + args: shutdownConfig{ + shutdownGracePeriodRequested: 1 * time.Second, + shutdownGracePeriodCriticalPods: 100 * time.Second, + }, + want: nil, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := migrateConfig(tt.args.shutdownGracePeriodRequested, tt.args.shutdownGracePeriodCriticalPods); !assert.Equal(t, tt.want, got) { + t.Errorf("migrateConfig() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_groupByPriority(t *testing.T) { + type args struct { + shutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority + pods []*v1.Pod + } + tests := []struct { + name string + args args + want []podShutdownGroup + }{ + { + name: "migrate config", + args: args{ + shutdownGracePeriodByPodPriority: migrateConfig(300*time.Second /* shutdownGracePeriodRequested */, 120*time.Second /* shutdownGracePeriodCriticalPods */), + pods: []*v1.Pod{ + makePod("normal-pod", scheduling.DefaultPriorityWhenNoDefaultClassExists, nil), + makePod("highest-user-definable-pod", scheduling.HighestUserDefinablePriority, nil), + makePod("critical-pod", 
scheduling.SystemCriticalPriority, nil), + }, + }, + want: []podShutdownGroup{ + { + ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{ + Priority: scheduling.DefaultPriorityWhenNoDefaultClassExists, + ShutdownGracePeriodSeconds: 180, + }, + Pods: []*v1.Pod{ + makePod("normal-pod", scheduling.DefaultPriorityWhenNoDefaultClassExists, nil), + makePod("highest-user-definable-pod", scheduling.HighestUserDefinablePriority, nil), + }, + }, + { + ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{ + Priority: scheduling.SystemCriticalPriority, + ShutdownGracePeriodSeconds: 120, + }, + Pods: []*v1.Pod{ + makePod("critical-pod", scheduling.SystemCriticalPriority, nil), + }, + }, + }, + }, + { + name: "pod priority", + args: args{ + shutdownGracePeriodByPodPriority: []kubeletconfig.ShutdownGracePeriodByPodPriority{ + { + Priority: 1, + ShutdownGracePeriodSeconds: 10, + }, + { + Priority: 2, + ShutdownGracePeriodSeconds: 20, + }, + { + Priority: 3, + ShutdownGracePeriodSeconds: 30, + }, + { + Priority: 4, + ShutdownGracePeriodSeconds: 40, + }, + }, + pods: []*v1.Pod{ + makePod("pod-0", 0, nil), + makePod("pod-1", 1, nil), + makePod("pod-2", 2, nil), + makePod("pod-3", 3, nil), + makePod("pod-4", 4, nil), + makePod("pod-5", 5, nil), + }, + }, + want: []podShutdownGroup{ + { + ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{ + Priority: 1, + ShutdownGracePeriodSeconds: 10, + }, + Pods: []*v1.Pod{ + makePod("pod-0", 0, nil), + makePod("pod-1", 1, nil), + }, + }, + { + ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{ + Priority: 2, + ShutdownGracePeriodSeconds: 20, + }, + Pods: []*v1.Pod{ + makePod("pod-2", 2, nil), + }, + }, + { + ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{ + Priority: 3, + ShutdownGracePeriodSeconds: 30, + }, + Pods: []*v1.Pod{ + makePod("pod-3", 3, nil), + }, + }, + { + ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{ + Priority: 4, + ShutdownGracePeriodSeconds: 40, + }, + Pods: []*v1.Pod{ + makePod("pod-4", 4, nil), + makePod("pod-5", 5, nil), + }, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := groupByPriority(tt.args.shutdownGracePeriodByPodPriority, tt.args.pods); !assert.Equal(t, tt.want, got) { + t.Errorf("groupByPriority() = %v, want %v", got, tt.want) + } + }) + } +} From 7a6f792ff39959d1c00d57dbf969901a7d323df9 Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Thu, 11 Nov 2021 14:19:02 +0800 Subject: [PATCH 3/5] Add validation for GracefulNodeShutdownBasedOnPodPriority Co-authored-by: Elana Hashman --- .../apis/config/validation/validation.go | 10 ++++++ .../apis/config/validation/validation_test.go | 34 +++++++++++++------ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/pkg/kubelet/apis/config/validation/validation.go b/pkg/kubelet/apis/config/validation/validation.go index 585e6639af5..7a89f87ddd7 100644 --- a/pkg/kubelet/apis/config/validation/validation.go +++ b/pkg/kubelet/apis/config/validation/validation.go @@ -166,6 +166,16 @@ func ValidateKubeletConfiguration(kc *kubeletconfig.KubeletConfiguration) error if (kc.ShutdownGracePeriod.Duration > 0 || kc.ShutdownGracePeriodCriticalPods.Duration > 0) && !localFeatureGate.Enabled(features.GracefulNodeShutdown) { allErrors = append(allErrors, fmt.Errorf("invalid configuration: Specifying ShutdownGracePeriod or ShutdownGracePeriodCriticalPods requires feature gate 
GracefulNodeShutdown")) } + if localFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority) { + if len(kc.ShutdownGracePeriodByPodPriority) != 0 && (kc.ShutdownGracePeriod.Duration > 0 || kc.ShutdownGracePeriodCriticalPods.Duration > 0) { + allErrors = append(allErrors, fmt.Errorf("invalid configuration: Cannot specify both shutdownGracePeriodByPodPriority and shutdownGracePeriod at the same time")) + } + } + if !localFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority) { + if len(kc.ShutdownGracePeriodByPodPriority) != 0 { + allErrors = append(allErrors, fmt.Errorf("invalid configuration: Specifying shutdownGracePeriodByPodPriority requires feature gate GracefulNodeShutdownBasedOnPodPriority")) + } + } if localFeatureGate.Enabled(features.NodeSwap) { if kc.MemorySwap.SwapBehavior != "" && kc.MemorySwap.SwapBehavior != kubetypes.LimitedSwap && kc.MemorySwap.SwapBehavior != kubetypes.UnlimitedSwap { allErrors = append(allErrors, fmt.Errorf("invalid configuration: MemorySwap.SwapBehavior %v must be one of: LimitedSwap, UnlimitedSwap", kc.MemorySwap.SwapBehavior)) diff --git a/pkg/kubelet/apis/config/validation/validation_test.go b/pkg/kubelet/apis/config/validation/validation_test.go index eb346dc6a98..2128cbbe423 100644 --- a/pkg/kubelet/apis/config/validation/validation_test.go +++ b/pkg/kubelet/apis/config/validation/validation_test.go @@ -108,8 +108,9 @@ func TestValidateKubeletConfiguration(t *testing.T) { ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0}, MemoryThrottlingFactor: utilpointer.Float64Ptr(0.9), FeatureGates: map[string]bool{ - "CustomCPUCFSQuotaPeriod": true, - "MemoryQoS": true, + "CustomCPUCFSQuotaPeriod": true, + "MemoryQoS": true, + "GracefulNodeShutdownBasedOnPodPriority": true, }, Logging: componentbaseconfig.LoggingConfiguration{ Format: "text", @@ -149,15 +150,22 @@ func TestValidateKubeletConfiguration(t *testing.T) { ReservedSystemCPUs: "0-3", TopologyManagerScope: kubeletconfig.ContainerTopologyManagerScope, TopologyManagerPolicy: kubeletconfig.NoneTopologyManagerPolicy, - ShutdownGracePeriod: metav1.Duration{Duration: 10 * time.Minute}, + ShutdownGracePeriod: metav1.Duration{Duration: 0}, ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0}, - MemorySwap: kubeletconfig.MemorySwapConfiguration{SwapBehavior: kubetypes.UnlimitedSwap}, - MemoryThrottlingFactor: utilpointer.Float64Ptr(0.5), + ShutdownGracePeriodByPodPriority: []kubeletconfig.ShutdownGracePeriodByPodPriority{ + { + Priority: 0, + ShutdownGracePeriodSeconds: 10, + }, + }, + MemorySwap: kubeletconfig.MemorySwapConfiguration{SwapBehavior: kubetypes.UnlimitedSwap}, + MemoryThrottlingFactor: utilpointer.Float64Ptr(0.5), FeatureGates: map[string]bool{ - "CustomCPUCFSQuotaPeriod": true, - "GracefulNodeShutdown": true, - "NodeSwap": true, - "MemoryQoS": true, + "CustomCPUCFSQuotaPeriod": true, + "GracefulNodeShutdown": true, + "GracefulNodeShutdownBasedOnPodPriority": true, + "NodeSwap": true, + "MemoryQoS": true, }, Logging: componentbaseconfig.LoggingConfiguration{ Format: "text", @@ -194,12 +202,18 @@ func TestValidateKubeletConfiguration(t *testing.T) { CPUCFSQuotaPeriod: metav1.Duration{Duration: 100 * time.Millisecond}, ShutdownGracePeriod: metav1.Duration{Duration: 30 * time.Second}, ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 60 * time.Second}, + ShutdownGracePeriodByPodPriority: []kubeletconfig.ShutdownGracePeriodByPodPriority{ + { + Priority: 0, + ShutdownGracePeriodSeconds: 10, + }, + }, Logging: 
componentbaseconfig.LoggingConfiguration{ Format: "", }, MemorySwap: kubeletconfig.MemorySwapConfiguration{SwapBehavior: kubetypes.UnlimitedSwap}, } - const numErrsErrorCase1 = 30 + const numErrsErrorCase1 = 31 if allErrors := ValidateKubeletConfiguration(errorCase1); len(allErrors.(utilerrors.Aggregate).Errors()) != numErrsErrorCase1 { t.Errorf("expect %d errors, got %v", numErrsErrorCase1, len(allErrors.(utilerrors.Aggregate).Errors())) } From df7e4c1a3d7916f37c98265d27a307b4a91fbb28 Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Fri, 12 Nov 2021 16:49:33 +0800 Subject: [PATCH 4/5] Add e2e for GracefulNodeShutdownBasedOnPodPriority --- test/e2e_node/node_shutdown_linux_test.go | 209 ++++++++++++++++++++-- 1 file changed, 198 insertions(+), 11 deletions(-) diff --git a/test/e2e_node/node_shutdown_linux_test.go b/test/e2e_node/node_shutdown_linux_test.go index 558b0cb782a..ff616aeb949 100644 --- a/test/e2e_node/node_shutdown_linux_test.go +++ b/test/e2e_node/node_shutdown_linux_test.go @@ -35,13 +35,15 @@ import ( "k8s.io/kubernetes/test/e2e/framework" v1 "k8s.io/api/core/v1" + schedulingv1 "k8s.io/api/scheduling/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/kubernetes/pkg/features" kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" testutils "k8s.io/kubernetes/test/utils" ) -var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShutdown]", func() { +var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShutdown] [NodeFeature:GracefulNodeShutdownBasedOnPodPriority]", func() { f := framework.NewDefaultFramework("graceful-node-shutdown") ginkgo.Context("when gracefully shutting down", func() { @@ -54,6 +56,10 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut ) tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) { + initialConfig.FeatureGates = map[string]bool{ + string(features.GracefulNodeShutdown): true, + string(features.GracefulNodeShutdownBasedOnPodPriority): false, + } initialConfig.ShutdownGracePeriod = metav1.Duration{Duration: nodeShutdownGracePeriod} initialConfig.ShutdownGracePeriodCriticalPods = metav1.Duration{Duration: nodeShutdownGracePeriodCriticalPods} }) @@ -77,10 +83,10 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut // Define test pods pods := []*v1.Pod{ - getGracePeriodOverrideTestPod("period-120", nodeName, 120, false), - getGracePeriodOverrideTestPod("period-5", nodeName, 5, false), - getGracePeriodOverrideTestPod("period-critical-120", nodeName, 120, true), - getGracePeriodOverrideTestPod("period-critical-5", nodeName, 5, true), + getGracePeriodOverrideTestPod("period-120", nodeName, 120, ""), + getGracePeriodOverrideTestPod("period-5", nodeName, 5, ""), + getGracePeriodOverrideTestPod("period-critical-120", nodeName, 120, scheduling.SystemNodeCritical), + getGracePeriodOverrideTestPod("period-critical-5", nodeName, 5, scheduling.SystemNodeCritical), } ginkgo.By("Creating batch pods") @@ -117,12 +123,12 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut for _, pod := range list.Items { if kubelettypes.IsCriticalPod(&pod) { if isPodShutdown(&pod) { - framework.Logf("Expecting critcal pod to be running, but it's not currently. Pod: %q, Pod Status %+v", pod.Name, pod.Status) + framework.Logf("Expecting critical pod to be running, but it's not currently. 
Pod: %q, Pod Status %+v", pod.Name, pod.Status) return fmt.Errorf("critical pod should not be shutdown, phase: %s", pod.Status.Phase) } } else { if !isPodShutdown(&pod) { - framework.Logf("Expecting non-critcal pod to be shutdown, but it's not currently. Pod: %q, Pod Status %+v", pod.Name, pod.Status) + framework.Logf("Expecting non-critical pod to be shutdown, but it's not currently. Pod: %q, Pod Status %+v", pod.Name, pod.Status) return fmt.Errorf("pod should be shutdown, phase: %s", pod.Status.Phase) } } @@ -207,9 +213,190 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut }, nodeStatusUpdateTimeout, pollInterval).Should(gomega.BeNil()) }) }) + + ginkgo.Context("when gracefully shutting down with Pod priority", func() { + + const ( + pollInterval = 1 * time.Second + podStatusUpdateTimeout = 10 * time.Second + ) + + var ( + customClassA = getPriorityClass("custom-class-a", 100000) + customClassB = getPriorityClass("custom-class-b", 10000) + customClassC = getPriorityClass("custom-class-c", 1000) + ) + + tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) { + initialConfig.FeatureGates = map[string]bool{ + string(features.GracefulNodeShutdown): true, + string(features.GracefulNodeShutdownBasedOnPodPriority): true, + } + initialConfig.ShutdownGracePeriodByPodPriority = []kubeletconfig.ShutdownGracePeriodByPodPriority{ + { + Priority: scheduling.SystemCriticalPriority, + ShutdownGracePeriodSeconds: int64(podStatusUpdateTimeout / time.Second), + }, + { + Priority: customClassA.Value, + ShutdownGracePeriodSeconds: int64(podStatusUpdateTimeout / time.Second), + }, + { + Priority: customClassB.Value, + ShutdownGracePeriodSeconds: int64(podStatusUpdateTimeout / time.Second), + }, + { + Priority: customClassC.Value, + ShutdownGracePeriodSeconds: int64(podStatusUpdateTimeout / time.Second), + }, + { + Priority: scheduling.DefaultPriorityWhenNoDefaultClassExists, + ShutdownGracePeriodSeconds: int64(podStatusUpdateTimeout / time.Second), + }, + } + + }) + + ginkgo.BeforeEach(func() { + ginkgo.By("Wait for the node to be ready") + waitForNodeReady() + + for _, customClass := range []*schedulingv1.PriorityClass{customClassA, customClassB, customClassC} { + _, err := f.ClientSet.SchedulingV1().PriorityClasses().Create(context.Background(), customClass, metav1.CreateOptions{}) + framework.ExpectNoError(err) + } + }) + + ginkgo.AfterEach(func() { + ginkgo.By("Emitting Shutdown false signal; cancelling the shutdown") + err := emitSignalPrepareForShutdown(false) + framework.ExpectNoError(err) + }) + + ginkgo.It("should be able to gracefully shutdown pods with various grace periods", func() { + nodeName := getNodeName(f) + nodeSelector := fields.Set{ + "spec.nodeName": nodeName, + }.AsSelector().String() + + // Define test pods + pods := []*v1.Pod{ + getGracePeriodOverrideTestPod("period-120", nodeName, 120, ""), + getGracePeriodOverrideTestPod("period-5", nodeName, 5, ""), + getGracePeriodOverrideTestPod("period-c-120", nodeName, 120, customClassC.Name), + getGracePeriodOverrideTestPod("period-c-5", nodeName, 5, customClassC.Name), + getGracePeriodOverrideTestPod("period-b-120", nodeName, 120, customClassB.Name), + getGracePeriodOverrideTestPod("period-b-5", nodeName, 5, customClassB.Name), + getGracePeriodOverrideTestPod("period-a-120", nodeName, 120, customClassA.Name), + getGracePeriodOverrideTestPod("period-a-5", nodeName, 5, customClassA.Name), + getGracePeriodOverrideTestPod("period-critical-120", nodeName, 120, 
scheduling.SystemNodeCritical), + getGracePeriodOverrideTestPod("period-critical-5", nodeName, 5, scheduling.SystemNodeCritical), + } + + // Expected down steps + downSteps := [][]string{ + { + "period-5", "period-120", + }, + { + "period-5", "period-120", + "period-c-5", "period-c-120", + }, + { + "period-5", "period-120", + "period-c-5", "period-c-120", + "period-b-5", "period-b-120", + }, + { + "period-5", "period-120", + "period-c-5", "period-c-120", + "period-b-5", "period-b-120", + "period-a-5", "period-a-120", + }, + { + "period-5", "period-120", + "period-c-5", "period-c-120", + "period-b-5", "period-b-120", + "period-a-5", "period-a-120", + "period-critical-5", "period-critical-120", + }, + } + + ginkgo.By("Creating batch pods") + f.PodClient().CreateBatch(pods) + + list, err := f.PodClient().List(context.TODO(), metav1.ListOptions{ + FieldSelector: nodeSelector, + }) + framework.ExpectNoError(err) + framework.ExpectEqual(len(list.Items), len(pods), "the number of pods is not as expected") + + ginkgo.By("Verifying batch pods are running") + for _, pod := range list.Items { + if podReady, err := testutils.PodRunningReady(&pod); err != nil || !podReady { + framework.Failf("Failed to start batch pod: %v", pod.Name) + } + } + + ginkgo.By("Emitting shutdown signal") + err = emitSignalPrepareForShutdown(true) + framework.ExpectNoError(err) + + ginkgo.By("Verifying that pods are shutdown") + + for _, step := range downSteps { + gomega.Eventually(func() error { + list, err = f.PodClient().List(context.TODO(), metav1.ListOptions{ + FieldSelector: nodeSelector, + }) + if err != nil { + return err + } + framework.ExpectEqual(len(list.Items), len(pods), "the number of pods is not as expected") + for _, pod := range list.Items { + shouldShutdown := false + for _, podName := range step { + if podName == pod.Name { + shouldShutdown = true + break + } + } + if !shouldShutdown { + if pod.Status.Phase != v1.PodRunning { + framework.Logf("Expecting pod to be running, but it's not currently. Pod: %q, Pod Status Phase: %q, Pod Status Reason: %q", pod.Name, pod.Status.Phase, pod.Status.Reason) + return fmt.Errorf("pod should not be shutdown, phase: %s, reason: %s", pod.Status.Phase, pod.Status.Reason) + } + } else { + if pod.Status.Reason != podShutdownReason { + framework.Logf("Expecting pod to be shutdown, but it's not currently. 
Pod: %q, Pod Status Phase: %q, Pod Status Reason: %q", pod.Name, pod.Status.Phase, pod.Status.Reason) + for _, item := range list.Items { + framework.Logf("DEBUG %s, %s, %s", item.Name, item.Status.Phase, pod.Status.Reason) + } + return fmt.Errorf("pod should be shutdown, reason: %s", pod.Status.Reason) + } + } + } + return nil + }, podStatusUpdateTimeout, pollInterval).Should(gomega.BeNil()) + } + }) + }) }) -func getGracePeriodOverrideTestPod(name string, node string, gracePeriod int64, critical bool) *v1.Pod { +func getPriorityClass(name string, value int32) *schedulingv1.PriorityClass { + priority := &schedulingv1.PriorityClass{ + TypeMeta: metav1.TypeMeta{ + Kind: "PriorityClass", + APIVersion: "scheduling.k8s.io/v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Value: value, + } + return priority +} +func getGracePeriodOverrideTestPod(name string, node string, gracePeriod int64, priorityClassName string) *v1.Pod { pod := &v1.Pod{ TypeMeta: metav1.TypeMeta{ Kind: "Pod", @@ -238,14 +425,14 @@ while true; do sleep 5; done NodeName: node, }, } - if critical { + if priorityClassName == scheduling.SystemNodeCritical { pod.ObjectMeta.Annotations = map[string]string{ kubelettypes.ConfigSourceAnnotationKey: kubelettypes.FileSource, } - pod.Spec.PriorityClassName = scheduling.SystemNodeCritical - + pod.Spec.PriorityClassName = priorityClassName framework.ExpectEqual(kubelettypes.IsCriticalPod(pod), true, "pod should be a critical pod") } else { + pod.Spec.PriorityClassName = priorityClassName framework.ExpectEqual(kubelettypes.IsCriticalPod(pod), false, "pod should not be a critical pod") } return pod From 7c656b55e49078918bebb7b58d8b9463c7f79fa3 Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Fri, 12 Nov 2021 16:50:18 +0800 Subject: [PATCH 5/5] Update shutdown cases --- test/e2e_node/node_shutdown_linux_test.go | 35 ++++++++++------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/test/e2e_node/node_shutdown_linux_test.go b/test/e2e_node/node_shutdown_linux_test.go index ff616aeb949..eca0e681513 100644 --- a/test/e2e_node/node_shutdown_linux_test.go +++ b/test/e2e_node/node_shutdown_linux_test.go @@ -281,44 +281,39 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut // Define test pods pods := []*v1.Pod{ - getGracePeriodOverrideTestPod("period-120", nodeName, 120, ""), getGracePeriodOverrideTestPod("period-5", nodeName, 5, ""), - getGracePeriodOverrideTestPod("period-c-120", nodeName, 120, customClassC.Name), getGracePeriodOverrideTestPod("period-c-5", nodeName, 5, customClassC.Name), - getGracePeriodOverrideTestPod("period-b-120", nodeName, 120, customClassB.Name), getGracePeriodOverrideTestPod("period-b-5", nodeName, 5, customClassB.Name), - getGracePeriodOverrideTestPod("period-a-120", nodeName, 120, customClassA.Name), getGracePeriodOverrideTestPod("period-a-5", nodeName, 5, customClassA.Name), - getGracePeriodOverrideTestPod("period-critical-120", nodeName, 120, scheduling.SystemNodeCritical), getGracePeriodOverrideTestPod("period-critical-5", nodeName, 5, scheduling.SystemNodeCritical), } // Expected down steps downSteps := [][]string{ { - "period-5", "period-120", + "period-5", }, { - "period-5", "period-120", - "period-c-5", "period-c-120", + "period-5", + "period-c-5", }, { - "period-5", "period-120", - "period-c-5", "period-c-120", - "period-b-5", "period-b-120", + "period-5", + "period-c-5", + "period-b-5", }, { - "period-5", "period-120", - "period-c-5", "period-c-120", - "period-b-5", "period-b-120", - 
"period-a-5", "period-a-120", + "period-5", + "period-c-5", + "period-b-5", + "period-a-5", }, { - "period-5", "period-120", - "period-c-5", "period-c-120", - "period-b-5", "period-b-120", - "period-a-5", "period-a-120", - "period-critical-5", "period-critical-120", + "period-5", + "period-c-5", + "period-b-5", + "period-a-5", + "period-critical-5", }, }