Merge pull request #102915 from wzshiming/feat/graceful-shutdown-based-on-pod-priority

Graceful Node Shutdown Based On Pod Priority

Commit 91b7fb4dc9
@@ -381,6 +381,7 @@ API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfi
 API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,EnforceNodeAllocatable
 API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,RegisterWithTaints
 API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,ReservedMemory
+API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,ShutdownGracePeriodByPodPriority
 API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,TLSCipherSuites
 API rule violation: list_type_missing,k8s.io/metrics/pkg/apis/metrics/v1alpha1,PodMetrics,Containers
 API rule violation: list_type_missing,k8s.io/metrics/pkg/apis/metrics/v1beta1,PodMetrics,Containers
@@ -585,6 +585,12 @@ const (
     // Adds support for kubelet to detect node shutdown and gracefully terminate pods prior to the node being shutdown.
     GracefulNodeShutdown featuregate.Feature = "GracefulNodeShutdown"

+    // owner: @wzshiming
+    // alpha: v1.23
+    //
+    // Make the kubelet use shutdown configuration based on pod priority values for graceful shutdown.
+    GracefulNodeShutdownBasedOnPodPriority featuregate.Feature = "GracefulNodeShutdownBasedOnPodPriority"
+
     // owner: @andrewsykim @uablrek
     // kep: http://kep.k8s.io/1864
     // alpha: v1.20

@@ -927,6 +933,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
     ExecProbeTimeout:           {Default: true, PreRelease: featuregate.GA}, // lock to default and remove after v1.22 based on KEP #1972 update
     KubeletCredentialProviders: {Default: false, PreRelease: featuregate.Alpha},
     GracefulNodeShutdown:       {Default: true, PreRelease: featuregate.Beta},
+    GracefulNodeShutdownBasedOnPodPriority: {Default: false, PreRelease: featuregate.Alpha},
     ServiceLBNodePortControl:   {Default: true, PreRelease: featuregate.Beta},
     MixedProtocolLBService:     {Default: false, PreRelease: featuregate.Alpha},
     VolumeCapacityPriority:     {Default: false, PreRelease: featuregate.Alpha},
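For orientation: a gate does nothing until code consults it. A minimal sketch of the check that guards the new behavior (the helper name is hypothetical; the Enabled calls are the same ones NewManager makes later in this PR):

import (
    utilfeature "k8s.io/apiserver/pkg/util/feature"
    "k8s.io/kubernetes/pkg/features"
)

// priorityBasedShutdownEnabled is a hypothetical helper: the priority table
// only takes effect when the base gate is on (so the manager exists at all)
// and the new gate is on (so the per-priority path is selected).
func priorityBasedShutdownEnabled() bool {
    return utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdown) &&
        utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority)
}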
@@ -267,6 +267,8 @@ var (
     "SeccompDefault",
     "SerializeImagePulls",
     "ShowHiddenMetricsForVersion",
+    "ShutdownGracePeriodByPodPriority[*].Priority",
+    "ShutdownGracePeriodByPodPriority[*].ShutdownGracePeriodSeconds",
     "StreamingConnectionIdleTimeout.Duration",
     "SyncFrequency.Duration",
     "SystemCgroups",
@@ -397,6 +397,15 @@ type KubeletConfiguration struct {
     // +featureGate=GracefulNodeShutdown
     // +optional
     ShutdownGracePeriodCriticalPods metav1.Duration
+    // ShutdownGracePeriodByPodPriority specifies the shutdown grace period for Pods based
+    // on their associated priority class value.
+    // When a shutdown request is received, the Kubelet will initiate shutdown on all pods
+    // running on the node with a grace period that depends on the priority of the pod,
+    // and then wait for all pods to exit.
+    // Each entry in the array represents the graceful shutdown time a pod with a priority
+    // class value that lies in the range of that value and the next higher entry in the
+    // list when the node is shutting down.
+    ShutdownGracePeriodByPodPriority []ShutdownGracePeriodByPodPriority
     // ReservedMemory specifies a comma-separated list of memory reservations for NUMA nodes.
     // The parameter makes sense only in the context of the memory manager feature. The memory manager will not allocate reserved memory for container workloads.
     // For example, if you have a NUMA0 with 10Gi of memory and the ReservedMemory was specified to reserve 1Gi of memory at NUMA0,
@@ -595,6 +604,14 @@ type MemoryReservation struct {
     Limits v1.ResourceList
 }

+// ShutdownGracePeriodByPodPriority specifies the shutdown grace period for Pods based on their associated priority class value
+type ShutdownGracePeriodByPodPriority struct {
+    // priority is the priority value associated with the shutdown grace period
+    Priority int32
+    // shutdownGracePeriodSeconds is the shutdown grace period in seconds
+    ShutdownGracePeriodSeconds int64
+}
+
 type MemorySwapConfiguration struct {
     // swapBehavior configures swap memory available to container workloads. May be one of
     // "", "LimitedSwap": workload combined memory and swap usage cannot exceed pod memory limit
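Each entry's Priority is the lower bound of a priority range; the entry's grace period applies to pods whose priority falls between it and the next higher entry. A hypothetical table using the internal type just defined (the numeric values follow the v1beta1 doc-comment example later in this PR):

periods := []kubeletconfig.ShutdownGracePeriodByPodPriority{
    {Priority: 0, ShutdownGracePeriodSeconds: 30},          // everything below 10000
    {Priority: 10000, ShutdownGracePeriodSeconds: 20},      // 10000 up to critical
    {Priority: 2000000000, ShutdownGracePeriodSeconds: 10}, // system-critical pods
}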
@@ -140,6 +140,16 @@ func RegisterConversions(s *runtime.Scheme) error {
     }); err != nil {
         return err
     }
+    if err := s.AddGeneratedConversionFunc((*v1beta1.ShutdownGracePeriodByPodPriority)(nil), (*config.ShutdownGracePeriodByPodPriority)(nil), func(a, b interface{}, scope conversion.Scope) error {
+        return Convert_v1beta1_ShutdownGracePeriodByPodPriority_To_config_ShutdownGracePeriodByPodPriority(a.(*v1beta1.ShutdownGracePeriodByPodPriority), b.(*config.ShutdownGracePeriodByPodPriority), scope)
+    }); err != nil {
+        return err
+    }
+    if err := s.AddGeneratedConversionFunc((*config.ShutdownGracePeriodByPodPriority)(nil), (*v1beta1.ShutdownGracePeriodByPodPriority)(nil), func(a, b interface{}, scope conversion.Scope) error {
+        return Convert_config_ShutdownGracePeriodByPodPriority_To_v1beta1_ShutdownGracePeriodByPodPriority(a.(*config.ShutdownGracePeriodByPodPriority), b.(*v1beta1.ShutdownGracePeriodByPodPriority), scope)
+    }); err != nil {
+        return err
+    }
     return nil
 }
@@ -381,6 +391,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in
     }
     out.ShutdownGracePeriod = in.ShutdownGracePeriod
     out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods
+    out.ShutdownGracePeriodByPodPriority = *(*[]config.ShutdownGracePeriodByPodPriority)(unsafe.Pointer(&in.ShutdownGracePeriodByPodPriority))
     out.ReservedMemory = *(*[]config.MemoryReservation)(unsafe.Pointer(&in.ReservedMemory))
     if err := v1.Convert_Pointer_bool_To_bool(&in.EnableProfilingHandler, &out.EnableProfilingHandler, s); err != nil {
         return err

@@ -556,6 +567,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in
     }
     out.ShutdownGracePeriod = in.ShutdownGracePeriod
     out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods
+    out.ShutdownGracePeriodByPodPriority = *(*[]v1beta1.ShutdownGracePeriodByPodPriority)(unsafe.Pointer(&in.ShutdownGracePeriodByPodPriority))
     out.ReservedMemory = *(*[]v1beta1.MemoryReservation)(unsafe.Pointer(&in.ReservedMemory))
     if err := v1.Convert_bool_To_Pointer_bool(&in.EnableProfilingHandler, &out.EnableProfilingHandler, s); err != nil {
         return err

@@ -708,3 +720,25 @@ func autoConvert_config_SerializedNodeConfigSource_To_v1beta1_SerializedNodeConf
 func Convert_config_SerializedNodeConfigSource_To_v1beta1_SerializedNodeConfigSource(in *config.SerializedNodeConfigSource, out *v1beta1.SerializedNodeConfigSource, s conversion.Scope) error {
     return autoConvert_config_SerializedNodeConfigSource_To_v1beta1_SerializedNodeConfigSource(in, out, s)
 }
+
+func autoConvert_v1beta1_ShutdownGracePeriodByPodPriority_To_config_ShutdownGracePeriodByPodPriority(in *v1beta1.ShutdownGracePeriodByPodPriority, out *config.ShutdownGracePeriodByPodPriority, s conversion.Scope) error {
+    out.Priority = in.Priority
+    out.ShutdownGracePeriodSeconds = in.ShutdownGracePeriodSeconds
+    return nil
+}
+
+// Convert_v1beta1_ShutdownGracePeriodByPodPriority_To_config_ShutdownGracePeriodByPodPriority is an autogenerated conversion function.
+func Convert_v1beta1_ShutdownGracePeriodByPodPriority_To_config_ShutdownGracePeriodByPodPriority(in *v1beta1.ShutdownGracePeriodByPodPriority, out *config.ShutdownGracePeriodByPodPriority, s conversion.Scope) error {
+    return autoConvert_v1beta1_ShutdownGracePeriodByPodPriority_To_config_ShutdownGracePeriodByPodPriority(in, out, s)
+}
+
+func autoConvert_config_ShutdownGracePeriodByPodPriority_To_v1beta1_ShutdownGracePeriodByPodPriority(in *config.ShutdownGracePeriodByPodPriority, out *v1beta1.ShutdownGracePeriodByPodPriority, s conversion.Scope) error {
+    out.Priority = in.Priority
+    out.ShutdownGracePeriodSeconds = in.ShutdownGracePeriodSeconds
+    return nil
+}
+
+// Convert_config_ShutdownGracePeriodByPodPriority_To_v1beta1_ShutdownGracePeriodByPodPriority is an autogenerated conversion function.
+func Convert_config_ShutdownGracePeriodByPodPriority_To_v1beta1_ShutdownGracePeriodByPodPriority(in *config.ShutdownGracePeriodByPodPriority, out *v1beta1.ShutdownGracePeriodByPodPriority, s conversion.Scope) error {
+    return autoConvert_config_ShutdownGracePeriodByPodPriority_To_v1beta1_ShutdownGracePeriodByPodPriority(in, out, s)
+}
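The generated converters can reinterpret the whole slice in place because the internal and v1beta1 structs have identical field layouts. A self-contained demonstration of that zero-copy cast, with A and B standing in for the real pair:

package main

import (
    "fmt"
    "unsafe"
)

type A struct {
    Priority                   int32
    ShutdownGracePeriodSeconds int64
}

type B struct { // same layout as A, which is what makes the cast below safe
    Priority                   int32
    ShutdownGracePeriodSeconds int64
}

func main() {
    in := []A{{Priority: 10000, ShutdownGracePeriodSeconds: 20}}
    // Reinterpret the slice header: no allocation, no per-element copy.
    out := *(*[]B)(unsafe.Pointer(&in))
    fmt.Println(out[0].Priority, out[0].ShutdownGracePeriodSeconds) // 10000 20
}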
@@ -166,6 +166,16 @@ func ValidateKubeletConfiguration(kc *kubeletconfig.KubeletConfiguration) error
     if (kc.ShutdownGracePeriod.Duration > 0 || kc.ShutdownGracePeriodCriticalPods.Duration > 0) && !localFeatureGate.Enabled(features.GracefulNodeShutdown) {
         allErrors = append(allErrors, fmt.Errorf("invalid configuration: Specifying ShutdownGracePeriod or ShutdownGracePeriodCriticalPods requires feature gate GracefulNodeShutdown"))
     }
+    if localFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority) {
+        if len(kc.ShutdownGracePeriodByPodPriority) != 0 && (kc.ShutdownGracePeriod.Duration > 0 || kc.ShutdownGracePeriodCriticalPods.Duration > 0) {
+            allErrors = append(allErrors, fmt.Errorf("invalid configuration: Cannot specify both shutdownGracePeriodByPodPriority and shutdownGracePeriod at the same time"))
+        }
+    }
+    if !localFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority) {
+        if len(kc.ShutdownGracePeriodByPodPriority) != 0 {
+            allErrors = append(allErrors, fmt.Errorf("invalid configuration: Specifying shutdownGracePeriodByPodPriority requires feature gate GracefulNodeShutdownBasedOnPodPriority"))
+        }
+    }
     if localFeatureGate.Enabled(features.NodeSwap) {
         if kc.MemorySwap.SwapBehavior != "" && kc.MemorySwap.SwapBehavior != kubetypes.LimitedSwap && kc.MemorySwap.SwapBehavior != kubetypes.UnlimitedSwap {
             allErrors = append(allErrors, fmt.Errorf("invalid configuration: MemorySwap.SwapBehavior %v must be one of: LimitedSwap, UnlimitedSwap", kc.MemorySwap.SwapBehavior))
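The two added checks make the priority table usable only behind its gate and mutually exclusive with the legacy periods. A distilled restatement, sketch only: the real function appends to allErrors rather than returning early, and this parameter shape is invented (fmt and time imports assumed):

func validateShutdownConfig(gateEnabled bool, tableLen int, gracePeriod, criticalPeriod time.Duration) error {
    legacySet := gracePeriod > 0 || criticalPeriod > 0
    if gateEnabled && tableLen != 0 && legacySet {
        return fmt.Errorf("cannot specify both shutdownGracePeriodByPodPriority and shutdownGracePeriod")
    }
    if !gateEnabled && tableLen != 0 {
        return fmt.Errorf("shutdownGracePeriodByPodPriority requires GracefulNodeShutdownBasedOnPodPriority")
    }
    return nil
}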
@@ -108,8 +108,9 @@ func TestValidateKubeletConfiguration(t *testing.T) {
     ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0},
     MemoryThrottlingFactor:          utilpointer.Float64Ptr(0.9),
     FeatureGates: map[string]bool{
         "CustomCPUCFSQuotaPeriod": true,
         "MemoryQoS":               true,
+        "GracefulNodeShutdownBasedOnPodPriority": true,
     },
     Logging: componentbaseconfig.LoggingConfiguration{
         Format: "text",

@@ -149,15 +150,22 @@ func TestValidateKubeletConfiguration(t *testing.T) {
     ReservedSystemCPUs:    "0-3",
     TopologyManagerScope:  kubeletconfig.ContainerTopologyManagerScope,
     TopologyManagerPolicy: kubeletconfig.NoneTopologyManagerPolicy,
-    ShutdownGracePeriod:             metav1.Duration{Duration: 10 * time.Minute},
+    ShutdownGracePeriod:             metav1.Duration{Duration: 0},
     ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0},
+    ShutdownGracePeriodByPodPriority: []kubeletconfig.ShutdownGracePeriodByPodPriority{
+        {
+            Priority:                   0,
+            ShutdownGracePeriodSeconds: 10,
+        },
+    },
     MemorySwap:             kubeletconfig.MemorySwapConfiguration{SwapBehavior: kubetypes.UnlimitedSwap},
     MemoryThrottlingFactor: utilpointer.Float64Ptr(0.5),
     FeatureGates: map[string]bool{
         "CustomCPUCFSQuotaPeriod": true,
         "GracefulNodeShutdown":    true,
+        "GracefulNodeShutdownBasedOnPodPriority": true,
         "NodeSwap":  true,
         "MemoryQoS": true,
     },
     Logging: componentbaseconfig.LoggingConfiguration{
         Format: "text",

@@ -194,12 +202,18 @@ func TestValidateKubeletConfiguration(t *testing.T) {
     CPUCFSQuotaPeriod:               metav1.Duration{Duration: 100 * time.Millisecond},
     ShutdownGracePeriod:             metav1.Duration{Duration: 30 * time.Second},
     ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 60 * time.Second},
+    ShutdownGracePeriodByPodPriority: []kubeletconfig.ShutdownGracePeriodByPodPriority{
+        {
+            Priority:                   0,
+            ShutdownGracePeriodSeconds: 10,
+        },
+    },
     Logging: componentbaseconfig.LoggingConfiguration{
         Format: "",
     },
     MemorySwap: kubeletconfig.MemorySwapConfiguration{SwapBehavior: kubetypes.UnlimitedSwap},
 }
-const numErrsErrorCase1 = 30
+const numErrsErrorCase1 = 31
 if allErrors := ValidateKubeletConfiguration(errorCase1); len(allErrors.(utilerrors.Aggregate).Errors()) != numErrsErrorCase1 {
     t.Errorf("expect %d errors, got %v", numErrsErrorCase1, len(allErrors.(utilerrors.Aggregate).Errors()))
 }
pkg/kubelet/apis/config/zz_generated.deepcopy.go (generated, 21 lines changed)
@@ -283,6 +283,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
     in.Logging.DeepCopyInto(&out.Logging)
     out.ShutdownGracePeriod = in.ShutdownGracePeriod
     out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods
+    if in.ShutdownGracePeriodByPodPriority != nil {
+        in, out := &in.ShutdownGracePeriodByPodPriority, &out.ShutdownGracePeriodByPodPriority
+        *out = make([]ShutdownGracePeriodByPodPriority, len(*in))
+        copy(*out, *in)
+    }
     if in.ReservedMemory != nil {
         in, out := &in.ReservedMemory, &out.ReservedMemory
         *out = make([]MemoryReservation, len(*in))

@@ -438,3 +443,19 @@ func (in *SerializedNodeConfigSource) DeepCopyObject() runtime.Object {
     }
     return nil
 }
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *ShutdownGracePeriodByPodPriority) DeepCopyInto(out *ShutdownGracePeriodByPodPriority) {
+    *out = *in
+    return
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ShutdownGracePeriodByPodPriority.
+func (in *ShutdownGracePeriodByPodPriority) DeepCopy() *ShutdownGracePeriodByPodPriority {
+    if in == nil {
+        return nil
+    }
+    out := new(ShutdownGracePeriodByPodPriority)
+    in.DeepCopyInto(out)
+    return out
+}
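copy() is enough in the generated code above because ShutdownGracePeriodByPodPriority holds only value fields (int32, int64), so a shallow element copy is already a deep copy. A quick self-contained check:

package main

import "fmt"

type ShutdownGracePeriodByPodPriority struct {
    Priority                   int32
    ShutdownGracePeriodSeconds int64
}

func main() {
    in := []ShutdownGracePeriodByPodPriority{{Priority: 0, ShutdownGracePeriodSeconds: 30}}
    out := make([]ShutdownGracePeriodByPodPriority, len(in))
    copy(out, in)
    out[0].ShutdownGracePeriodSeconds = 99
    fmt.Println(in[0].ShutdownGracePeriodSeconds) // 30: the copies are independent
}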
@@ -867,14 +867,15 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,

     // setup node shutdown manager
     shutdownManager, shutdownAdmitHandler := nodeshutdown.NewManager(&nodeshutdown.Config{
         ProbeManager:                    klet.probeManager,
         Recorder:                        kubeDeps.Recorder,
         NodeRef:                         nodeRef,
         GetPodsFunc:                     klet.GetActivePods,
         KillPodFunc:                     killPodNow(klet.podWorkers, kubeDeps.Recorder),
         SyncNodeStatusFunc:              klet.syncNodeStatus,
         ShutdownGracePeriodRequested:    kubeCfg.ShutdownGracePeriod.Duration,
         ShutdownGracePeriodCriticalPods: kubeCfg.ShutdownGracePeriodCriticalPods.Duration,
+        ShutdownGracePeriodByPodPriority: kubeCfg.ShutdownGracePeriodByPodPriority,
     })
     klet.shutdownManager = shutdownManager
     klet.admitHandlers.AddPodAdmitHandler(shutdownAdmitHandler)
@@ -21,6 +21,7 @@ import (

     v1 "k8s.io/api/core/v1"
     "k8s.io/client-go/tools/record"
+    kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
     "k8s.io/kubernetes/pkg/kubelet/eviction"
     "k8s.io/kubernetes/pkg/kubelet/lifecycle"
     "k8s.io/kubernetes/pkg/kubelet/prober"

@@ -36,15 +37,16 @@ type Manager interface {

 // Config represents Manager configuration
 type Config struct {
     ProbeManager                    prober.Manager
     Recorder                        record.EventRecorder
     NodeRef                         *v1.ObjectReference
     GetPodsFunc                     eviction.ActivePodsFunc
     KillPodFunc                     eviction.KillPodFunc
     SyncNodeStatusFunc              func()
     ShutdownGracePeriodRequested    time.Duration
     ShutdownGracePeriodCriticalPods time.Duration
+    ShutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority
     Clock                           clock.Clock
 }

 // managerStub is a fake node shutdown managerImpl .
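For reference, a hypothetical caller filling the extended Config, mirroring the NewMainKubelet wiring earlier in this PR (getActivePods, killPod, and syncNodeStatus are placeholder values, not real identifiers):

shutdownManager, admitHandler := nodeshutdown.NewManager(&nodeshutdown.Config{
    GetPodsFunc:                      getActivePods,
    KillPodFunc:                      killPod,
    SyncNodeStatusFunc:               syncNodeStatus,
    ShutdownGracePeriodRequested:     cfg.ShutdownGracePeriod.Duration,
    ShutdownGracePeriodCriticalPods:  cfg.ShutdownGracePeriodCriticalPods.Duration,
    ShutdownGracePeriodByPodPriority: cfg.ShutdownGracePeriodByPodPriority, // the new field
})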
@@ -22,6 +22,7 @@ package nodeshutdown

 import (
     "fmt"
+    "sort"
     "sync"
     "time"

@@ -29,13 +30,14 @@ import (
     utilfeature "k8s.io/apiserver/pkg/util/feature"
     "k8s.io/client-go/tools/record"
     "k8s.io/klog/v2"
+    "k8s.io/kubernetes/pkg/apis/scheduling"
     "k8s.io/kubernetes/pkg/features"
+    kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
     kubeletevents "k8s.io/kubernetes/pkg/kubelet/events"
     "k8s.io/kubernetes/pkg/kubelet/eviction"
     "k8s.io/kubernetes/pkg/kubelet/lifecycle"
     "k8s.io/kubernetes/pkg/kubelet/nodeshutdown/systemd"
     "k8s.io/kubernetes/pkg/kubelet/prober"
-    kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
     "k8s.io/utils/clock"
 )

@@ -66,8 +68,7 @@ type managerImpl struct {
     nodeRef      *v1.ObjectReference
     probeManager prober.Manager

-    shutdownGracePeriodRequested    time.Duration
-    shutdownGracePeriodCriticalPods time.Duration
+    shutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority

     getPods     eviction.ActivePodsFunc
     killPodFunc eviction.KillPodFunc
@@ -84,28 +85,46 @@ type managerImpl struct {

 // NewManager returns a new node shutdown manager.
 func NewManager(conf *Config) (Manager, lifecycle.PodAdmitHandler) {
-    if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdown) ||
-        (conf.ShutdownGracePeriodRequested == 0 && conf.ShutdownGracePeriodCriticalPods == 0) {
+    if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdown) {
         m := managerStub{}
         return m, m
     }
+
+    shutdownGracePeriodByPodPriority := conf.ShutdownGracePeriodByPodPriority
+    // Migration from the original configuration
+    if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority) ||
+        len(shutdownGracePeriodByPodPriority) == 0 {
+        shutdownGracePeriodByPodPriority = migrateConfig(conf.ShutdownGracePeriodRequested, conf.ShutdownGracePeriodCriticalPods)
+    }
+
+    // Disable if the configuration is empty
+    if len(shutdownGracePeriodByPodPriority) == 0 {
+        m := managerStub{}
+        return m, m
+    }
+
+    // Sort by priority from low to high
+    sort.Slice(shutdownGracePeriodByPodPriority, func(i, j int) bool {
+        return shutdownGracePeriodByPodPriority[i].Priority < shutdownGracePeriodByPodPriority[j].Priority
+    })
+
     if conf.Clock == nil {
         conf.Clock = clock.RealClock{}
     }
     manager := &managerImpl{
         probeManager:   conf.ProbeManager,
         recorder:       conf.Recorder,
         nodeRef:        conf.NodeRef,
         getPods:        conf.GetPodsFunc,
         killPodFunc:    conf.KillPodFunc,
         syncNodeStatus: conf.SyncNodeStatusFunc,
-        shutdownGracePeriodRequested:    conf.ShutdownGracePeriodRequested,
-        shutdownGracePeriodCriticalPods: conf.ShutdownGracePeriodCriticalPods,
-        clock: conf.Clock,
+        shutdownGracePeriodByPodPriority: shutdownGracePeriodByPodPriority,
+        clock:                            conf.Clock,
     }
     klog.InfoS("Creating node shutdown manager",
         "shutdownGracePeriodRequested", conf.ShutdownGracePeriodRequested,
         "shutdownGracePeriodCriticalPods", conf.ShutdownGracePeriodCriticalPods,
+        "shutdownGracePeriodByPodPriority", shutdownGracePeriodByPodPriority,
     )
     return manager, manager
 }
@@ -159,9 +178,9 @@ func (m *managerImpl) start() (chan struct{}, error) {
         return nil, err
     }

-    // If the logind's InhibitDelayMaxUSec as configured in (logind.conf) is less than shutdownGracePeriodRequested, attempt to update the value to shutdownGracePeriodRequested.
-    if m.shutdownGracePeriodRequested > currentInhibitDelay {
-        err := m.dbusCon.OverrideInhibitDelay(m.shutdownGracePeriodRequested)
+    // If the logind's InhibitDelayMaxUSec as configured in (logind.conf) is less than periodRequested, attempt to update the value to periodRequested.
+    if periodRequested := m.periodRequested(); periodRequested > currentInhibitDelay {
+        err := m.dbusCon.OverrideInhibitDelay(periodRequested)
         if err != nil {
             return nil, fmt.Errorf("unable to override inhibit delay by shutdown manager: %v", err)
         }

@@ -177,8 +196,8 @@ func (m *managerImpl) start() (chan struct{}, error) {
             return nil, err
         }

-        if m.shutdownGracePeriodRequested > updatedInhibitDelay {
-            return nil, fmt.Errorf("node shutdown manager was unable to update logind InhibitDelayMaxSec to %v (ShutdownGracePeriod), current value of InhibitDelayMaxSec (%v) is less than requested ShutdownGracePeriod", m.shutdownGracePeriodRequested, updatedInhibitDelay)
+        if periodRequested > updatedInhibitDelay {
+            return nil, fmt.Errorf("node shutdown manager was unable to update logind InhibitDelayMaxSec to %v (ShutdownGracePeriod), current value of InhibitDelayMaxSec (%v) is less than requested ShutdownGracePeriod", periodRequested, updatedInhibitDelay)
         }
     }
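The negotiation above, distilled: read logind's current InhibitDelayMaxUSec, override it if the total requested period is larger, then verify the override took. A sketch with the D-Bus calls abstracted behind a callback (the function and parameter names here are invented):

func ensureInhibitDelay(current, requested time.Duration, override func(time.Duration) (time.Duration, error)) error {
    if requested <= current {
        return nil // logind already allows enough delay
    }
    updated, err := override(requested) // write the drop-in config, re-read the value
    if err != nil {
        return err
    }
    if requested > updated {
        return fmt.Errorf("logind InhibitDelayMaxSec %v is still less than requested %v", updated, requested)
    }
    return nil
}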
@@ -270,54 +289,54 @@ func (m *managerImpl) processShutdownEvent() error {
     klog.V(1).InfoS("Shutdown manager processing shutdown event")
     activePods := m.getPods()

-    nonCriticalPodGracePeriod := m.shutdownGracePeriodRequested - m.shutdownGracePeriodCriticalPods
-    var wg sync.WaitGroup
-    wg.Add(len(activePods))
-    for _, pod := range activePods {
-        go func(pod *v1.Pod) {
-            defer wg.Done()
-
-            var gracePeriodOverride int64
-            if kubelettypes.IsCriticalPod(pod) {
-                gracePeriodOverride = int64(m.shutdownGracePeriodCriticalPods.Seconds())
-                m.clock.Sleep(nonCriticalPodGracePeriod)
-            } else {
-                gracePeriodOverride = int64(nonCriticalPodGracePeriod.Seconds())
-            }
-
-            // Stop probes for the pod
-            m.probeManager.RemovePod(pod)
-
-            // If the pod's spec specifies a termination gracePeriod which is less than the gracePeriodOverride calculated, use the pod spec termination gracePeriod.
-            if pod.Spec.TerminationGracePeriodSeconds != nil && *pod.Spec.TerminationGracePeriodSeconds <= gracePeriodOverride {
-                gracePeriodOverride = *pod.Spec.TerminationGracePeriodSeconds
-            }
-
-            klog.V(1).InfoS("Shutdown manager killing pod with gracePeriod", "pod", klog.KObj(pod), "gracePeriod", gracePeriodOverride)
-            if err := m.killPodFunc(pod, false, &gracePeriodOverride, func(status *v1.PodStatus) {
-                status.Message = nodeShutdownMessage
-                status.Reason = nodeShutdownReason
-            }); err != nil {
-                klog.V(1).InfoS("Shutdown manager failed killing pod", "pod", klog.KObj(pod), "err", err)
-            } else {
-                klog.V(1).InfoS("Shutdown manager finished killing pod", "pod", klog.KObj(pod))
-            }
-        }(pod)
-    }
-
-    c := make(chan struct{})
-    go func() {
-        defer close(c)
-        wg.Wait()
-    }()
-
-    // We want to ensure that inhibitLock is released, so only wait up to the shutdownGracePeriodRequested timeout.
-    select {
-    case <-c:
-        break
-    case <-time.After(m.shutdownGracePeriodRequested):
-        klog.V(1).InfoS("Shutdown manager pod killing time out", "gracePeriod", m.shutdownGracePeriodRequested)
-    }
+    groups := groupByPriority(m.shutdownGracePeriodByPodPriority, activePods)
+    for _, group := range groups {
+        // If there are no pods in a particular range,
+        // then do not wait for pods in that priority range.
+        if len(group.Pods) == 0 {
+            continue
+        }
+
+        var wg sync.WaitGroup
+        wg.Add(len(group.Pods))
+        for _, pod := range group.Pods {
+            go func(pod *v1.Pod, group podShutdownGroup) {
+                defer wg.Done()
+
+                gracePeriodOverride := group.ShutdownGracePeriodSeconds
+
+                // Stop probes for the pod
+                m.probeManager.RemovePod(pod)
+
+                // If the pod's spec specifies a termination gracePeriod which is less than the gracePeriodOverride calculated, use the pod spec termination gracePeriod.
+                if pod.Spec.TerminationGracePeriodSeconds != nil && *pod.Spec.TerminationGracePeriodSeconds <= gracePeriodOverride {
+                    gracePeriodOverride = *pod.Spec.TerminationGracePeriodSeconds
+                }
+
+                klog.V(1).InfoS("Shutdown manager killing pod with gracePeriod", "pod", klog.KObj(pod), "gracePeriod", gracePeriodOverride)
+                if err := m.killPodFunc(pod, false, &gracePeriodOverride, func(status *v1.PodStatus) {
+                    status.Message = nodeShutdownMessage
+                    status.Reason = nodeShutdownReason
+                }); err != nil {
+                    klog.V(1).InfoS("Shutdown manager failed killing pod", "pod", klog.KObj(pod), "err", err)
+                } else {
+                    klog.V(1).InfoS("Shutdown manager finished killing pod", "pod", klog.KObj(pod))
+                }
+            }(pod, group)
+        }
+
+        c := make(chan struct{})
+        go func() {
+            defer close(c)
+            wg.Wait()
+        }()
+
+        select {
+        case <-c:
+        case <-time.After(time.Duration(group.ShutdownGracePeriodSeconds) * time.Second):
+            klog.V(1).InfoS("Shutdown manager pod killing time out", "gracePeriod", group.ShutdownGracePeriodSeconds, "priority", group.Priority)
+        }
+    }

     m.dbusCon.ReleaseInhibitLock(m.inhibitLock)
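The per-group bounded wait is the key concurrency pattern here: each kill goroutine signals a WaitGroup, a helper goroutine closes c once all are done, and the select caps the wait at the group's grace period so the inhibit lock is always released. A runnable reduction of just that pattern:

package main

import (
    "fmt"
    "sync"
    "time"
)

func main() {
    var wg sync.WaitGroup
    for i := 0; i < 3; i++ {
        wg.Add(1)
        go func(i int) {
            defer wg.Done()
            time.Sleep(time.Duration(100*i) * time.Millisecond) // stand-in for killPodFunc
        }(i)
    }

    c := make(chan struct{})
    go func() {
        defer close(c)
        wg.Wait()
    }()

    select {
    case <-c:
        fmt.Println("all pods in the group exited")
    case <-time.After(1 * time.Second): // the group's ShutdownGracePeriodSeconds
        fmt.Println("group grace period expired; moving on")
    }
}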
@@ -325,3 +344,78 @@ func (m *managerImpl) processShutdownEvent() error {

     return nil
 }
+
+func (m *managerImpl) periodRequested() time.Duration {
+    var sum int64
+    for _, period := range m.shutdownGracePeriodByPodPriority {
+        sum += period.ShutdownGracePeriodSeconds
+    }
+    return time.Duration(sum) * time.Second
+}
+
+func migrateConfig(shutdownGracePeriodRequested, shutdownGracePeriodCriticalPods time.Duration) []kubeletconfig.ShutdownGracePeriodByPodPriority {
+    if shutdownGracePeriodRequested == 0 {
+        return nil
+    }
+    defaultPriority := shutdownGracePeriodRequested - shutdownGracePeriodCriticalPods
+    if defaultPriority < 0 {
+        return nil
+    }
+    criticalPriority := shutdownGracePeriodRequested - defaultPriority
+    if criticalPriority < 0 {
+        return nil
+    }
+    return []kubeletconfig.ShutdownGracePeriodByPodPriority{
+        {
+            Priority:                   scheduling.DefaultPriorityWhenNoDefaultClassExists,
+            ShutdownGracePeriodSeconds: int64(defaultPriority / time.Second),
+        },
+        {
+            Priority:                   scheduling.SystemCriticalPriority,
+            ShutdownGracePeriodSeconds: int64(criticalPriority / time.Second),
+        },
+    }
+}
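Worked example of the migration arithmetic, matching the first Test_migrateConfig case below: with shutdownGracePeriodRequested = 300s and shutdownGracePeriodCriticalPods = 120s, defaultPriority = 300s - 120s = 180s and criticalPriority = 300s - 180s = 120s (the variables are durations despite their names), so the legacy pair becomes:

[]kubeletconfig.ShutdownGracePeriodByPodPriority{
    {Priority: scheduling.DefaultPriorityWhenNoDefaultClassExists, ShutdownGracePeriodSeconds: 180},
    {Priority: scheduling.SystemCriticalPriority, ShutdownGracePeriodSeconds: 120},
}

The "wrong configuration" guard covers a critical period longer than the total (e.g. 1s requested, 100s critical), which would make defaultPriority negative; migration then returns nil and the manager stays disabled.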
+
+func groupByPriority(shutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority, pods []*v1.Pod) []podShutdownGroup {
+    groups := make([]podShutdownGroup, 0, len(shutdownGracePeriodByPodPriority))
+    for _, period := range shutdownGracePeriodByPodPriority {
+        groups = append(groups, podShutdownGroup{
+            ShutdownGracePeriodByPodPriority: period,
+        })
+    }
+
+    for _, pod := range pods {
+        var priority int32
+        if pod.Spec.Priority != nil {
+            priority = *pod.Spec.Priority
+        }
+
+        // Find the group index according to the priority.
+        index := sort.Search(len(groups), func(i int) bool {
+            return groups[i].Priority >= priority
+        })
+
+        // 1. Those higher than the highest priority default to the highest priority
+        // 2. Those lower than the lowest priority default to the lowest priority
+        // 3. Those boundary priority default to the lower priority
+        // if priority of pod is:
+        //   groups[index-1].Priority <= pod priority < groups[index].Priority
+        // in which case we want to pick lower one (i.e. index-1)
+        if index == len(groups) {
+            index = len(groups) - 1
+        } else if index < 0 {
+            index = 0
+        } else if index > 0 && groups[index].Priority > priority {
+            index--
+        }
+
+        groups[index].Pods = append(groups[index].Pods, pod)
+    }
+    return groups
+}
+
+type podShutdownGroup struct {
+    kubeletconfig.ShutdownGracePeriodByPodPriority
+    Pods []*v1.Pod
+}
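sort.Search plus the clamping rules decide which group a pod joins: below the lowest bound clamps up, above the highest clamps down, and a priority strictly between two bounds takes the lower one. A runnable reduction over the bounds {1, 2, 3, 4} used in Test_groupByPriority below:

package main

import (
    "fmt"
    "sort"
)

func main() {
    bounds := []int32{1, 2, 3, 4} // group lower bounds, sorted ascending
    for _, p := range []int32{0, 1, 2, 5} {
        index := sort.Search(len(bounds), func(i int) bool { return bounds[i] >= p })
        if index == len(bounds) {
            index = len(bounds) - 1 // above the highest bound: last group
        } else if index > 0 && bounds[index] > p {
            index-- // strictly between two bounds: take the lower group
        }
        fmt.Printf("pod priority %d -> group with lower bound %d\n", p, bounds[index])
    }
    // Prints 0->1, 1->1, 2->2, 5->4, matching the test expectations.
}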
@@ -35,6 +35,7 @@ import (
     featuregatetesting "k8s.io/component-base/featuregate/testing"
     "k8s.io/kubernetes/pkg/apis/scheduling"
     pkgfeatures "k8s.io/kubernetes/pkg/features"
+    kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
     "k8s.io/kubernetes/pkg/kubelet/nodeshutdown/systemd"
     probetest "k8s.io/kubernetes/pkg/kubelet/prober/testing"
     testingclock "k8s.io/utils/clock/testing"
@@ -81,12 +82,7 @@ func (f *fakeDbus) OverrideInhibitDelay(inhibitDelayMax time.Duration) error {
     return nil
 }

-func makePod(name string, criticalPod bool, terminationGracePeriod *int64) *v1.Pod {
-    var priority int32
-    if criticalPod {
-        priority = scheduling.SystemCriticalPriority
-    }
-
+func makePod(name string, priority int32, terminationGracePeriod *int64) *v1.Pod {
     return &v1.Pod{
         ObjectMeta: metav1.ObjectMeta{
             Name: name,
@@ -104,15 +100,15 @@ func TestManager(t *testing.T) {
     defer func() {
         systemDbus = systemDbusTmp
     }()
-    normalPodNoGracePeriod := makePod("normal-pod-nil-grace-period", false /* criticalPod */, nil /* terminationGracePeriod */)
-    criticalPodNoGracePeriod := makePod("critical-pod-nil-grace-period", true /* criticalPod */, nil /* terminationGracePeriod */)
+    normalPodNoGracePeriod := makePod("normal-pod-nil-grace-period", scheduling.DefaultPriorityWhenNoDefaultClassExists, nil /* terminationGracePeriod */)
+    criticalPodNoGracePeriod := makePod("critical-pod-nil-grace-period", scheduling.SystemCriticalPriority, nil /* terminationGracePeriod */)

     shortGracePeriod := int64(2)
-    normalPodGracePeriod := makePod("normal-pod-grace-period", false /* criticalPod */, &shortGracePeriod /* terminationGracePeriod */)
-    criticalPodGracePeriod := makePod("critical-pod-grace-period", true /* criticalPod */, &shortGracePeriod /* terminationGracePeriod */)
+    normalPodGracePeriod := makePod("normal-pod-grace-period", scheduling.DefaultPriorityWhenNoDefaultClassExists, &shortGracePeriod /* terminationGracePeriod */)
+    criticalPodGracePeriod := makePod("critical-pod-grace-period", scheduling.SystemCriticalPriority, &shortGracePeriod /* terminationGracePeriod */)

     longGracePeriod := int64(1000)
-    normalPodLongGracePeriod := makePod("normal-pod-long-grace-period", false /* criticalPod */, &longGracePeriod /* terminationGracePeriod */)
+    normalPodLongGracePeriod := makePod("normal-pod-long-grace-period", scheduling.DefaultPriorityWhenNoDefaultClassExists, &longGracePeriod /* terminationGracePeriod */)

     var tests = []struct {
         desc string
@@ -256,7 +252,9 @@ func TestManager(t *testing.T) {
             lock.Unlock()

             if tc.expectedError != nil {
-                if !strings.Contains(err.Error(), tc.expectedError.Error()) {
+                if err == nil {
+                    t.Errorf("unexpected error message. Got: <nil> want %s", tc.expectedError.Error())
+                } else if !strings.Contains(err.Error(), tc.expectedError.Error()) {
                     t.Errorf("unexpected error message. Got: %s want %s", err.Error(), tc.expectedError.Error())
                 }
             } else {
@@ -266,7 +264,11 @@ func TestManager(t *testing.T) {
                 assert.Equal(t, manager.Admit(nil).Admit, true)

                 // Send fake shutdown event
-                fakeShutdownChan <- true
+                select {
+                case fakeShutdownChan <- true:
+                case <-time.After(1 * time.Second):
+                    t.Fatal()
+                }

                 // Wait for all the pods to be killed
                 killedPodsToGracePeriods := map[string]int64{}
@@ -413,3 +415,196 @@ func TestRestart(t *testing.T) {
             shutdownChanMut.Unlock()
         }
     }
 }
+
+func Test_migrateConfig(t *testing.T) {
+    type shutdownConfig struct {
+        shutdownGracePeriodRequested    time.Duration
+        shutdownGracePeriodCriticalPods time.Duration
+    }
+    tests := []struct {
+        name string
+        args shutdownConfig
+        want []kubeletconfig.ShutdownGracePeriodByPodPriority
+    }{
+        {
+            name: "both shutdownGracePeriodRequested and shutdownGracePeriodCriticalPods",
+            args: shutdownConfig{
+                shutdownGracePeriodRequested:    300 * time.Second,
+                shutdownGracePeriodCriticalPods: 120 * time.Second,
+            },
+            want: []kubeletconfig.ShutdownGracePeriodByPodPriority{
+                {
+                    Priority:                   scheduling.DefaultPriorityWhenNoDefaultClassExists,
+                    ShutdownGracePeriodSeconds: 180,
+                },
+                {
+                    Priority:                   scheduling.SystemCriticalPriority,
+                    ShutdownGracePeriodSeconds: 120,
+                },
+            },
+        },
+        {
+            name: "only shutdownGracePeriodRequested",
+            args: shutdownConfig{
+                shutdownGracePeriodRequested:    100 * time.Second,
+                shutdownGracePeriodCriticalPods: 0 * time.Second,
+            },
+            want: []kubeletconfig.ShutdownGracePeriodByPodPriority{
+                {
+                    Priority:                   scheduling.DefaultPriorityWhenNoDefaultClassExists,
+                    ShutdownGracePeriodSeconds: 100,
+                },
+                {
+                    Priority:                   scheduling.SystemCriticalPriority,
+                    ShutdownGracePeriodSeconds: 0,
+                },
+            },
+        },
+        {
+            name: "empty configuration",
+            args: shutdownConfig{
+                shutdownGracePeriodRequested:    0 * time.Second,
+                shutdownGracePeriodCriticalPods: 0 * time.Second,
+            },
+            want: nil,
+        },
+        {
+            name: "wrong configuration",
+            args: shutdownConfig{
+                shutdownGracePeriodRequested:    1 * time.Second,
+                shutdownGracePeriodCriticalPods: 100 * time.Second,
+            },
+            want: nil,
+        },
+    }
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            if got := migrateConfig(tt.args.shutdownGracePeriodRequested, tt.args.shutdownGracePeriodCriticalPods); !assert.Equal(t, tt.want, got) {
+                t.Errorf("migrateConfig() = %v, want %v", got, tt.want)
+            }
+        })
+    }
+}
+
+func Test_groupByPriority(t *testing.T) {
+    type args struct {
+        shutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority
+        pods                             []*v1.Pod
+    }
+    tests := []struct {
+        name string
+        args args
+        want []podShutdownGroup
+    }{
+        {
+            name: "migrate config",
+            args: args{
+                shutdownGracePeriodByPodPriority: migrateConfig(300*time.Second /* shutdownGracePeriodRequested */, 120*time.Second /* shutdownGracePeriodCriticalPods */),
+                pods: []*v1.Pod{
+                    makePod("normal-pod", scheduling.DefaultPriorityWhenNoDefaultClassExists, nil),
+                    makePod("highest-user-definable-pod", scheduling.HighestUserDefinablePriority, nil),
+                    makePod("critical-pod", scheduling.SystemCriticalPriority, nil),
+                },
+            },
+            want: []podShutdownGroup{
+                {
+                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{
+                        Priority:                   scheduling.DefaultPriorityWhenNoDefaultClassExists,
+                        ShutdownGracePeriodSeconds: 180,
+                    },
+                    Pods: []*v1.Pod{
+                        makePod("normal-pod", scheduling.DefaultPriorityWhenNoDefaultClassExists, nil),
+                        makePod("highest-user-definable-pod", scheduling.HighestUserDefinablePriority, nil),
+                    },
+                },
+                {
+                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{
+                        Priority:                   scheduling.SystemCriticalPriority,
+                        ShutdownGracePeriodSeconds: 120,
+                    },
+                    Pods: []*v1.Pod{
+                        makePod("critical-pod", scheduling.SystemCriticalPriority, nil),
+                    },
+                },
+            },
+        },
+        {
+            name: "pod priority",
+            args: args{
+                shutdownGracePeriodByPodPriority: []kubeletconfig.ShutdownGracePeriodByPodPriority{
+                    {Priority: 1, ShutdownGracePeriodSeconds: 10},
+                    {Priority: 2, ShutdownGracePeriodSeconds: 20},
+                    {Priority: 3, ShutdownGracePeriodSeconds: 30},
+                    {Priority: 4, ShutdownGracePeriodSeconds: 40},
+                },
+                pods: []*v1.Pod{
+                    makePod("pod-0", 0, nil),
+                    makePod("pod-1", 1, nil),
+                    makePod("pod-2", 2, nil),
+                    makePod("pod-3", 3, nil),
+                    makePod("pod-4", 4, nil),
+                    makePod("pod-5", 5, nil),
+                },
+            },
+            want: []podShutdownGroup{
+                {
+                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{Priority: 1, ShutdownGracePeriodSeconds: 10},
+                    Pods: []*v1.Pod{
+                        makePod("pod-0", 0, nil),
+                        makePod("pod-1", 1, nil),
+                    },
+                },
+                {
+                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{Priority: 2, ShutdownGracePeriodSeconds: 20},
+                    Pods: []*v1.Pod{
+                        makePod("pod-2", 2, nil),
+                    },
+                },
+                {
+                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{Priority: 3, ShutdownGracePeriodSeconds: 30},
+                    Pods: []*v1.Pod{
+                        makePod("pod-3", 3, nil),
+                    },
+                },
+                {
+                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{Priority: 4, ShutdownGracePeriodSeconds: 40},
+                    Pods: []*v1.Pod{
+                        makePod("pod-4", 4, nil),
+                        makePod("pod-5", 5, nil),
+                    },
+                },
+            },
+        },
+    }
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            if got := groupByPriority(tt.args.shutdownGracePeriodByPodPriority, tt.args.pods); !assert.Equal(t, tt.want, got) {
+                t.Errorf("groupByPriority() = %v, want %v", got, tt.want)
+            }
+        })
+    }
+}
@@ -989,6 +989,35 @@ type KubeletConfiguration struct {
     // +featureGate=GracefulNodeShutdown
     // +optional
     ShutdownGracePeriodCriticalPods metav1.Duration `json:"shutdownGracePeriodCriticalPods,omitempty"`
+    // shutdownGracePeriodByPodPriority specifies the shutdown grace period for Pods based
+    // on their associated priority class value.
+    // When a shutdown request is received, the Kubelet will initiate shutdown on all pods
+    // running on the node with a grace period that depends on the priority of the pod,
+    // and then wait for all pods to exit.
+    // Each entry in the array represents the graceful shutdown time a pod with a priority
+    // class value that lies in the range of that value and the next higher entry in the
+    // list when the node is shutting down.
+    // For example, to allow critical pods 10s to shutdown, priority>=10000 pods 20s to
+    // shutdown, and all remaining pods 30s to shutdown.
+    //
+    // shutdownGracePeriodByPodPriority:
+    //   - priority: 2000000000
+    //     shutdownGracePeriodSeconds: 10
+    //   - priority: 10000
+    //     shutdownGracePeriodSeconds: 20
+    //   - priority: 0
+    //     shutdownGracePeriodSeconds: 30
+    //
+    // The time the Kubelet will wait before exiting will at most be the maximum of all
+    // shutdownGracePeriodSeconds for each priority class range represented on the node.
+    // When all pods have exited or reached their grace periods, the Kubelet will release
+    // the shutdown inhibit lock.
+    // Requires the GracefulNodeShutdown feature gate to be enabled.
+    // This configuration must be empty if either ShutdownGracePeriod or ShutdownGracePeriodCriticalPods is set.
+    // Default: nil
+    // +featureGate=GracefulNodeShutdownBasedOnPodPriority
+    // +optional
+    ShutdownGracePeriodByPodPriority []ShutdownGracePeriodByPodPriority `json:"shutdownGracePeriodByPodPriority,omitempty"`
     // reservedMemory specifies a comma-separated list of memory reservations for NUMA nodes.
     // The parameter makes sense only in the context of the memory manager feature.
     // The memory manager will not allocate reserved memory for container workloads.

@@ -1136,6 +1165,14 @@ type MemoryReservation struct {
     Limits v1.ResourceList `json:"limits"`
 }

+// ShutdownGracePeriodByPodPriority specifies the shutdown grace period for Pods based on their associated priority class value
+type ShutdownGracePeriodByPodPriority struct {
+    // priority is the priority value associated with the shutdown grace period
+    Priority int32 `json:"priority"`
+    // shutdownGracePeriodSeconds is the shutdown grace period in seconds
+    ShutdownGracePeriodSeconds int64 `json:"shutdownGracePeriodSeconds"`
+}
+
 type MemorySwapConfiguration struct {
     // swapBehavior configures swap memory available to container workloads. May be one of
     // "", "LimitedSwap": workload combined memory and swap usage cannot exceed pod memory limit
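The lowercase JSON tags are what users actually write in the kubelet config file (YAML follows the same field names). A self-contained stand-in for the v1beta1 struct showing the wire form:

package main

import (
    "encoding/json"
    "fmt"
)

// Stand-in with the same json tags as the v1beta1 type above.
type ShutdownGracePeriodByPodPriority struct {
    Priority                   int32 `json:"priority"`
    ShutdownGracePeriodSeconds int64 `json:"shutdownGracePeriodSeconds"`
}

func main() {
    table := []ShutdownGracePeriodByPodPriority{
        {Priority: 2000000000, ShutdownGracePeriodSeconds: 10},
        {Priority: 10000, ShutdownGracePeriodSeconds: 20},
        {Priority: 0, ShutdownGracePeriodSeconds: 30},
    }
    b, _ := json.MarshalIndent(table, "", "  ")
    fmt.Println(string(b)) // [{"priority":2000000000,"shutdownGracePeriodSeconds":10}, ...]
}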
@@ -318,6 +318,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
     }
     out.ShutdownGracePeriod = in.ShutdownGracePeriod
     out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods
+    if in.ShutdownGracePeriodByPodPriority != nil {
+        in, out := &in.ShutdownGracePeriodByPodPriority, &out.ShutdownGracePeriodByPodPriority
+        *out = make([]ShutdownGracePeriodByPodPriority, len(*in))
+        copy(*out, *in)
+    }
     if in.ReservedMemory != nil {
         in, out := &in.ReservedMemory, &out.ReservedMemory
         *out = make([]MemoryReservation, len(*in))

@@ -498,3 +503,19 @@ func (in *SerializedNodeConfigSource) DeepCopyObject() runtime.Object {
     }
     return nil
 }
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *ShutdownGracePeriodByPodPriority) DeepCopyInto(out *ShutdownGracePeriodByPodPriority) {
+    *out = *in
+    return
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ShutdownGracePeriodByPodPriority.
+func (in *ShutdownGracePeriodByPodPriority) DeepCopy() *ShutdownGracePeriodByPodPriority {
+    if in == nil {
+        return nil
+    }
+    out := new(ShutdownGracePeriodByPodPriority)
+    in.DeepCopyInto(out)
+    return out
+}
@@ -35,13 +35,15 @@ import (
 	"k8s.io/kubernetes/test/e2e/framework"
 
 	v1 "k8s.io/api/core/v1"
+	schedulingv1 "k8s.io/api/scheduling/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/kubernetes/pkg/features"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
 	testutils "k8s.io/kubernetes/test/utils"
 )
 
-var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShutdown]", func() {
+var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShutdown] [NodeFeature:GracefulNodeShutdownBasedOnPodPriority]", func() {
 	f := framework.NewDefaultFramework("graceful-node-shutdown")
 
 	ginkgo.Context("when gracefully shutting down", func() {
@@ -54,6 +56,10 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut
 		)
 
 		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
+			initialConfig.FeatureGates = map[string]bool{
+				string(features.GracefulNodeShutdown):                   true,
+				string(features.GracefulNodeShutdownBasedOnPodPriority): false,
+			}
 			initialConfig.ShutdownGracePeriod = metav1.Duration{Duration: nodeShutdownGracePeriod}
 			initialConfig.ShutdownGracePeriodCriticalPods = metav1.Duration{Duration: nodeShutdownGracePeriodCriticalPods}
 		})
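The legacy test above pins the new gate to false so the old duration-based path stays covered. For comparison, a hedged sketch of how a caller could enable the priority-based path with the same kubeletconfig API this PR adds — the helper name and bucket values are illustrative, not part of the change:

```go
package config

import (
	"k8s.io/kubernetes/pkg/features"
	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
)

// enablePriorityBasedShutdown is a hypothetical helper: it turns both gates
// on and installs a two-bucket grace-period table (values illustrative).
func enablePriorityBasedShutdown(cfg *kubeletconfig.KubeletConfiguration) {
	cfg.FeatureGates = map[string]bool{
		string(features.GracefulNodeShutdown):                   true,
		string(features.GracefulNodeShutdownBasedOnPodPriority): true,
	}
	cfg.ShutdownGracePeriodByPodPriority = []kubeletconfig.ShutdownGracePeriodByPodPriority{
		{Priority: 0, ShutdownGracePeriodSeconds: 60},
		{Priority: 100000, ShutdownGracePeriodSeconds: 120},
	}
}
```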
@@ -77,10 +83,10 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut
 
 			// Define test pods
 			pods := []*v1.Pod{
-				getGracePeriodOverrideTestPod("period-120", nodeName, 120, false),
-				getGracePeriodOverrideTestPod("period-5", nodeName, 5, false),
-				getGracePeriodOverrideTestPod("period-critical-120", nodeName, 120, true),
-				getGracePeriodOverrideTestPod("period-critical-5", nodeName, 5, true),
+				getGracePeriodOverrideTestPod("period-120", nodeName, 120, ""),
+				getGracePeriodOverrideTestPod("period-5", nodeName, 5, ""),
+				getGracePeriodOverrideTestPod("period-critical-120", nodeName, 120, scheduling.SystemNodeCritical),
+				getGracePeriodOverrideTestPod("period-critical-5", nodeName, 5, scheduling.SystemNodeCritical),
 			}
 
 			ginkgo.By("Creating batch pods")
@@ -117,12 +123,12 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut
 				for _, pod := range list.Items {
 					if kubelettypes.IsCriticalPod(&pod) {
 						if isPodShutdown(&pod) {
-							framework.Logf("Expecting critcal pod to be running, but it's not currently. Pod: %q, Pod Status %+v", pod.Name, pod.Status)
+							framework.Logf("Expecting critical pod to be running, but it's not currently. Pod: %q, Pod Status %+v", pod.Name, pod.Status)
 							return fmt.Errorf("critical pod should not be shutdown, phase: %s", pod.Status.Phase)
 						}
 					} else {
 						if !isPodShutdown(&pod) {
-							framework.Logf("Expecting non-critcal pod to be shutdown, but it's not currently. Pod: %q, Pod Status %+v", pod.Name, pod.Status)
+							framework.Logf("Expecting non-critical pod to be shutdown, but it's not currently. Pod: %q, Pod Status %+v", pod.Name, pod.Status)
 							return fmt.Errorf("pod should be shutdown, phase: %s", pod.Status.Phase)
 						}
 					}
@@ -207,9 +213,185 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut
 			}, nodeStatusUpdateTimeout, pollInterval).Should(gomega.BeNil())
 		})
 	})
 
+	ginkgo.Context("when gracefully shutting down with Pod priority", func() {
+
+		const (
+			pollInterval           = 1 * time.Second
+			podStatusUpdateTimeout = 10 * time.Second
+		)
+
+		var (
+			customClassA = getPriorityClass("custom-class-a", 100000)
+			customClassB = getPriorityClass("custom-class-b", 10000)
+			customClassC = getPriorityClass("custom-class-c", 1000)
+		)
+
+		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
+			initialConfig.FeatureGates = map[string]bool{
+				string(features.GracefulNodeShutdown):                   true,
+				string(features.GracefulNodeShutdownBasedOnPodPriority): true,
+			}
+			initialConfig.ShutdownGracePeriodByPodPriority = []kubeletconfig.ShutdownGracePeriodByPodPriority{
+				{
+					Priority:                   scheduling.SystemCriticalPriority,
+					ShutdownGracePeriodSeconds: int64(podStatusUpdateTimeout / time.Second),
+				},
+				{
+					Priority:                   customClassA.Value,
+					ShutdownGracePeriodSeconds: int64(podStatusUpdateTimeout / time.Second),
+				},
+				{
+					Priority:                   customClassB.Value,
+					ShutdownGracePeriodSeconds: int64(podStatusUpdateTimeout / time.Second),
+				},
+				{
+					Priority:                   customClassC.Value,
+					ShutdownGracePeriodSeconds: int64(podStatusUpdateTimeout / time.Second),
+				},
+				{
+					Priority:                   scheduling.DefaultPriorityWhenNoDefaultClassExists,
+					ShutdownGracePeriodSeconds: int64(podStatusUpdateTimeout / time.Second),
+				},
+			}
+
+		})
+
+		ginkgo.BeforeEach(func() {
+			ginkgo.By("Wait for the node to be ready")
+			waitForNodeReady()
+
+			for _, customClass := range []*schedulingv1.PriorityClass{customClassA, customClassB, customClassC} {
+				_, err := f.ClientSet.SchedulingV1().PriorityClasses().Create(context.Background(), customClass, metav1.CreateOptions{})
+				framework.ExpectNoError(err)
+			}
+		})
+
+		ginkgo.AfterEach(func() {
+			ginkgo.By("Emitting Shutdown false signal; cancelling the shutdown")
+			err := emitSignalPrepareForShutdown(false)
+			framework.ExpectNoError(err)
+		})
+
+		ginkgo.It("should be able to gracefully shutdown pods with various grace periods", func() {
+			nodeName := getNodeName(f)
+			nodeSelector := fields.Set{
+				"spec.nodeName": nodeName,
+			}.AsSelector().String()
+
+			// Define test pods
+			pods := []*v1.Pod{
+				getGracePeriodOverrideTestPod("period-5", nodeName, 5, ""),
+				getGracePeriodOverrideTestPod("period-c-5", nodeName, 5, customClassC.Name),
+				getGracePeriodOverrideTestPod("period-b-5", nodeName, 5, customClassB.Name),
+				getGracePeriodOverrideTestPod("period-a-5", nodeName, 5, customClassA.Name),
+				getGracePeriodOverrideTestPod("period-critical-5", nodeName, 5, scheduling.SystemNodeCritical),
+			}
+
+			// Expected down steps
+			downSteps := [][]string{
+				{
+					"period-5",
+				},
+				{
+					"period-5",
+					"period-c-5",
+				},
+				{
+					"period-5",
+					"period-c-5",
+					"period-b-5",
+				},
+				{
+					"period-5",
+					"period-c-5",
+					"period-b-5",
+					"period-a-5",
+				},
+				{
+					"period-5",
+					"period-c-5",
+					"period-b-5",
+					"period-a-5",
+					"period-critical-5",
+				},
+			}
+
+			ginkgo.By("Creating batch pods")
+			f.PodClient().CreateBatch(pods)
+
+			list, err := f.PodClient().List(context.TODO(), metav1.ListOptions{
+				FieldSelector: nodeSelector,
+			})
+			framework.ExpectNoError(err)
+			framework.ExpectEqual(len(list.Items), len(pods), "the number of pods is not as expected")
+
+			ginkgo.By("Verifying batch pods are running")
+			for _, pod := range list.Items {
+				if podReady, err := testutils.PodRunningReady(&pod); err != nil || !podReady {
+					framework.Failf("Failed to start batch pod: %v", pod.Name)
+				}
+			}
+
+			ginkgo.By("Emitting shutdown signal")
+			err = emitSignalPrepareForShutdown(true)
+			framework.ExpectNoError(err)
+
+			ginkgo.By("Verifying that pods are shutdown")
+
+			for _, step := range downSteps {
+				gomega.Eventually(func() error {
+					list, err = f.PodClient().List(context.TODO(), metav1.ListOptions{
+						FieldSelector: nodeSelector,
+					})
+					if err != nil {
+						return err
+					}
+					framework.ExpectEqual(len(list.Items), len(pods), "the number of pods is not as expected")
+					for _, pod := range list.Items {
+						shouldShutdown := false
+						for _, podName := range step {
+							if podName == pod.Name {
+								shouldShutdown = true
+								break
+							}
+						}
+						if !shouldShutdown {
+							if pod.Status.Phase != v1.PodRunning {
+								framework.Logf("Expecting pod to be running, but it's not currently. Pod: %q, Pod Status Phase: %q, Pod Status Reason: %q", pod.Name, pod.Status.Phase, pod.Status.Reason)
+								return fmt.Errorf("pod should not be shutdown, phase: %s, reason: %s", pod.Status.Phase, pod.Status.Reason)
+							}
+						} else {
+							if pod.Status.Reason != podShutdownReason {
+								framework.Logf("Expecting pod to be shutdown, but it's not currently. Pod: %q, Pod Status Phase: %q, Pod Status Reason: %q", pod.Name, pod.Status.Phase, pod.Status.Reason)
+								for _, item := range list.Items {
+									framework.Logf("DEBUG %s, %s, %s", item.Name, item.Status.Phase, pod.Status.Reason)
+								}
+								return fmt.Errorf("pod should be shutdown, reason: %s", pod.Status.Reason)
+							}
+						}
+					}
+					return nil
+				}, podStatusUpdateTimeout, pollInterval).Should(gomega.BeNil())
+			}
+		})
+	})
 })
 
-func getGracePeriodOverrideTestPod(name string, node string, gracePeriod int64, critical bool) *v1.Pod {
+func getPriorityClass(name string, value int32) *schedulingv1.PriorityClass {
+	priority := &schedulingv1.PriorityClass{
+		TypeMeta: metav1.TypeMeta{
+			Kind:       "PriorityClass",
+			APIVersion: "scheduling.k8s.io/v1",
+		},
+		ObjectMeta: metav1.ObjectMeta{
+			Name: name,
+		},
+		Value: value,
+	}
+	return priority
+}
+
+func getGracePeriodOverrideTestPod(name string, node string, gracePeriod int64, priorityClassName string) *v1.Pod {
 	pod := &v1.Pod{
 		TypeMeta: metav1.TypeMeta{
 			Kind:       "Pod",
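The downSteps table in the hunk above encodes a cumulative ordering: pods go down from lowest to highest priority, and each step equals the previous step plus the next pod in that order. A minimal sketch deriving the same table from the priority ordering (names mirror the test; the derivation itself is illustrative):

```go
package main

import "fmt"

func main() {
	// Pods from lowest to highest priority, matching the test's expectation.
	order := []string{"period-5", "period-c-5", "period-b-5", "period-a-5", "period-critical-5"}

	// Each step is the previous step plus the next pod to go down.
	var downSteps [][]string
	for i := range order {
		downSteps = append(downSteps, append([]string(nil), order[:i+1]...))
	}

	for _, step := range downSteps {
		fmt.Println(step)
	}
}
```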
@@ -238,14 +420,14 @@ while true; do sleep 5; done
 			NodeName: node,
 		},
 	}
-	if critical {
+	if priorityClassName == scheduling.SystemNodeCritical {
 		pod.ObjectMeta.Annotations = map[string]string{
 			kubelettypes.ConfigSourceAnnotationKey: kubelettypes.FileSource,
 		}
-		pod.Spec.PriorityClassName = scheduling.SystemNodeCritical
+		pod.Spec.PriorityClassName = priorityClassName
 
 		framework.ExpectEqual(kubelettypes.IsCriticalPod(pod), true, "pod should be a critical pod")
 	} else {
+		pod.Spec.PriorityClassName = priorityClassName
 		framework.ExpectEqual(kubelettypes.IsCriticalPod(pod), false, "pod should not be a critical pod")
 	}
 	return pod