From d82f6069708ff25ecff48464bee82bc570b55641 Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Wed, 17 Nov 2021 11:46:51 +0800 Subject: [PATCH 1/5] Add field for KubeletConfiguration and Regenerate --- api/api-rules/violation_exceptions.list | 1 + pkg/features/kube_features.go | 7 ++++ pkg/kubelet/apis/config/types.go | 17 +++++++++ .../config/v1beta1/zz_generated.conversion.go | 34 +++++++++++++++++ .../apis/config/zz_generated.deepcopy.go | 21 +++++++++++ .../k8s.io/kubelet/config/v1beta1/types.go | 37 +++++++++++++++++++ .../config/v1beta1/zz_generated.deepcopy.go | 21 +++++++++++ 7 files changed, 138 insertions(+) diff --git a/api/api-rules/violation_exceptions.list b/api/api-rules/violation_exceptions.list index d8ea748b82e..b1279ab3e0a 100644 --- a/api/api-rules/violation_exceptions.list +++ b/api/api-rules/violation_exceptions.list @@ -381,6 +381,7 @@ API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfi API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,EnforceNodeAllocatable API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,RegisterWithTaints API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,ReservedMemory +API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,ShutdownGracePeriodByPodPriority API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,TLSCipherSuites API rule violation: list_type_missing,k8s.io/metrics/pkg/apis/metrics/v1alpha1,PodMetrics,Containers API rule violation: list_type_missing,k8s.io/metrics/pkg/apis/metrics/v1beta1,PodMetrics,Containers diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index a857abcf155..3f130c0196c 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -585,6 +585,12 @@ const ( // Adds support for kubelet to detect node shutdown and gracefully terminate pods prior to the node being shutdown. GracefulNodeShutdown featuregate.Feature = "GracefulNodeShutdown" + // owner: @wzshiming + // alpha: v1.23 + // + // Make the kubelet use shutdown configuration based on pod priority values for graceful shutdown. 
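+	// The per-priority grace periods come from the kubelet's ShutdownGracePeriodByPodPriority
+	// configuration, which must not be combined with ShutdownGracePeriod or
+	// ShutdownGracePeriodCriticalPods.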
+ GracefulNodeShutdownBasedOnPodPriority featuregate.Feature = "GracefulNodeShutdownBasedOnPodPriority" + // owner: @andrewsykim @uablrek // kep: http://kep.k8s.io/1864 // alpha: v1.20 @@ -920,6 +926,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS ExecProbeTimeout: {Default: true, PreRelease: featuregate.GA}, // lock to default and remove after v1.22 based on KEP #1972 update KubeletCredentialProviders: {Default: false, PreRelease: featuregate.Alpha}, GracefulNodeShutdown: {Default: true, PreRelease: featuregate.Beta}, + GracefulNodeShutdownBasedOnPodPriority: {Default: false, PreRelease: featuregate.Alpha}, ServiceLBNodePortControl: {Default: true, PreRelease: featuregate.Beta}, MixedProtocolLBService: {Default: false, PreRelease: featuregate.Alpha}, VolumeCapacityPriority: {Default: false, PreRelease: featuregate.Alpha}, diff --git a/pkg/kubelet/apis/config/types.go b/pkg/kubelet/apis/config/types.go index 4855b3e2a04..bf6557b70a3 100644 --- a/pkg/kubelet/apis/config/types.go +++ b/pkg/kubelet/apis/config/types.go @@ -397,6 +397,15 @@ type KubeletConfiguration struct { // +featureGate=GracefulNodeShutdown // +optional ShutdownGracePeriodCriticalPods metav1.Duration + // ShutdownGracePeriodByPodPriority specifies the shutdown grace period for Pods based + // on their associated priority class value. + // When a shutdown request is received, the Kubelet will initiate shutdown on all pods + // running on the node with a grace period that depends on the priority of the pod, + // and then wait for all pods to exit. + // Each entry in the array represents the graceful shutdown time a pod with a priority + // class value that lies in the range of that value and the next higher entry in the + // list when the node is shutting down. + ShutdownGracePeriodByPodPriority []ShutdownGracePeriodByPodPriority // ReservedMemory specifies a comma-separated list of memory reservations for NUMA nodes. // The parameter makes sense only in the context of the memory manager feature. The memory manager will not allocate reserved memory for container workloads. // For example, if you have a NUMA0 with 10Gi of memory and the ReservedMemory was specified to reserve 1Gi of memory at NUMA0, @@ -595,6 +604,14 @@ type MemoryReservation struct { Limits v1.ResourceList } +// ShutdownGracePeriodByPodPriority specifies the shutdown grace period for Pods based on their associated priority class value +type ShutdownGracePeriodByPodPriority struct { + // priority is the priority value associated with the shutdown grace period + Priority int32 + // shutdownGracePeriodSeconds is the shutdown grace period in seconds + ShutdownGracePeriodSeconds int64 +} + type MemorySwapConfiguration struct { // swapBehavior configures swap memory available to container workloads. 
May be one of // "", "LimitedSwap": workload combined memory and swap usage cannot exceed pod memory limit diff --git a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go index 7c4dbe4203d..95659af0de0 100644 --- a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go +++ b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go @@ -140,6 +140,16 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddGeneratedConversionFunc((*v1beta1.ShutdownGracePeriodByPodPriority)(nil), (*config.ShutdownGracePeriodByPodPriority)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1beta1_ShutdownGracePeriodByPodPriority_To_config_ShutdownGracePeriodByPodPriority(a.(*v1beta1.ShutdownGracePeriodByPodPriority), b.(*config.ShutdownGracePeriodByPodPriority), scope) + }); err != nil { + return err + } + if err := s.AddGeneratedConversionFunc((*config.ShutdownGracePeriodByPodPriority)(nil), (*v1beta1.ShutdownGracePeriodByPodPriority)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_config_ShutdownGracePeriodByPodPriority_To_v1beta1_ShutdownGracePeriodByPodPriority(a.(*config.ShutdownGracePeriodByPodPriority), b.(*v1beta1.ShutdownGracePeriodByPodPriority), scope) + }); err != nil { + return err + } return nil } @@ -381,6 +391,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in } out.ShutdownGracePeriod = in.ShutdownGracePeriod out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods + out.ShutdownGracePeriodByPodPriority = *(*[]config.ShutdownGracePeriodByPodPriority)(unsafe.Pointer(&in.ShutdownGracePeriodByPodPriority)) out.ReservedMemory = *(*[]config.MemoryReservation)(unsafe.Pointer(&in.ReservedMemory)) if err := v1.Convert_Pointer_bool_To_bool(&in.EnableProfilingHandler, &out.EnableProfilingHandler, s); err != nil { return err @@ -556,6 +567,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in } out.ShutdownGracePeriod = in.ShutdownGracePeriod out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods + out.ShutdownGracePeriodByPodPriority = *(*[]v1beta1.ShutdownGracePeriodByPodPriority)(unsafe.Pointer(&in.ShutdownGracePeriodByPodPriority)) out.ReservedMemory = *(*[]v1beta1.MemoryReservation)(unsafe.Pointer(&in.ReservedMemory)) if err := v1.Convert_bool_To_Pointer_bool(&in.EnableProfilingHandler, &out.EnableProfilingHandler, s); err != nil { return err @@ -708,3 +720,25 @@ func autoConvert_config_SerializedNodeConfigSource_To_v1beta1_SerializedNodeConf func Convert_config_SerializedNodeConfigSource_To_v1beta1_SerializedNodeConfigSource(in *config.SerializedNodeConfigSource, out *v1beta1.SerializedNodeConfigSource, s conversion.Scope) error { return autoConvert_config_SerializedNodeConfigSource_To_v1beta1_SerializedNodeConfigSource(in, out, s) } + +func autoConvert_v1beta1_ShutdownGracePeriodByPodPriority_To_config_ShutdownGracePeriodByPodPriority(in *v1beta1.ShutdownGracePeriodByPodPriority, out *config.ShutdownGracePeriodByPodPriority, s conversion.Scope) error { + out.Priority = in.Priority + out.ShutdownGracePeriodSeconds = in.ShutdownGracePeriodSeconds + return nil +} + +// Convert_v1beta1_ShutdownGracePeriodByPodPriority_To_config_ShutdownGracePeriodByPodPriority is an autogenerated conversion function. 
+func Convert_v1beta1_ShutdownGracePeriodByPodPriority_To_config_ShutdownGracePeriodByPodPriority(in *v1beta1.ShutdownGracePeriodByPodPriority, out *config.ShutdownGracePeriodByPodPriority, s conversion.Scope) error { + return autoConvert_v1beta1_ShutdownGracePeriodByPodPriority_To_config_ShutdownGracePeriodByPodPriority(in, out, s) +} + +func autoConvert_config_ShutdownGracePeriodByPodPriority_To_v1beta1_ShutdownGracePeriodByPodPriority(in *config.ShutdownGracePeriodByPodPriority, out *v1beta1.ShutdownGracePeriodByPodPriority, s conversion.Scope) error { + out.Priority = in.Priority + out.ShutdownGracePeriodSeconds = in.ShutdownGracePeriodSeconds + return nil +} + +// Convert_config_ShutdownGracePeriodByPodPriority_To_v1beta1_ShutdownGracePeriodByPodPriority is an autogenerated conversion function. +func Convert_config_ShutdownGracePeriodByPodPriority_To_v1beta1_ShutdownGracePeriodByPodPriority(in *config.ShutdownGracePeriodByPodPriority, out *v1beta1.ShutdownGracePeriodByPodPriority, s conversion.Scope) error { + return autoConvert_config_ShutdownGracePeriodByPodPriority_To_v1beta1_ShutdownGracePeriodByPodPriority(in, out, s) +} diff --git a/pkg/kubelet/apis/config/zz_generated.deepcopy.go b/pkg/kubelet/apis/config/zz_generated.deepcopy.go index f483437bf2d..99c3c339335 100644 --- a/pkg/kubelet/apis/config/zz_generated.deepcopy.go +++ b/pkg/kubelet/apis/config/zz_generated.deepcopy.go @@ -283,6 +283,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { in.Logging.DeepCopyInto(&out.Logging) out.ShutdownGracePeriod = in.ShutdownGracePeriod out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods + if in.ShutdownGracePeriodByPodPriority != nil { + in, out := &in.ShutdownGracePeriodByPodPriority, &out.ShutdownGracePeriodByPodPriority + *out = make([]ShutdownGracePeriodByPodPriority, len(*in)) + copy(*out, *in) + } if in.ReservedMemory != nil { in, out := &in.ReservedMemory, &out.ReservedMemory *out = make([]MemoryReservation, len(*in)) @@ -438,3 +443,19 @@ func (in *SerializedNodeConfigSource) DeepCopyObject() runtime.Object { } return nil } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ShutdownGracePeriodByPodPriority) DeepCopyInto(out *ShutdownGracePeriodByPodPriority) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ShutdownGracePeriodByPodPriority. +func (in *ShutdownGracePeriodByPodPriority) DeepCopy() *ShutdownGracePeriodByPodPriority { + if in == nil { + return nil + } + out := new(ShutdownGracePeriodByPodPriority) + in.DeepCopyInto(out) + return out +} diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/types.go b/staging/src/k8s.io/kubelet/config/v1beta1/types.go index 30eb22f7b77..223a88576c6 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/types.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/types.go @@ -989,6 +989,35 @@ type KubeletConfiguration struct { // +featureGate=GracefulNodeShutdown // +optional ShutdownGracePeriodCriticalPods metav1.Duration `json:"shutdownGracePeriodCriticalPods,omitempty"` + // shutdownGracePeriodByPodPriority specifies the shutdown grace period for Pods based + // on their associated priority class value. + // When a shutdown request is received, the Kubelet will initiate shutdown on all pods + // running on the node with a grace period that depends on the priority of the pod, + // and then wait for all pods to exit. 
+ // Each entry in the array represents the graceful shutdown time a pod with a priority + // class value that lies in the range of that value and the next higher entry in the + // list when the node is shutting down. + // For example, to allow critical pods 10s to shutdown, priority>=10000 pods 20s to + // shutdown, and all remaining pods 30s to shutdown. + // + // shutdownGracePeriodByPodPriority: + // - priority: 2000000000 + // shutdownGracePeriodSeconds: 10 + // - priority: 10000 + // shutdownGracePeriodSeconds: 20 + // - priority: 0 + // shutdownGracePeriodSeconds: 30 + // + // The time the Kubelet will wait before exiting will at most be the maximum of all + // shutdownGracePeriodSeconds for each priority class range represented on the node. + // When all pods have exited or reached their grace periods, the Kubelet will release + // the shutdown inhibit lock. + // Requires the GracefulNodeShutdown feature gate to be enabled. + // This configuration must be empty if either ShutdownGracePeriod or ShutdownGracePeriodCriticalPods is set. + // Default: nil + // +featureGate=GracefulNodeShutdownBasedOnPodPriority + // +optional + ShutdownGracePeriodByPodPriority []ShutdownGracePeriodByPodPriority `json:"shutdownGracePeriodByPodPriority,omitempty"` // reservedMemory specifies a comma-separated list of memory reservations for NUMA nodes. // The parameter makes sense only in the context of the memory manager feature. // The memory manager will not allocate reserved memory for container workloads. @@ -1136,6 +1165,14 @@ type MemoryReservation struct { Limits v1.ResourceList `json:"limits"` } +// ShutdownGracePeriodByPodPriority specifies the shutdown grace period for Pods based on their associated priority class value +type ShutdownGracePeriodByPodPriority struct { + // priority is the priority value associated with the shutdown grace period + Priority int32 `json:"priority"` + // shutdownGracePeriodSeconds is the shutdown grace period in seconds + ShutdownGracePeriodSeconds int64 `json:"shutdownGracePeriodSeconds"` +} + type MemorySwapConfiguration struct { // swapBehavior configures swap memory available to container workloads. May be one of // "", "LimitedSwap": workload combined memory and swap usage cannot exceed pod memory limit diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go b/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go index 114cba363f4..777c0588505 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go @@ -318,6 +318,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { } out.ShutdownGracePeriod = in.ShutdownGracePeriod out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods + if in.ShutdownGracePeriodByPodPriority != nil { + in, out := &in.ShutdownGracePeriodByPodPriority, &out.ShutdownGracePeriodByPodPriority + *out = make([]ShutdownGracePeriodByPodPriority, len(*in)) + copy(*out, *in) + } if in.ReservedMemory != nil { in, out := &in.ReservedMemory, &out.ReservedMemory *out = make([]MemoryReservation, len(*in)) @@ -498,3 +503,19 @@ func (in *SerializedNodeConfigSource) DeepCopyObject() runtime.Object { } return nil } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ShutdownGracePeriodByPodPriority) DeepCopyInto(out *ShutdownGracePeriodByPodPriority) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ShutdownGracePeriodByPodPriority. +func (in *ShutdownGracePeriodByPodPriority) DeepCopy() *ShutdownGracePeriodByPodPriority { + if in == nil { + return nil + } + out := new(ShutdownGracePeriodByPodPriority) + in.DeepCopyInto(out) + return out +} From 545313bdc7393cefb9ffcc8fda3a054a20c6a8d4 Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Wed, 27 Oct 2021 11:11:14 +0800 Subject: [PATCH 2/5] Implement graceful shutdown based on Pod priority --- pkg/kubelet/apis/config/helpers_test.go | 2 + pkg/kubelet/kubelet.go | 17 +- .../nodeshutdown/nodeshutdown_manager.go | 20 +- .../nodeshutdown_manager_linux.go | 214 ++++++++++++----- .../nodeshutdown_manager_linux_test.go | 221 ++++++++++++++++-- 5 files changed, 384 insertions(+), 90 deletions(-) diff --git a/pkg/kubelet/apis/config/helpers_test.go b/pkg/kubelet/apis/config/helpers_test.go index 29361abd450..1a7725c984c 100644 --- a/pkg/kubelet/apis/config/helpers_test.go +++ b/pkg/kubelet/apis/config/helpers_test.go @@ -267,6 +267,8 @@ var ( "SeccompDefault", "SerializeImagePulls", "ShowHiddenMetricsForVersion", + "ShutdownGracePeriodByPodPriority[*].Priority", + "ShutdownGracePeriodByPodPriority[*].ShutdownGracePeriodSeconds", "StreamingConnectionIdleTimeout.Duration", "SyncFrequency.Duration", "SystemCgroups", diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index cbb79413977..2013c871a60 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -867,14 +867,15 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, // setup node shutdown manager shutdownManager, shutdownAdmitHandler := nodeshutdown.NewManager(&nodeshutdown.Config{ - ProbeManager: klet.probeManager, - Recorder: kubeDeps.Recorder, - NodeRef: nodeRef, - GetPodsFunc: klet.GetActivePods, - KillPodFunc: killPodNow(klet.podWorkers, kubeDeps.Recorder), - SyncNodeStatusFunc: klet.syncNodeStatus, - ShutdownGracePeriodRequested: kubeCfg.ShutdownGracePeriod.Duration, - ShutdownGracePeriodCriticalPods: kubeCfg.ShutdownGracePeriodCriticalPods.Duration, + ProbeManager: klet.probeManager, + Recorder: kubeDeps.Recorder, + NodeRef: nodeRef, + GetPodsFunc: klet.GetActivePods, + KillPodFunc: killPodNow(klet.podWorkers, kubeDeps.Recorder), + SyncNodeStatusFunc: klet.syncNodeStatus, + ShutdownGracePeriodRequested: kubeCfg.ShutdownGracePeriod.Duration, + ShutdownGracePeriodCriticalPods: kubeCfg.ShutdownGracePeriodCriticalPods.Duration, + ShutdownGracePeriodByPodPriority: kubeCfg.ShutdownGracePeriodByPodPriority, }) klet.shutdownManager = shutdownManager klet.admitHandlers.AddPodAdmitHandler(shutdownAdmitHandler) diff --git a/pkg/kubelet/nodeshutdown/nodeshutdown_manager.go b/pkg/kubelet/nodeshutdown/nodeshutdown_manager.go index 4467ef773b5..4e4cfbfe5e9 100644 --- a/pkg/kubelet/nodeshutdown/nodeshutdown_manager.go +++ b/pkg/kubelet/nodeshutdown/nodeshutdown_manager.go @@ -21,6 +21,7 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/client-go/tools/record" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" "k8s.io/kubernetes/pkg/kubelet/eviction" "k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/prober" @@ -36,15 +37,16 @@ type Manager interface { // Config represents Manager configuration type Config struct { - ProbeManager prober.Manager - Recorder record.EventRecorder - NodeRef *v1.ObjectReference - GetPodsFunc 
eviction.ActivePodsFunc - KillPodFunc eviction.KillPodFunc - SyncNodeStatusFunc func() - ShutdownGracePeriodRequested time.Duration - ShutdownGracePeriodCriticalPods time.Duration - Clock clock.Clock + ProbeManager prober.Manager + Recorder record.EventRecorder + NodeRef *v1.ObjectReference + GetPodsFunc eviction.ActivePodsFunc + KillPodFunc eviction.KillPodFunc + SyncNodeStatusFunc func() + ShutdownGracePeriodRequested time.Duration + ShutdownGracePeriodCriticalPods time.Duration + ShutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority + Clock clock.Clock } // managerStub is a fake node shutdown managerImpl . diff --git a/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go index bc161387f95..bfbb841a1fa 100644 --- a/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go +++ b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go @@ -22,6 +22,7 @@ package nodeshutdown import ( "fmt" + "sort" "sync" "time" @@ -29,13 +30,14 @@ import ( utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" + "k8s.io/kubernetes/pkg/apis/scheduling" "k8s.io/kubernetes/pkg/features" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" kubeletevents "k8s.io/kubernetes/pkg/kubelet/events" "k8s.io/kubernetes/pkg/kubelet/eviction" "k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/nodeshutdown/systemd" "k8s.io/kubernetes/pkg/kubelet/prober" - kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/utils/clock" ) @@ -66,8 +68,7 @@ type managerImpl struct { nodeRef *v1.ObjectReference probeManager prober.Manager - shutdownGracePeriodRequested time.Duration - shutdownGracePeriodCriticalPods time.Duration + shutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority getPods eviction.ActivePodsFunc killPodFunc eviction.KillPodFunc @@ -84,28 +85,46 @@ type managerImpl struct { // NewManager returns a new node shutdown manager. 
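+// If the GracefulNodeShutdownBasedOnPodPriority feature gate is disabled, or no
+// per-priority configuration is provided, the legacy ShutdownGracePeriod and
+// ShutdownGracePeriodCriticalPods settings are migrated into an equivalent
+// two-entry priority table (see migrateConfig below); if the resulting
+// configuration is empty, a no-op stub manager is returned instead.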
func NewManager(conf *Config) (Manager, lifecycle.PodAdmitHandler) { - if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdown) || - (conf.ShutdownGracePeriodRequested == 0 && conf.ShutdownGracePeriodCriticalPods == 0) { + if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdown) { m := managerStub{} return m, m } + + shutdownGracePeriodByPodPriority := conf.ShutdownGracePeriodByPodPriority + // Migration from the original configuration + if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority) || + len(shutdownGracePeriodByPodPriority) == 0 { + shutdownGracePeriodByPodPriority = migrateConfig(conf.ShutdownGracePeriodRequested, conf.ShutdownGracePeriodCriticalPods) + } + + // Disable if the configuration is empty + if len(shutdownGracePeriodByPodPriority) == 0 { + m := managerStub{} + return m, m + } + + // Sort by priority from low to high + sort.Slice(shutdownGracePeriodByPodPriority, func(i, j int) bool { + return shutdownGracePeriodByPodPriority[i].Priority < shutdownGracePeriodByPodPriority[j].Priority + }) + if conf.Clock == nil { conf.Clock = clock.RealClock{} } manager := &managerImpl{ - probeManager: conf.ProbeManager, - recorder: conf.Recorder, - nodeRef: conf.NodeRef, - getPods: conf.GetPodsFunc, - killPodFunc: conf.KillPodFunc, - syncNodeStatus: conf.SyncNodeStatusFunc, - shutdownGracePeriodRequested: conf.ShutdownGracePeriodRequested, - shutdownGracePeriodCriticalPods: conf.ShutdownGracePeriodCriticalPods, - clock: conf.Clock, + probeManager: conf.ProbeManager, + recorder: conf.Recorder, + nodeRef: conf.NodeRef, + getPods: conf.GetPodsFunc, + killPodFunc: conf.KillPodFunc, + syncNodeStatus: conf.SyncNodeStatusFunc, + shutdownGracePeriodByPodPriority: shutdownGracePeriodByPodPriority, + clock: conf.Clock, } klog.InfoS("Creating node shutdown manager", "shutdownGracePeriodRequested", conf.ShutdownGracePeriodRequested, "shutdownGracePeriodCriticalPods", conf.ShutdownGracePeriodCriticalPods, + "shutdownGracePeriodByPodPriority", shutdownGracePeriodByPodPriority, ) return manager, manager } @@ -159,9 +178,9 @@ func (m *managerImpl) start() (chan struct{}, error) { return nil, err } - // If the logind's InhibitDelayMaxUSec as configured in (logind.conf) is less than shutdownGracePeriodRequested, attempt to update the value to shutdownGracePeriodRequested. - if m.shutdownGracePeriodRequested > currentInhibitDelay { - err := m.dbusCon.OverrideInhibitDelay(m.shutdownGracePeriodRequested) + // If the logind's InhibitDelayMaxUSec as configured in (logind.conf) is less than periodRequested, attempt to update the value to periodRequested. 
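+	// periodRequested() is the sum of all configured per-priority grace periods,
+	// i.e. the worst-case total time the kubelet may hold the shutdown inhibit lock
+	// while draining each priority group in turn.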
+ if periodRequested := m.periodRequested(); periodRequested > currentInhibitDelay { + err := m.dbusCon.OverrideInhibitDelay(periodRequested) if err != nil { return nil, fmt.Errorf("unable to override inhibit delay by shutdown manager: %v", err) } @@ -177,8 +196,8 @@ func (m *managerImpl) start() (chan struct{}, error) { return nil, err } - if m.shutdownGracePeriodRequested > updatedInhibitDelay { - return nil, fmt.Errorf("node shutdown manager was unable to update logind InhibitDelayMaxSec to %v (ShutdownGracePeriod), current value of InhibitDelayMaxSec (%v) is less than requested ShutdownGracePeriod", m.shutdownGracePeriodRequested, updatedInhibitDelay) + if periodRequested > updatedInhibitDelay { + return nil, fmt.Errorf("node shutdown manager was unable to update logind InhibitDelayMaxSec to %v (ShutdownGracePeriod), current value of InhibitDelayMaxSec (%v) is less than requested ShutdownGracePeriod", periodRequested, updatedInhibitDelay) } } @@ -270,54 +289,54 @@ func (m *managerImpl) processShutdownEvent() error { klog.V(1).InfoS("Shutdown manager processing shutdown event") activePods := m.getPods() - nonCriticalPodGracePeriod := m.shutdownGracePeriodRequested - m.shutdownGracePeriodCriticalPods + groups := groupByPriority(m.shutdownGracePeriodByPodPriority, activePods) + for _, group := range groups { + // If there are no pods in a particular range, + // then do not wait for pods in that priority range. + if len(group.Pods) == 0 { + continue + } - var wg sync.WaitGroup - wg.Add(len(activePods)) - for _, pod := range activePods { - go func(pod *v1.Pod) { - defer wg.Done() + var wg sync.WaitGroup + wg.Add(len(group.Pods)) + for _, pod := range group.Pods { + go func(pod *v1.Pod, group podShutdownGroup) { + defer wg.Done() - var gracePeriodOverride int64 - if kubelettypes.IsCriticalPod(pod) { - gracePeriodOverride = int64(m.shutdownGracePeriodCriticalPods.Seconds()) - m.clock.Sleep(nonCriticalPodGracePeriod) - } else { - gracePeriodOverride = int64(nonCriticalPodGracePeriod.Seconds()) - } + gracePeriodOverride := group.ShutdownGracePeriodSeconds - // Stop probes for the pod - m.probeManager.RemovePod(pod) + // Stop probes for the pod + m.probeManager.RemovePod(pod) - // If the pod's spec specifies a termination gracePeriod which is less than the gracePeriodOverride calculated, use the pod spec termination gracePeriod. - if pod.Spec.TerminationGracePeriodSeconds != nil && *pod.Spec.TerminationGracePeriodSeconds <= gracePeriodOverride { - gracePeriodOverride = *pod.Spec.TerminationGracePeriodSeconds - } + // If the pod's spec specifies a termination gracePeriod which is less than the gracePeriodOverride calculated, use the pod spec termination gracePeriod. 
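+				// In effect the grace period applied is min(group ShutdownGracePeriodSeconds,
+				// pod terminationGracePeriodSeconds) whenever the pod specifies one.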
+ if pod.Spec.TerminationGracePeriodSeconds != nil && *pod.Spec.TerminationGracePeriodSeconds <= gracePeriodOverride { + gracePeriodOverride = *pod.Spec.TerminationGracePeriodSeconds + } - klog.V(1).InfoS("Shutdown manager killing pod with gracePeriod", "pod", klog.KObj(pod), "gracePeriod", gracePeriodOverride) - if err := m.killPodFunc(pod, false, &gracePeriodOverride, func(status *v1.PodStatus) { - status.Message = nodeShutdownMessage - status.Reason = nodeShutdownReason - }); err != nil { - klog.V(1).InfoS("Shutdown manager failed killing pod", "pod", klog.KObj(pod), "err", err) - } else { - klog.V(1).InfoS("Shutdown manager finished killing pod", "pod", klog.KObj(pod)) - } - }(pod) - } + klog.V(1).InfoS("Shutdown manager killing pod with gracePeriod", "pod", klog.KObj(pod), "gracePeriod", gracePeriodOverride) - c := make(chan struct{}) - go func() { - defer close(c) - wg.Wait() - }() + if err := m.killPodFunc(pod, false, &gracePeriodOverride, func(status *v1.PodStatus) { + status.Message = nodeShutdownMessage + status.Reason = nodeShutdownReason + }); err != nil { + klog.V(1).InfoS("Shutdown manager failed killing pod", "pod", klog.KObj(pod), "err", err) + } else { + klog.V(1).InfoS("Shutdown manager finished killing pod", "pod", klog.KObj(pod)) + } + }(pod, group) + } - // We want to ensure that inhibitLock is released, so only wait up to the shutdownGracePeriodRequested timeout. - select { - case <-c: - break - case <-time.After(m.shutdownGracePeriodRequested): - klog.V(1).InfoS("Shutdown manager pod killing time out", "gracePeriod", m.shutdownGracePeriodRequested) + c := make(chan struct{}) + go func() { + defer close(c) + wg.Wait() + }() + + select { + case <-c: + case <-time.After(time.Duration(group.ShutdownGracePeriodSeconds) * time.Second): + klog.V(1).InfoS("Shutdown manager pod killing time out", "gracePeriod", group.ShutdownGracePeriodSeconds, "priority", group.Priority) + } } m.dbusCon.ReleaseInhibitLock(m.inhibitLock) @@ -325,3 +344,78 @@ func (m *managerImpl) processShutdownEvent() error { return nil } + +func (m *managerImpl) periodRequested() time.Duration { + var sum int64 + for _, period := range m.shutdownGracePeriodByPodPriority { + sum += period.ShutdownGracePeriodSeconds + } + return time.Duration(sum) * time.Second +} + +func migrateConfig(shutdownGracePeriodRequested, shutdownGracePeriodCriticalPods time.Duration) []kubeletconfig.ShutdownGracePeriodByPodPriority { + if shutdownGracePeriodRequested == 0 { + return nil + } + defaultPriority := shutdownGracePeriodRequested - shutdownGracePeriodCriticalPods + if defaultPriority < 0 { + return nil + } + criticalPriority := shutdownGracePeriodRequested - defaultPriority + if criticalPriority < 0 { + return nil + } + return []kubeletconfig.ShutdownGracePeriodByPodPriority{ + { + Priority: scheduling.DefaultPriorityWhenNoDefaultClassExists, + ShutdownGracePeriodSeconds: int64(defaultPriority / time.Second), + }, + { + Priority: scheduling.SystemCriticalPriority, + ShutdownGracePeriodSeconds: int64(criticalPriority / time.Second), + }, + } +} + +func groupByPriority(shutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority, pods []*v1.Pod) []podShutdownGroup { + groups := make([]podShutdownGroup, 0, len(shutdownGracePeriodByPodPriority)) + for _, period := range shutdownGracePeriodByPodPriority { + groups = append(groups, podShutdownGroup{ + ShutdownGracePeriodByPodPriority: period, + }) + } + + for _, pod := range pods { + var priority int32 + if pod.Spec.Priority != nil { + priority = 
*pod.Spec.Priority + } + + // Find the group index according to the priority. + index := sort.Search(len(groups), func(i int) bool { + return groups[i].Priority >= priority + }) + + // 1. Those higher than the highest priority default to the highest priority + // 2. Those lower than the lowest priority default to the lowest priority + // 3. Those boundary priority default to the lower priority + // if priority of pod is: + // groups[index-1].Priority <= pod priority < groups[index].Priority + // in which case we want to pick lower one (i.e index-1) + if index == len(groups) { + index = len(groups) - 1 + } else if index < 0 { + index = 0 + } else if index > 0 && groups[index].Priority > priority { + index-- + } + + groups[index].Pods = append(groups[index].Pods, pod) + } + return groups +} + +type podShutdownGroup struct { + kubeletconfig.ShutdownGracePeriodByPodPriority + Pods []*v1.Pod +} diff --git a/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go index dc0fd9bf10f..028364c2367 100644 --- a/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go +++ b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go @@ -35,6 +35,7 @@ import ( featuregatetesting "k8s.io/component-base/featuregate/testing" "k8s.io/kubernetes/pkg/apis/scheduling" pkgfeatures "k8s.io/kubernetes/pkg/features" + kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" "k8s.io/kubernetes/pkg/kubelet/nodeshutdown/systemd" probetest "k8s.io/kubernetes/pkg/kubelet/prober/testing" testingclock "k8s.io/utils/clock/testing" @@ -81,12 +82,7 @@ func (f *fakeDbus) OverrideInhibitDelay(inhibitDelayMax time.Duration) error { return nil } -func makePod(name string, criticalPod bool, terminationGracePeriod *int64) *v1.Pod { - var priority int32 - if criticalPod { - priority = scheduling.SystemCriticalPriority - } - +func makePod(name string, priority int32, terminationGracePeriod *int64) *v1.Pod { return &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: name, @@ -104,15 +100,15 @@ func TestManager(t *testing.T) { defer func() { systemDbus = systemDbusTmp }() - normalPodNoGracePeriod := makePod("normal-pod-nil-grace-period", false /* criticalPod */, nil /* terminationGracePeriod */) - criticalPodNoGracePeriod := makePod("critical-pod-nil-grace-period", true /* criticalPod */, nil /* terminationGracePeriod */) + normalPodNoGracePeriod := makePod("normal-pod-nil-grace-period", scheduling.DefaultPriorityWhenNoDefaultClassExists, nil /* terminationGracePeriod */) + criticalPodNoGracePeriod := makePod("critical-pod-nil-grace-period", scheduling.SystemCriticalPriority, nil /* terminationGracePeriod */) shortGracePeriod := int64(2) - normalPodGracePeriod := makePod("normal-pod-grace-period", false /* criticalPod */, &shortGracePeriod /* terminationGracePeriod */) - criticalPodGracePeriod := makePod("critical-pod-grace-period", true /* criticalPod */, &shortGracePeriod /* terminationGracePeriod */) + normalPodGracePeriod := makePod("normal-pod-grace-period", scheduling.DefaultPriorityWhenNoDefaultClassExists, &shortGracePeriod /* terminationGracePeriod */) + criticalPodGracePeriod := makePod("critical-pod-grace-period", scheduling.SystemCriticalPriority, &shortGracePeriod /* terminationGracePeriod */) longGracePeriod := int64(1000) - normalPodLongGracePeriod := makePod("normal-pod-long-grace-period", false /* criticalPod */, &longGracePeriod /* terminationGracePeriod */) + normalPodLongGracePeriod := makePod("normal-pod-long-grace-period", 
scheduling.DefaultPriorityWhenNoDefaultClassExists, &longGracePeriod /* terminationGracePeriod */) var tests = []struct { desc string @@ -256,7 +252,9 @@ func TestManager(t *testing.T) { lock.Unlock() if tc.expectedError != nil { - if !strings.Contains(err.Error(), tc.expectedError.Error()) { + if err == nil { + t.Errorf("unexpected error message. Got: want %s", tc.expectedError.Error()) + } else if !strings.Contains(err.Error(), tc.expectedError.Error()) { t.Errorf("unexpected error message. Got: %s want %s", err.Error(), tc.expectedError.Error()) } } else { @@ -266,7 +264,11 @@ func TestManager(t *testing.T) { assert.Equal(t, manager.Admit(nil).Admit, true) // Send fake shutdown event - fakeShutdownChan <- true + select { + case fakeShutdownChan <- true: + case <-time.After(1 * time.Second): + t.Fatal() + } // Wait for all the pods to be killed killedPodsToGracePeriods := map[string]int64{} @@ -413,3 +415,196 @@ func TestRestart(t *testing.T) { shutdownChanMut.Unlock() } } + +func Test_migrateConfig(t *testing.T) { + type shutdownConfig struct { + shutdownGracePeriodRequested time.Duration + shutdownGracePeriodCriticalPods time.Duration + } + tests := []struct { + name string + args shutdownConfig + want []kubeletconfig.ShutdownGracePeriodByPodPriority + }{ + { + name: "both shutdownGracePeriodRequested and shutdownGracePeriodCriticalPods", + args: shutdownConfig{ + shutdownGracePeriodRequested: 300 * time.Second, + shutdownGracePeriodCriticalPods: 120 * time.Second, + }, + want: []kubeletconfig.ShutdownGracePeriodByPodPriority{ + { + Priority: scheduling.DefaultPriorityWhenNoDefaultClassExists, + ShutdownGracePeriodSeconds: 180, + }, + { + Priority: scheduling.SystemCriticalPriority, + ShutdownGracePeriodSeconds: 120, + }, + }, + }, + { + name: "only shutdownGracePeriodRequested", + args: shutdownConfig{ + shutdownGracePeriodRequested: 100 * time.Second, + shutdownGracePeriodCriticalPods: 0 * time.Second, + }, + want: []kubeletconfig.ShutdownGracePeriodByPodPriority{ + { + Priority: scheduling.DefaultPriorityWhenNoDefaultClassExists, + ShutdownGracePeriodSeconds: 100, + }, + { + Priority: scheduling.SystemCriticalPriority, + ShutdownGracePeriodSeconds: 0, + }, + }, + }, + { + name: "empty configuration", + args: shutdownConfig{ + shutdownGracePeriodRequested: 0 * time.Second, + shutdownGracePeriodCriticalPods: 0 * time.Second, + }, + want: nil, + }, + { + name: "wrong configuration", + args: shutdownConfig{ + shutdownGracePeriodRequested: 1 * time.Second, + shutdownGracePeriodCriticalPods: 100 * time.Second, + }, + want: nil, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := migrateConfig(tt.args.shutdownGracePeriodRequested, tt.args.shutdownGracePeriodCriticalPods); !assert.Equal(t, tt.want, got) { + t.Errorf("migrateConfig() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_groupByPriority(t *testing.T) { + type args struct { + shutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority + pods []*v1.Pod + } + tests := []struct { + name string + args args + want []podShutdownGroup + }{ + { + name: "migrate config", + args: args{ + shutdownGracePeriodByPodPriority: migrateConfig(300*time.Second /* shutdownGracePeriodRequested */, 120*time.Second /* shutdownGracePeriodCriticalPods */), + pods: []*v1.Pod{ + makePod("normal-pod", scheduling.DefaultPriorityWhenNoDefaultClassExists, nil), + makePod("highest-user-definable-pod", scheduling.HighestUserDefinablePriority, nil), + makePod("critical-pod", 
scheduling.SystemCriticalPriority, nil), + }, + }, + want: []podShutdownGroup{ + { + ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{ + Priority: scheduling.DefaultPriorityWhenNoDefaultClassExists, + ShutdownGracePeriodSeconds: 180, + }, + Pods: []*v1.Pod{ + makePod("normal-pod", scheduling.DefaultPriorityWhenNoDefaultClassExists, nil), + makePod("highest-user-definable-pod", scheduling.HighestUserDefinablePriority, nil), + }, + }, + { + ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{ + Priority: scheduling.SystemCriticalPriority, + ShutdownGracePeriodSeconds: 120, + }, + Pods: []*v1.Pod{ + makePod("critical-pod", scheduling.SystemCriticalPriority, nil), + }, + }, + }, + }, + { + name: "pod priority", + args: args{ + shutdownGracePeriodByPodPriority: []kubeletconfig.ShutdownGracePeriodByPodPriority{ + { + Priority: 1, + ShutdownGracePeriodSeconds: 10, + }, + { + Priority: 2, + ShutdownGracePeriodSeconds: 20, + }, + { + Priority: 3, + ShutdownGracePeriodSeconds: 30, + }, + { + Priority: 4, + ShutdownGracePeriodSeconds: 40, + }, + }, + pods: []*v1.Pod{ + makePod("pod-0", 0, nil), + makePod("pod-1", 1, nil), + makePod("pod-2", 2, nil), + makePod("pod-3", 3, nil), + makePod("pod-4", 4, nil), + makePod("pod-5", 5, nil), + }, + }, + want: []podShutdownGroup{ + { + ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{ + Priority: 1, + ShutdownGracePeriodSeconds: 10, + }, + Pods: []*v1.Pod{ + makePod("pod-0", 0, nil), + makePod("pod-1", 1, nil), + }, + }, + { + ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{ + Priority: 2, + ShutdownGracePeriodSeconds: 20, + }, + Pods: []*v1.Pod{ + makePod("pod-2", 2, nil), + }, + }, + { + ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{ + Priority: 3, + ShutdownGracePeriodSeconds: 30, + }, + Pods: []*v1.Pod{ + makePod("pod-3", 3, nil), + }, + }, + { + ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{ + Priority: 4, + ShutdownGracePeriodSeconds: 40, + }, + Pods: []*v1.Pod{ + makePod("pod-4", 4, nil), + makePod("pod-5", 5, nil), + }, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := groupByPriority(tt.args.shutdownGracePeriodByPodPriority, tt.args.pods); !assert.Equal(t, tt.want, got) { + t.Errorf("groupByPriority() = %v, want %v", got, tt.want) + } + }) + } +} From 7a6f792ff39959d1c00d57dbf969901a7d323df9 Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Thu, 11 Nov 2021 14:19:02 +0800 Subject: [PATCH 3/5] Add validation for GracefulNodeShutdownBasedOnPodPriority Co-authored-by: Elana Hashman --- .../apis/config/validation/validation.go | 10 ++++++ .../apis/config/validation/validation_test.go | 34 +++++++++++++------ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/pkg/kubelet/apis/config/validation/validation.go b/pkg/kubelet/apis/config/validation/validation.go index 585e6639af5..7a89f87ddd7 100644 --- a/pkg/kubelet/apis/config/validation/validation.go +++ b/pkg/kubelet/apis/config/validation/validation.go @@ -166,6 +166,16 @@ func ValidateKubeletConfiguration(kc *kubeletconfig.KubeletConfiguration) error if (kc.ShutdownGracePeriod.Duration > 0 || kc.ShutdownGracePeriodCriticalPods.Duration > 0) && !localFeatureGate.Enabled(features.GracefulNodeShutdown) { allErrors = append(allErrors, fmt.Errorf("invalid configuration: Specifying ShutdownGracePeriod or ShutdownGracePeriodCriticalPods requires feature gate 
GracefulNodeShutdown")) } + if localFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority) { + if len(kc.ShutdownGracePeriodByPodPriority) != 0 && (kc.ShutdownGracePeriod.Duration > 0 || kc.ShutdownGracePeriodCriticalPods.Duration > 0) { + allErrors = append(allErrors, fmt.Errorf("invalid configuration: Cannot specify both shutdownGracePeriodByPodPriority and shutdownGracePeriod at the same time")) + } + } + if !localFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority) { + if len(kc.ShutdownGracePeriodByPodPriority) != 0 { + allErrors = append(allErrors, fmt.Errorf("invalid configuration: Specifying shutdownGracePeriodByPodPriority requires feature gate GracefulNodeShutdownBasedOnPodPriority")) + } + } if localFeatureGate.Enabled(features.NodeSwap) { if kc.MemorySwap.SwapBehavior != "" && kc.MemorySwap.SwapBehavior != kubetypes.LimitedSwap && kc.MemorySwap.SwapBehavior != kubetypes.UnlimitedSwap { allErrors = append(allErrors, fmt.Errorf("invalid configuration: MemorySwap.SwapBehavior %v must be one of: LimitedSwap, UnlimitedSwap", kc.MemorySwap.SwapBehavior)) diff --git a/pkg/kubelet/apis/config/validation/validation_test.go b/pkg/kubelet/apis/config/validation/validation_test.go index eb346dc6a98..2128cbbe423 100644 --- a/pkg/kubelet/apis/config/validation/validation_test.go +++ b/pkg/kubelet/apis/config/validation/validation_test.go @@ -108,8 +108,9 @@ func TestValidateKubeletConfiguration(t *testing.T) { ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0}, MemoryThrottlingFactor: utilpointer.Float64Ptr(0.9), FeatureGates: map[string]bool{ - "CustomCPUCFSQuotaPeriod": true, - "MemoryQoS": true, + "CustomCPUCFSQuotaPeriod": true, + "MemoryQoS": true, + "GracefulNodeShutdownBasedOnPodPriority": true, }, Logging: componentbaseconfig.LoggingConfiguration{ Format: "text", @@ -149,15 +150,22 @@ func TestValidateKubeletConfiguration(t *testing.T) { ReservedSystemCPUs: "0-3", TopologyManagerScope: kubeletconfig.ContainerTopologyManagerScope, TopologyManagerPolicy: kubeletconfig.NoneTopologyManagerPolicy, - ShutdownGracePeriod: metav1.Duration{Duration: 10 * time.Minute}, + ShutdownGracePeriod: metav1.Duration{Duration: 0}, ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0}, - MemorySwap: kubeletconfig.MemorySwapConfiguration{SwapBehavior: kubetypes.UnlimitedSwap}, - MemoryThrottlingFactor: utilpointer.Float64Ptr(0.5), + ShutdownGracePeriodByPodPriority: []kubeletconfig.ShutdownGracePeriodByPodPriority{ + { + Priority: 0, + ShutdownGracePeriodSeconds: 10, + }, + }, + MemorySwap: kubeletconfig.MemorySwapConfiguration{SwapBehavior: kubetypes.UnlimitedSwap}, + MemoryThrottlingFactor: utilpointer.Float64Ptr(0.5), FeatureGates: map[string]bool{ - "CustomCPUCFSQuotaPeriod": true, - "GracefulNodeShutdown": true, - "NodeSwap": true, - "MemoryQoS": true, + "CustomCPUCFSQuotaPeriod": true, + "GracefulNodeShutdown": true, + "GracefulNodeShutdownBasedOnPodPriority": true, + "NodeSwap": true, + "MemoryQoS": true, }, Logging: componentbaseconfig.LoggingConfiguration{ Format: "text", @@ -194,12 +202,18 @@ func TestValidateKubeletConfiguration(t *testing.T) { CPUCFSQuotaPeriod: metav1.Duration{Duration: 100 * time.Millisecond}, ShutdownGracePeriod: metav1.Duration{Duration: 30 * time.Second}, ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 60 * time.Second}, + ShutdownGracePeriodByPodPriority: []kubeletconfig.ShutdownGracePeriodByPodPriority{ + { + Priority: 0, + ShutdownGracePeriodSeconds: 10, + }, + }, Logging: 
componentbaseconfig.LoggingConfiguration{ Format: "", }, MemorySwap: kubeletconfig.MemorySwapConfiguration{SwapBehavior: kubetypes.UnlimitedSwap}, } - const numErrsErrorCase1 = 30 + const numErrsErrorCase1 = 31 if allErrors := ValidateKubeletConfiguration(errorCase1); len(allErrors.(utilerrors.Aggregate).Errors()) != numErrsErrorCase1 { t.Errorf("expect %d errors, got %v", numErrsErrorCase1, len(allErrors.(utilerrors.Aggregate).Errors())) } From df7e4c1a3d7916f37c98265d27a307b4a91fbb28 Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Fri, 12 Nov 2021 16:49:33 +0800 Subject: [PATCH 4/5] Add e2e for GracefulNodeShutdownBasedOnPodPriority --- test/e2e_node/node_shutdown_linux_test.go | 209 ++++++++++++++++++++-- 1 file changed, 198 insertions(+), 11 deletions(-) diff --git a/test/e2e_node/node_shutdown_linux_test.go b/test/e2e_node/node_shutdown_linux_test.go index 558b0cb782a..ff616aeb949 100644 --- a/test/e2e_node/node_shutdown_linux_test.go +++ b/test/e2e_node/node_shutdown_linux_test.go @@ -35,13 +35,15 @@ import ( "k8s.io/kubernetes/test/e2e/framework" v1 "k8s.io/api/core/v1" + schedulingv1 "k8s.io/api/scheduling/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/kubernetes/pkg/features" kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" testutils "k8s.io/kubernetes/test/utils" ) -var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShutdown]", func() { +var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShutdown] [NodeFeature:GracefulNodeShutdownBasedOnPodPriority]", func() { f := framework.NewDefaultFramework("graceful-node-shutdown") ginkgo.Context("when gracefully shutting down", func() { @@ -54,6 +56,10 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut ) tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) { + initialConfig.FeatureGates = map[string]bool{ + string(features.GracefulNodeShutdown): true, + string(features.GracefulNodeShutdownBasedOnPodPriority): false, + } initialConfig.ShutdownGracePeriod = metav1.Duration{Duration: nodeShutdownGracePeriod} initialConfig.ShutdownGracePeriodCriticalPods = metav1.Duration{Duration: nodeShutdownGracePeriodCriticalPods} }) @@ -77,10 +83,10 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut // Define test pods pods := []*v1.Pod{ - getGracePeriodOverrideTestPod("period-120", nodeName, 120, false), - getGracePeriodOverrideTestPod("period-5", nodeName, 5, false), - getGracePeriodOverrideTestPod("period-critical-120", nodeName, 120, true), - getGracePeriodOverrideTestPod("period-critical-5", nodeName, 5, true), + getGracePeriodOverrideTestPod("period-120", nodeName, 120, ""), + getGracePeriodOverrideTestPod("period-5", nodeName, 5, ""), + getGracePeriodOverrideTestPod("period-critical-120", nodeName, 120, scheduling.SystemNodeCritical), + getGracePeriodOverrideTestPod("period-critical-5", nodeName, 5, scheduling.SystemNodeCritical), } ginkgo.By("Creating batch pods") @@ -117,12 +123,12 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut for _, pod := range list.Items { if kubelettypes.IsCriticalPod(&pod) { if isPodShutdown(&pod) { - framework.Logf("Expecting critcal pod to be running, but it's not currently. Pod: %q, Pod Status %+v", pod.Name, pod.Status) + framework.Logf("Expecting critical pod to be running, but it's not currently. 
Pod: %q, Pod Status %+v", pod.Name, pod.Status) return fmt.Errorf("critical pod should not be shutdown, phase: %s", pod.Status.Phase) } } else { if !isPodShutdown(&pod) { - framework.Logf("Expecting non-critcal pod to be shutdown, but it's not currently. Pod: %q, Pod Status %+v", pod.Name, pod.Status) + framework.Logf("Expecting non-critical pod to be shutdown, but it's not currently. Pod: %q, Pod Status %+v", pod.Name, pod.Status) return fmt.Errorf("pod should be shutdown, phase: %s", pod.Status.Phase) } } @@ -207,9 +213,190 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut }, nodeStatusUpdateTimeout, pollInterval).Should(gomega.BeNil()) }) }) + + ginkgo.Context("when gracefully shutting down with Pod priority", func() { + + const ( + pollInterval = 1 * time.Second + podStatusUpdateTimeout = 10 * time.Second + ) + + var ( + customClassA = getPriorityClass("custom-class-a", 100000) + customClassB = getPriorityClass("custom-class-b", 10000) + customClassC = getPriorityClass("custom-class-c", 1000) + ) + + tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) { + initialConfig.FeatureGates = map[string]bool{ + string(features.GracefulNodeShutdown): true, + string(features.GracefulNodeShutdownBasedOnPodPriority): true, + } + initialConfig.ShutdownGracePeriodByPodPriority = []kubeletconfig.ShutdownGracePeriodByPodPriority{ + { + Priority: scheduling.SystemCriticalPriority, + ShutdownGracePeriodSeconds: int64(podStatusUpdateTimeout / time.Second), + }, + { + Priority: customClassA.Value, + ShutdownGracePeriodSeconds: int64(podStatusUpdateTimeout / time.Second), + }, + { + Priority: customClassB.Value, + ShutdownGracePeriodSeconds: int64(podStatusUpdateTimeout / time.Second), + }, + { + Priority: customClassC.Value, + ShutdownGracePeriodSeconds: int64(podStatusUpdateTimeout / time.Second), + }, + { + Priority: scheduling.DefaultPriorityWhenNoDefaultClassExists, + ShutdownGracePeriodSeconds: int64(podStatusUpdateTimeout / time.Second), + }, + } + + }) + + ginkgo.BeforeEach(func() { + ginkgo.By("Wait for the node to be ready") + waitForNodeReady() + + for _, customClass := range []*schedulingv1.PriorityClass{customClassA, customClassB, customClassC} { + _, err := f.ClientSet.SchedulingV1().PriorityClasses().Create(context.Background(), customClass, metav1.CreateOptions{}) + framework.ExpectNoError(err) + } + }) + + ginkgo.AfterEach(func() { + ginkgo.By("Emitting Shutdown false signal; cancelling the shutdown") + err := emitSignalPrepareForShutdown(false) + framework.ExpectNoError(err) + }) + + ginkgo.It("should be able to gracefully shutdown pods with various grace periods", func() { + nodeName := getNodeName(f) + nodeSelector := fields.Set{ + "spec.nodeName": nodeName, + }.AsSelector().String() + + // Define test pods + pods := []*v1.Pod{ + getGracePeriodOverrideTestPod("period-120", nodeName, 120, ""), + getGracePeriodOverrideTestPod("period-5", nodeName, 5, ""), + getGracePeriodOverrideTestPod("period-c-120", nodeName, 120, customClassC.Name), + getGracePeriodOverrideTestPod("period-c-5", nodeName, 5, customClassC.Name), + getGracePeriodOverrideTestPod("period-b-120", nodeName, 120, customClassB.Name), + getGracePeriodOverrideTestPod("period-b-5", nodeName, 5, customClassB.Name), + getGracePeriodOverrideTestPod("period-a-120", nodeName, 120, customClassA.Name), + getGracePeriodOverrideTestPod("period-a-5", nodeName, 5, customClassA.Name), + getGracePeriodOverrideTestPod("period-critical-120", nodeName, 120, 
scheduling.SystemNodeCritical), + getGracePeriodOverrideTestPod("period-critical-5", nodeName, 5, scheduling.SystemNodeCritical), + } + + // Expected down steps + downSteps := [][]string{ + { + "period-5", "period-120", + }, + { + "period-5", "period-120", + "period-c-5", "period-c-120", + }, + { + "period-5", "period-120", + "period-c-5", "period-c-120", + "period-b-5", "period-b-120", + }, + { + "period-5", "period-120", + "period-c-5", "period-c-120", + "period-b-5", "period-b-120", + "period-a-5", "period-a-120", + }, + { + "period-5", "period-120", + "period-c-5", "period-c-120", + "period-b-5", "period-b-120", + "period-a-5", "period-a-120", + "period-critical-5", "period-critical-120", + }, + } + + ginkgo.By("Creating batch pods") + f.PodClient().CreateBatch(pods) + + list, err := f.PodClient().List(context.TODO(), metav1.ListOptions{ + FieldSelector: nodeSelector, + }) + framework.ExpectNoError(err) + framework.ExpectEqual(len(list.Items), len(pods), "the number of pods is not as expected") + + ginkgo.By("Verifying batch pods are running") + for _, pod := range list.Items { + if podReady, err := testutils.PodRunningReady(&pod); err != nil || !podReady { + framework.Failf("Failed to start batch pod: %v", pod.Name) + } + } + + ginkgo.By("Emitting shutdown signal") + err = emitSignalPrepareForShutdown(true) + framework.ExpectNoError(err) + + ginkgo.By("Verifying that pods are shutdown") + + for _, step := range downSteps { + gomega.Eventually(func() error { + list, err = f.PodClient().List(context.TODO(), metav1.ListOptions{ + FieldSelector: nodeSelector, + }) + if err != nil { + return err + } + framework.ExpectEqual(len(list.Items), len(pods), "the number of pods is not as expected") + for _, pod := range list.Items { + shouldShutdown := false + for _, podName := range step { + if podName == pod.Name { + shouldShutdown = true + break + } + } + if !shouldShutdown { + if pod.Status.Phase != v1.PodRunning { + framework.Logf("Expecting pod to be running, but it's not currently. Pod: %q, Pod Status Phase: %q, Pod Status Reason: %q", pod.Name, pod.Status.Phase, pod.Status.Reason) + return fmt.Errorf("pod should not be shutdown, phase: %s, reason: %s", pod.Status.Phase, pod.Status.Reason) + } + } else { + if pod.Status.Reason != podShutdownReason { + framework.Logf("Expecting pod to be shutdown, but it's not currently. 
Pod: %q, Pod Status Phase: %q, Pod Status Reason: %q", pod.Name, pod.Status.Phase, pod.Status.Reason) + for _, item := range list.Items { + framework.Logf("DEBUG %s, %s, %s", item.Name, item.Status.Phase, pod.Status.Reason) + } + return fmt.Errorf("pod should be shutdown, reason: %s", pod.Status.Reason) + } + } + } + return nil + }, podStatusUpdateTimeout, pollInterval).Should(gomega.BeNil()) + } + }) + }) }) -func getGracePeriodOverrideTestPod(name string, node string, gracePeriod int64, critical bool) *v1.Pod { +func getPriorityClass(name string, value int32) *schedulingv1.PriorityClass { + priority := &schedulingv1.PriorityClass{ + TypeMeta: metav1.TypeMeta{ + Kind: "PriorityClass", + APIVersion: "scheduling.k8s.io/v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Value: value, + } + return priority +} +func getGracePeriodOverrideTestPod(name string, node string, gracePeriod int64, priorityClassName string) *v1.Pod { pod := &v1.Pod{ TypeMeta: metav1.TypeMeta{ Kind: "Pod", @@ -238,14 +425,14 @@ while true; do sleep 5; done NodeName: node, }, } - if critical { + if priorityClassName == scheduling.SystemNodeCritical { pod.ObjectMeta.Annotations = map[string]string{ kubelettypes.ConfigSourceAnnotationKey: kubelettypes.FileSource, } - pod.Spec.PriorityClassName = scheduling.SystemNodeCritical - + pod.Spec.PriorityClassName = priorityClassName framework.ExpectEqual(kubelettypes.IsCriticalPod(pod), true, "pod should be a critical pod") } else { + pod.Spec.PriorityClassName = priorityClassName framework.ExpectEqual(kubelettypes.IsCriticalPod(pod), false, "pod should not be a critical pod") } return pod From 7c656b55e49078918bebb7b58d8b9463c7f79fa3 Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Fri, 12 Nov 2021 16:50:18 +0800 Subject: [PATCH 5/5] Update shutdown cases --- test/e2e_node/node_shutdown_linux_test.go | 35 ++++++++++------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/test/e2e_node/node_shutdown_linux_test.go b/test/e2e_node/node_shutdown_linux_test.go index ff616aeb949..eca0e681513 100644 --- a/test/e2e_node/node_shutdown_linux_test.go +++ b/test/e2e_node/node_shutdown_linux_test.go @@ -281,44 +281,39 @@ var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShut // Define test pods pods := []*v1.Pod{ - getGracePeriodOverrideTestPod("period-120", nodeName, 120, ""), getGracePeriodOverrideTestPod("period-5", nodeName, 5, ""), - getGracePeriodOverrideTestPod("period-c-120", nodeName, 120, customClassC.Name), getGracePeriodOverrideTestPod("period-c-5", nodeName, 5, customClassC.Name), - getGracePeriodOverrideTestPod("period-b-120", nodeName, 120, customClassB.Name), getGracePeriodOverrideTestPod("period-b-5", nodeName, 5, customClassB.Name), - getGracePeriodOverrideTestPod("period-a-120", nodeName, 120, customClassA.Name), getGracePeriodOverrideTestPod("period-a-5", nodeName, 5, customClassA.Name), - getGracePeriodOverrideTestPod("period-critical-120", nodeName, 120, scheduling.SystemNodeCritical), getGracePeriodOverrideTestPod("period-critical-5", nodeName, 5, scheduling.SystemNodeCritical), } // Expected down steps downSteps := [][]string{ { - "period-5", "period-120", + "period-5", }, { - "period-5", "period-120", - "period-c-5", "period-c-120", + "period-5", + "period-c-5", }, { - "period-5", "period-120", - "period-c-5", "period-c-120", - "period-b-5", "period-b-120", + "period-5", + "period-c-5", + "period-b-5", }, { - "period-5", "period-120", - "period-c-5", "period-c-120", - "period-b-5", "period-b-120", - 
"period-a-5", "period-a-120", + "period-5", + "period-c-5", + "period-b-5", + "period-a-5", }, { - "period-5", "period-120", - "period-c-5", "period-c-120", - "period-b-5", "period-b-120", - "period-a-5", "period-a-120", - "period-critical-5", "period-critical-120", + "period-5", + "period-c-5", + "period-b-5", + "period-a-5", + "period-critical-5", }, }