Refactor node shutdown manager

Signed-off-by: Maksym Pavlenko <pavlenko.maksym@gmail.com>

Parent: 352056f09d
Commit: 449f86b0ba
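This refactor collapses the node shutdown manager's two return values into one: the Manager interface now embeds lifecycle.PodAdmitHandler, NewManager returns a single value, and the pod-killing logic moves out of the Linux-only managerImpl into a new platform-independent podManager helper. A condensed caller-side sketch of the change (the Config literal is elided; the real wiring is in the hunks below):

    // Before: two values, the second registered as the pod admit handler.
    shutdownManager, shutdownAdmitHandler := nodeshutdown.NewManager(&nodeshutdown.Config{ /* ... */ })
    klet.admitHandlers.AddPodAdmitHandler(shutdownAdmitHandler)

    // After: one value that satisfies lifecycle.PodAdmitHandler directly.
    shutdownManager := nodeshutdown.NewManager(&nodeshutdown.Config{ /* ... */ })
    klet.admitHandlers.AddPodAdmitHandler(shutdownManager)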
@@ -929,7 +929,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
         util.SetNodeOwnerFunc(klet.heartbeatClient, string(klet.nodeName)))
 
     // setup node shutdown manager
-    shutdownManager, shutdownAdmitHandler := nodeshutdown.NewManager(&nodeshutdown.Config{
+    shutdownManager := nodeshutdown.NewManager(&nodeshutdown.Config{
         Logger:        logger,
         ProbeManager:  klet.probeManager,
         VolumeManager: klet.volumeManager,

@@ -948,7 +948,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
     if err != nil {
         return nil, fmt.Errorf("create user namespace manager: %w", err)
     }
-    klet.admitHandlers.AddPodAdmitHandler(shutdownAdmitHandler)
+    klet.admitHandlers.AddPodAdmitHandler(shutdownManager)
 
     // Finally, put the most recent version of the config on the Kubelet, so
     // people can see how it was configured.

@@ -351,7 +351,7 @@ func newTestKubeletWithImageList(
     kubelet.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler)
 
     // setup shutdown manager
-    shutdownManager, shutdownAdmitHandler := nodeshutdown.NewManager(&nodeshutdown.Config{
+    shutdownManager := nodeshutdown.NewManager(&nodeshutdown.Config{
         Logger:       logger,
         ProbeManager: kubelet.probeManager,
         Recorder:     fakeRecorder,

@@ -363,7 +363,7 @@ func newTestKubeletWithImageList(
         ShutdownGracePeriodCriticalPods: 0,
     })
     kubelet.shutdownManager = shutdownManager
-    kubelet.admitHandlers.AddPodAdmitHandler(shutdownAdmitHandler)
+    kubelet.admitHandlers.AddPodAdmitHandler(shutdownManager)
 
     // Add this as cleanup predicate pod admitter
     kubelet.admitHandlers.AddPodAdmitHandler(lifecycle.NewPredicateAdmitHandler(kubelet.getNodeAnyWay, lifecycle.NewAdmissionFailureHandlerStub(), kubelet.containerManager.UpdatePluginResources))
@@ -17,11 +17,19 @@ limitations under the License.
 package nodeshutdown
 
 import (
+    "context"
+    "fmt"
+    "sort"
+    "sync"
     "time"
 
     v1 "k8s.io/api/core/v1"
+    utilfeature "k8s.io/apiserver/pkg/util/feature"
     "k8s.io/client-go/tools/record"
     "k8s.io/klog/v2"
+    podutil "k8s.io/kubernetes/pkg/api/v1/pod"
+    "k8s.io/kubernetes/pkg/apis/scheduling"
+    "k8s.io/kubernetes/pkg/features"
     kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
     "k8s.io/kubernetes/pkg/kubelet/eviction"
     "k8s.io/kubernetes/pkg/kubelet/lifecycle"

@@ -32,6 +40,8 @@ import (
 
 // Manager interface provides methods for Kubelet to manage node shutdown.
 type Manager interface {
+    lifecycle.PodAdmitHandler
+
     Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult
     Start() error
     ShutdownStatus() error

@@ -71,3 +81,211 @@ func (managerStub) Start() error {
 func (managerStub) ShutdownStatus() error {
     return nil
 }
+
+const (
+    nodeShutdownReason  = "Terminated"
+    nodeShutdownMessage = "Pod was terminated in response to imminent node shutdown."
+)
+
+// podManager is responsible for killing active pods by priority.
+type podManager struct {
+    logger                           klog.Logger
+    shutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority
+    clock                            clock.Clock
+    killPodFunc                      eviction.KillPodFunc
+    volumeManager                    volumemanager.VolumeManager
+}
+
+func newPodManager(conf *Config) *podManager {
+    shutdownGracePeriodByPodPriority := conf.ShutdownGracePeriodByPodPriority
+
+    // Migration from the original configuration
+    if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority) ||
+        len(shutdownGracePeriodByPodPriority) == 0 {
+        shutdownGracePeriodByPodPriority = migrateConfig(conf.ShutdownGracePeriodRequested, conf.ShutdownGracePeriodCriticalPods)
+    }
+
+    // Sort by priority from low to high
+    sort.Slice(shutdownGracePeriodByPodPriority, func(i, j int) bool {
+        return shutdownGracePeriodByPodPriority[i].Priority < shutdownGracePeriodByPodPriority[j].Priority
+    })
+
+    if conf.Clock == nil {
+        conf.Clock = clock.RealClock{}
+    }
+
+    return &podManager{
+        logger:                           conf.Logger,
+        shutdownGracePeriodByPodPriority: shutdownGracePeriodByPodPriority,
+        clock:                            conf.Clock,
+        killPodFunc:                      conf.KillPodFunc,
+        volumeManager:                    conf.VolumeManager,
+    }
+}
+
+// killPods terminates pods by priority.
+func (m *podManager) killPods(activePods []*v1.Pod) error {
+    groups := groupByPriority(m.shutdownGracePeriodByPodPriority, activePods)
+    for _, group := range groups {
+        // If there are no pods in a particular range,
+        // then do not wait for pods in that priority range.
+        if len(group.Pods) == 0 {
+            continue
+        }
+
+        var wg sync.WaitGroup
+        wg.Add(len(group.Pods))
+        for _, pod := range group.Pods {
+            go func(pod *v1.Pod, group podShutdownGroup) {
+                defer wg.Done()
+
+                gracePeriodOverride := group.ShutdownGracePeriodSeconds
+
+                // If the pod's spec specifies a termination gracePeriod which is less than the gracePeriodOverride calculated, use the pod spec termination gracePeriod.
+                if pod.Spec.TerminationGracePeriodSeconds != nil && *pod.Spec.TerminationGracePeriodSeconds <= gracePeriodOverride {
+                    gracePeriodOverride = *pod.Spec.TerminationGracePeriodSeconds
+                }
+
+                m.logger.V(1).Info("Shutdown manager killing pod with gracePeriod", "pod", klog.KObj(pod), "gracePeriod", gracePeriodOverride)
+
+                if err := m.killPodFunc(pod, false, &gracePeriodOverride, func(status *v1.PodStatus) {
+                    // set the pod status to failed (unless it was already in a successful terminal phase)
+                    if status.Phase != v1.PodSucceeded {
+                        status.Phase = v1.PodFailed
+                    }
+                    status.Message = nodeShutdownMessage
+                    status.Reason = nodeShutdownReason
+                    podutil.UpdatePodCondition(status, &v1.PodCondition{
+                        Type:    v1.DisruptionTarget,
+                        Status:  v1.ConditionTrue,
+                        Reason:  v1.PodReasonTerminationByKubelet,
+                        Message: nodeShutdownMessage,
+                    })
+                }); err != nil {
+                    m.logger.V(1).Info("Shutdown manager failed killing pod", "pod", klog.KObj(pod), "err", err)
+                } else {
+                    m.logger.V(1).Info("Shutdown manager finished killing pod", "pod", klog.KObj(pod))
+                }
+            }(pod, group)
+        }
+
+        // This duration determines how long the shutdown manager will wait for the pods in this group
+        // to terminate before proceeding to the next group.
+        var groupTerminationWaitDuration = time.Duration(group.ShutdownGracePeriodSeconds) * time.Second
+        var (
+            doneCh         = make(chan struct{})
+            timer          = m.clock.NewTimer(groupTerminationWaitDuration)
+            ctx, ctxCancel = context.WithTimeout(context.Background(), groupTerminationWaitDuration)
+        )
+        go func() {
+            defer close(doneCh)
+            defer ctxCancel()
+            wg.Wait()
+            // The signal to kill a Pod was sent successfully to all the pods,
+            // let's wait until all the volumes are unmounted from all the pods before
+            // continuing to the next group. This is done so that the CSI Driver (assuming
+            // that it's part of the highest group) has a chance to perform unmounts.
+            if err := m.volumeManager.WaitForAllPodsUnmount(ctx, group.Pods); err != nil {
+                var podIdentifiers []string
+                for _, pod := range group.Pods {
+                    podIdentifiers = append(podIdentifiers, fmt.Sprintf("%s/%s", pod.Namespace, pod.Name))
+                }
+
+                // Waiting for volume teardown is done on a best basis effort,
+                // report an error and continue.
+                //
+                // Depending on the user provided kubelet configuration value
+                // either the `timer` will tick and we'll continue to shutdown the next group, or,
+                // WaitForAllPodsUnmount will timeout, therefore this goroutine
+                // will close doneCh and we'll continue to shutdown the next group.
+                m.logger.Error(err, "Failed while waiting for all the volumes belonging to Pods in this group to unmount", "pods", podIdentifiers)
+            }
+        }()
+
+        select {
+        case <-doneCh:
+            timer.Stop()
+            m.logger.V(1).Info("Done waiting for all pods in group to terminate", "gracePeriod", group.ShutdownGracePeriodSeconds, "priority", group.Priority)
+        case <-timer.C():
+            ctxCancel()
+            m.logger.V(1).Info("Shutdown manager pod killing time out", "gracePeriod", group.ShutdownGracePeriodSeconds, "priority", group.Priority)
+        }
+    }
+
+    return nil
+}
+
+func (m *podManager) periodRequested() time.Duration {
+    var sum int64
+    for _, period := range m.shutdownGracePeriodByPodPriority {
+        sum += period.ShutdownGracePeriodSeconds
+    }
+    return time.Duration(sum) * time.Second
+}
+
+func migrateConfig(shutdownGracePeriodRequested, shutdownGracePeriodCriticalPods time.Duration) []kubeletconfig.ShutdownGracePeriodByPodPriority {
+    if shutdownGracePeriodRequested == 0 {
+        return nil
+    }
+    defaultPriority := shutdownGracePeriodRequested - shutdownGracePeriodCriticalPods
+    if defaultPriority < 0 {
+        return nil
+    }
+    criticalPriority := shutdownGracePeriodRequested - defaultPriority
+    if criticalPriority < 0 {
+        return nil
+    }
+    return []kubeletconfig.ShutdownGracePeriodByPodPriority{
+        {
+            Priority:                   scheduling.DefaultPriorityWhenNoDefaultClassExists,
+            ShutdownGracePeriodSeconds: int64(defaultPriority / time.Second),
+        },
+        {
+            Priority:                   scheduling.SystemCriticalPriority,
+            ShutdownGracePeriodSeconds: int64(criticalPriority / time.Second),
+        },
+    }
+}
+
+type podShutdownGroup struct {
+    kubeletconfig.ShutdownGracePeriodByPodPriority
+    Pods []*v1.Pod
+}
+
+func groupByPriority(shutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority, pods []*v1.Pod) []podShutdownGroup {
+    groups := make([]podShutdownGroup, 0, len(shutdownGracePeriodByPodPriority))
+    for _, period := range shutdownGracePeriodByPodPriority {
+        groups = append(groups, podShutdownGroup{
+            ShutdownGracePeriodByPodPriority: period,
+        })
+    }
+
+    for _, pod := range pods {
+        var priority int32
+        if pod.Spec.Priority != nil {
+            priority = *pod.Spec.Priority
+        }
+
+        // Find the group index according to the priority.
+        index := sort.Search(len(groups), func(i int) bool {
+            return groups[i].Priority >= priority
+        })
+
+        // 1. Those higher than the highest priority default to the highest priority
+        // 2. Those lower than the lowest priority default to the lowest priority
+        // 3. Those boundary priority default to the lower priority
+        // if priority of pod is:
+        // groups[index-1].Priority <= pod priority < groups[index].Priority
+        // in which case we want to pick lower one (i.e index-1)
+        if index == len(groups) {
+            index = len(groups) - 1
+        } else if index < 0 {
+            index = 0
+        } else if index > 0 && groups[index].Priority > priority {
+            index--
+        }
+
+        groups[index].Pods = append(groups[index].Pods, pod)
+    }
+    return groups
+}
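For orientation, a minimal sketch of how the new podManager helper is driven; the field names come from the hunk above, while the concrete values are illustrative placeholders:

    // newPodManager migrates the legacy ShutdownGracePeriodRequested /
    // ShutdownGracePeriodCriticalPods pair into shutdownGracePeriodByPodPriority
    // (when the GracefulNodeShutdownBasedOnPodPriority gate is off or the list is
    // empty), sorts the entries by priority, and defaults the clock to clock.RealClock{}.
    pm := newPodManager(&Config{
        Logger:                          logger,
        KillPodFunc:                     killPodFunc,
        VolumeManager:                   volumeManager,
        ShutdownGracePeriodRequested:    300 * time.Second, // migrated: 180s for default-priority pods
        ShutdownGracePeriodCriticalPods: 120 * time.Second, //           120s for system-critical pods
    })

    // On a shutdown event the Linux manager now simply delegates (see the hunks below):
    //     return m.podManager.killPods(activePods)
    // killPods walks the priority groups from lowest to highest, giving each group
    // up to its ShutdownGracePeriodSeconds to terminate and unmount its volumes.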
@@ -21,10 +21,8 @@ limitations under the License.
 package nodeshutdown
 
 import (
-    "context"
     "fmt"
     "path/filepath"
-    "sort"
     "sync"
     "time"
 

@@ -32,23 +30,16 @@ import (
     utilfeature "k8s.io/apiserver/pkg/util/feature"
     "k8s.io/client-go/tools/record"
     "k8s.io/klog/v2"
-    podutil "k8s.io/kubernetes/pkg/api/v1/pod"
-    "k8s.io/kubernetes/pkg/apis/scheduling"
     "k8s.io/kubernetes/pkg/features"
-    kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
     kubeletevents "k8s.io/kubernetes/pkg/kubelet/events"
     "k8s.io/kubernetes/pkg/kubelet/eviction"
     "k8s.io/kubernetes/pkg/kubelet/lifecycle"
     "k8s.io/kubernetes/pkg/kubelet/metrics"
     "k8s.io/kubernetes/pkg/kubelet/nodeshutdown/systemd"
     "k8s.io/kubernetes/pkg/kubelet/prober"
-    "k8s.io/kubernetes/pkg/kubelet/volumemanager"
-    "k8s.io/utils/clock"
 )
 
 const (
-    nodeShutdownReason             = "Terminated"
-    nodeShutdownMessage            = "Pod was terminated in response to imminent node shutdown."
     nodeShutdownNotAdmittedReason  = "NodeShutdown"
     nodeShutdownNotAdmittedMessage = "Pod was rejected as the node is shutting down."
     dbusReconnectPeriod            = 1 * time.Second

@@ -75,12 +66,7 @@ type managerImpl struct {
     nodeRef      *v1.ObjectReference
     probeManager prober.Manager
 
-    volumeManager volumemanager.VolumeManager
-
-    shutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority
-
     getPods        eviction.ActivePodsFunc
-    killPodFunc    eviction.KillPodFunc
     syncNodeStatus func()
 
     dbusCon dbusInhibiter

@@ -88,52 +74,35 @@ type managerImpl struct {
 
     nodeShuttingDownMutex sync.Mutex
     nodeShuttingDownNow   bool
-    clock                 clock.Clock
+    podManager            *podManager
 
     enableMetrics bool
     storage       storage
 }
 
 // NewManager returns a new node shutdown manager.
-func NewManager(conf *Config) (Manager, lifecycle.PodAdmitHandler) {
+func NewManager(conf *Config) Manager {
     if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdown) {
         m := managerStub{}
-        return m, m
+        return m
     }
 
-    shutdownGracePeriodByPodPriority := conf.ShutdownGracePeriodByPodPriority
-    // Migration from the original configuration
-    if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority) ||
-        len(shutdownGracePeriodByPodPriority) == 0 {
-        shutdownGracePeriodByPodPriority = migrateConfig(conf.ShutdownGracePeriodRequested, conf.ShutdownGracePeriodCriticalPods)
-    }
+    podManager := newPodManager(conf)
 
     // Disable if the configuration is empty
-    if len(shutdownGracePeriodByPodPriority) == 0 {
+    if len(podManager.shutdownGracePeriodByPodPriority) == 0 {
         m := managerStub{}
-        return m, m
+        return m
     }
 
-    // Sort by priority from low to high
-    sort.Slice(shutdownGracePeriodByPodPriority, func(i, j int) bool {
-        return shutdownGracePeriodByPodPriority[i].Priority < shutdownGracePeriodByPodPriority[j].Priority
-    })
-
-    if conf.Clock == nil {
-        conf.Clock = clock.RealClock{}
-    }
-
     manager := &managerImpl{
         logger:         conf.Logger,
         probeManager:   conf.ProbeManager,
         recorder:       conf.Recorder,
-        volumeManager:  conf.VolumeManager,
         nodeRef:        conf.NodeRef,
         getPods:        conf.GetPodsFunc,
-        killPodFunc:    conf.KillPodFunc,
         syncNodeStatus: conf.SyncNodeStatusFunc,
-        shutdownGracePeriodByPodPriority: shutdownGracePeriodByPodPriority,
+        podManager:     podManager,
-        clock:          conf.Clock,
         enableMetrics:  utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdownBasedOnPodPriority),
         storage: localStorage{
             Path: filepath.Join(conf.StateDirectory, localStorageStateFile),

@@ -142,9 +111,9 @@ func NewManager(conf *Config) (Manager, lifecycle.PodAdmitHandler) {
     manager.logger.Info("Creating node shutdown manager",
         "shutdownGracePeriodRequested", conf.ShutdownGracePeriodRequested,
         "shutdownGracePeriodCriticalPods", conf.ShutdownGracePeriodCriticalPods,
-        "shutdownGracePeriodByPodPriority", shutdownGracePeriodByPodPriority,
+        "shutdownGracePeriodByPodPriority", podManager.shutdownGracePeriodByPodPriority,
     )
-    return manager, manager
+    return manager
 }
 
 // Admit rejects all pods if node is shutting

@@ -217,7 +186,7 @@ func (m *managerImpl) start() (chan struct{}, error) {
     }
 
     // If the logind's InhibitDelayMaxUSec as configured in (logind.conf) is less than periodRequested, attempt to update the value to periodRequested.
-    if periodRequested := m.periodRequested(); periodRequested > currentInhibitDelay {
+    if periodRequested := m.podManager.periodRequested(); periodRequested > currentInhibitDelay {
         err := m.dbusCon.OverrideInhibitDelay(periodRequested)
         if err != nil {
             return nil, fmt.Errorf("unable to override inhibit delay by shutdown manager: %v", err)
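periodRequested, now on podManager, is still just the sum of the configured per-priority grace periods. A small worked sketch under the 300s/120s configuration used in the tests (illustrative only, mirroring the code above):

    periods := []kubeletconfig.ShutdownGracePeriodByPodPriority{
        {Priority: scheduling.DefaultPriorityWhenNoDefaultClassExists, ShutdownGracePeriodSeconds: 180},
        {Priority: scheduling.SystemCriticalPriority, ShutdownGracePeriodSeconds: 120},
    }
    var sum int64
    for _, p := range periods {
        sum += p.ShutdownGracePeriodSeconds // 180 + 120 = 300
    }
    periodRequested := time.Duration(sum) * time.Second // 5m0s
    // start() compares this against logind's current InhibitDelayMaxUSec and calls
    // OverrideInhibitDelay when the requested period is larger.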
@@ -356,166 +325,5 @@ func (m *managerImpl) processShutdownEvent() error {
         }()
     }
 
-    groups := groupByPriority(m.shutdownGracePeriodByPodPriority, activePods)
+    return m.podManager.killPods(activePods)
-    for _, group := range groups {
-        // If there are no pods in a particular range,
-        // then do not wait for pods in that priority range.
-        if len(group.Pods) == 0 {
-            continue
-        }
-
-        var wg sync.WaitGroup
-        wg.Add(len(group.Pods))
-        for _, pod := range group.Pods {
-            go func(pod *v1.Pod, group podShutdownGroup) {
-                defer wg.Done()
-
-                gracePeriodOverride := group.ShutdownGracePeriodSeconds
-
-                // If the pod's spec specifies a termination gracePeriod which is less than the gracePeriodOverride calculated, use the pod spec termination gracePeriod.
-                if pod.Spec.TerminationGracePeriodSeconds != nil && *pod.Spec.TerminationGracePeriodSeconds <= gracePeriodOverride {
-                    gracePeriodOverride = *pod.Spec.TerminationGracePeriodSeconds
-                }
-
-                m.logger.V(1).Info("Shutdown manager killing pod with gracePeriod", "pod", klog.KObj(pod), "gracePeriod", gracePeriodOverride)
-
-                if err := m.killPodFunc(pod, false, &gracePeriodOverride, func(status *v1.PodStatus) {
-                    // set the pod status to failed (unless it was already in a successful terminal phase)
-                    if status.Phase != v1.PodSucceeded {
-                        status.Phase = v1.PodFailed
-                    }
-                    status.Message = nodeShutdownMessage
-                    status.Reason = nodeShutdownReason
-                    podutil.UpdatePodCondition(status, &v1.PodCondition{
-                        Type:    v1.DisruptionTarget,
-                        Status:  v1.ConditionTrue,
-                        Reason:  v1.PodReasonTerminationByKubelet,
-                        Message: nodeShutdownMessage,
-                    })
-                }); err != nil {
-                    m.logger.V(1).Info("Shutdown manager failed killing pod", "pod", klog.KObj(pod), "err", err)
-                } else {
-                    m.logger.V(1).Info("Shutdown manager finished killing pod", "pod", klog.KObj(pod))
-                }
-            }(pod, group)
-        }
-
-        // This duration determines how long the shutdown manager will wait for the pods in this group
-        // to terminate before proceeding to the next group.
-        var groupTerminationWaitDuration = time.Duration(group.ShutdownGracePeriodSeconds) * time.Second
-        var (
-            doneCh         = make(chan struct{})
-            timer          = m.clock.NewTimer(groupTerminationWaitDuration)
-            ctx, ctxCancel = context.WithTimeout(context.Background(), groupTerminationWaitDuration)
-        )
-        go func() {
-            defer close(doneCh)
-            defer ctxCancel()
-            wg.Wait()
-            // The signal to kill a Pod was sent successfully to all the pods,
-            // let's wait until all the volumes are unmounted from all the pods before
-            // continuing to the next group. This is done so that the CSI Driver (assuming
-            // that it's part of the highest group) has a chance to perform unmounts.
-            if err := m.volumeManager.WaitForAllPodsUnmount(ctx, group.Pods); err != nil {
-                var podIdentifiers []string
-                for _, pod := range group.Pods {
-                    podIdentifiers = append(podIdentifiers, fmt.Sprintf("%s/%s", pod.Namespace, pod.Name))
-                }
-
-                // Waiting for volume teardown is done on a best basis effort,
-                // report an error and continue.
-                //
-                // Depending on the user provided kubelet configuration value
-                // either the `timer` will tick and we'll continue to shutdown the next group, or,
-                // WaitForAllPodsUnmount will timeout, therefore this goroutine
-                // will close doneCh and we'll continue to shutdown the next group.
-                m.logger.Error(err, "Failed while waiting for all the volumes belonging to Pods in this group to unmount", "pods", podIdentifiers)
-            }
-        }()
-
-        select {
-        case <-doneCh:
-            timer.Stop()
-        case <-timer.C():
-            ctxCancel()
-            m.logger.V(1).Info("Shutdown manager pod killing time out", "gracePeriod", group.ShutdownGracePeriodSeconds, "priority", group.Priority)
-        }
-    }
-
-    return nil
-}
-
-func (m *managerImpl) periodRequested() time.Duration {
-    var sum int64
-    for _, period := range m.shutdownGracePeriodByPodPriority {
-        sum += period.ShutdownGracePeriodSeconds
-    }
-    return time.Duration(sum) * time.Second
-}
-
-func migrateConfig(shutdownGracePeriodRequested, shutdownGracePeriodCriticalPods time.Duration) []kubeletconfig.ShutdownGracePeriodByPodPriority {
-    if shutdownGracePeriodRequested == 0 {
-        return nil
-    }
-    defaultPriority := shutdownGracePeriodRequested - shutdownGracePeriodCriticalPods
-    if defaultPriority < 0 {
-        return nil
-    }
-    criticalPriority := shutdownGracePeriodRequested - defaultPriority
-    if criticalPriority < 0 {
-        return nil
-    }
-    return []kubeletconfig.ShutdownGracePeriodByPodPriority{
-        {
-            Priority:                   scheduling.DefaultPriorityWhenNoDefaultClassExists,
-            ShutdownGracePeriodSeconds: int64(defaultPriority / time.Second),
-        },
-        {
-            Priority:                   scheduling.SystemCriticalPriority,
-            ShutdownGracePeriodSeconds: int64(criticalPriority / time.Second),
-        },
-    }
-}
-
-func groupByPriority(shutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority, pods []*v1.Pod) []podShutdownGroup {
-    groups := make([]podShutdownGroup, 0, len(shutdownGracePeriodByPodPriority))
-    for _, period := range shutdownGracePeriodByPodPriority {
-        groups = append(groups, podShutdownGroup{
-            ShutdownGracePeriodByPodPriority: period,
-        })
-    }
-
-    for _, pod := range pods {
-        var priority int32
-        if pod.Spec.Priority != nil {
-            priority = *pod.Spec.Priority
-        }
-
-        // Find the group index according to the priority.
-        index := sort.Search(len(groups), func(i int) bool {
-            return groups[i].Priority >= priority
-        })
-
-        // 1. Those higher than the highest priority default to the highest priority
-        // 2. Those lower than the lowest priority default to the lowest priority
-        // 3. Those boundary priority default to the lower priority
-        // if priority of pod is:
-        // groups[index-1].Priority <= pod priority < groups[index].Priority
-        // in which case we want to pick lower one (i.e index-1)
-        if index == len(groups) {
-            index = len(groups) - 1
-        } else if index < 0 {
-            index = 0
-        } else if index > 0 && groups[index].Priority > priority {
-            index--
-        }
-
-        groups[index].Pods = append(groups[index].Pods, pod)
-    }
-    return groups
-}
-
-type podShutdownGroup struct {
-    kubeletconfig.ShutdownGracePeriodByPodPriority
-    Pods []*v1.Pod
-}
 }
@@ -92,19 +92,6 @@ func (f *fakeDbus) OverrideInhibitDelay(inhibitDelayMax time.Duration) error {
     return nil
 }
 
-func makePod(name string, priority int32, terminationGracePeriod *int64) *v1.Pod {
-    return &v1.Pod{
-        ObjectMeta: metav1.ObjectMeta{
-            Name: name,
-            UID:  types.UID(name),
-        },
-        Spec: v1.PodSpec{
-            Priority:                      &priority,
-            TerminationGracePeriodSeconds: terminationGracePeriod,
-        },
-    }
-}
-
 func TestManager(t *testing.T) {
     systemDbusTmp := systemDbus
     defer func() {
@@ -352,7 +339,7 @@ func TestManager(t *testing.T) {
             fakeRecorder := &record.FakeRecorder{}
             fakeVolumeManager := volumemanager.NewFakeVolumeManager([]v1.UniqueVolumeName{}, 0, nil)
             nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
-            manager, _ := NewManager(&Config{
+            manager := NewManager(&Config{
                 Logger:        logger,
                 ProbeManager:  proberManager,
                 VolumeManager: fakeVolumeManager,

@@ -459,7 +446,7 @@ func TestFeatureEnabled(t *testing.T) {
             fakeVolumeManager := volumemanager.NewFakeVolumeManager([]v1.UniqueVolumeName{}, 0, nil)
             nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
 
-            manager, _ := NewManager(&Config{
+            manager := NewManager(&Config{
                 Logger:        logger,
                 ProbeManager:  proberManager,
                 VolumeManager: fakeVolumeManager,

@@ -517,7 +504,7 @@ func TestRestart(t *testing.T) {
     fakeRecorder := &record.FakeRecorder{}
     fakeVolumeManager := volumemanager.NewFakeVolumeManager([]v1.UniqueVolumeName{}, 0, nil)
     nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
-    manager, _ := NewManager(&Config{
+    manager := NewManager(&Config{
         Logger:        logger,
         ProbeManager:  proberManager,
         VolumeManager: fakeVolumeManager,
@@ -551,199 +538,6 @@ func TestRestart(t *testing.T) {
         }
     }
 }
-
-func Test_migrateConfig(t *testing.T) {
-    type shutdownConfig struct {
-        shutdownGracePeriodRequested    time.Duration
-        shutdownGracePeriodCriticalPods time.Duration
-    }
-    tests := []struct {
-        name string
-        args shutdownConfig
-        want []kubeletconfig.ShutdownGracePeriodByPodPriority
-    }{
-        {
-            name: "both shutdownGracePeriodRequested and shutdownGracePeriodCriticalPods",
-            args: shutdownConfig{
-                shutdownGracePeriodRequested:    300 * time.Second,
-                shutdownGracePeriodCriticalPods: 120 * time.Second,
-            },
-            want: []kubeletconfig.ShutdownGracePeriodByPodPriority{
-                {
-                    Priority:                   scheduling.DefaultPriorityWhenNoDefaultClassExists,
-                    ShutdownGracePeriodSeconds: 180,
-                },
-                {
-                    Priority:                   scheduling.SystemCriticalPriority,
-                    ShutdownGracePeriodSeconds: 120,
-                },
-            },
-        },
-        {
-            name: "only shutdownGracePeriodRequested",
-            args: shutdownConfig{
-                shutdownGracePeriodRequested:    100 * time.Second,
-                shutdownGracePeriodCriticalPods: 0 * time.Second,
-            },
-            want: []kubeletconfig.ShutdownGracePeriodByPodPriority{
-                {
-                    Priority:                   scheduling.DefaultPriorityWhenNoDefaultClassExists,
-                    ShutdownGracePeriodSeconds: 100,
-                },
-                {
-                    Priority:                   scheduling.SystemCriticalPriority,
-                    ShutdownGracePeriodSeconds: 0,
-                },
-            },
-        },
-        {
-            name: "empty configuration",
-            args: shutdownConfig{
-                shutdownGracePeriodRequested:    0 * time.Second,
-                shutdownGracePeriodCriticalPods: 0 * time.Second,
-            },
-            want: nil,
-        },
-        {
-            name: "wrong configuration",
-            args: shutdownConfig{
-                shutdownGracePeriodRequested:    1 * time.Second,
-                shutdownGracePeriodCriticalPods: 100 * time.Second,
-            },
-            want: nil,
-        },
-    }
-    for _, tt := range tests {
-        t.Run(tt.name, func(t *testing.T) {
-            if got := migrateConfig(tt.args.shutdownGracePeriodRequested, tt.args.shutdownGracePeriodCriticalPods); !assert.Equal(t, tt.want, got) {
-                t.Errorf("migrateConfig() = %v, want %v", got, tt.want)
-            }
-        })
-    }
-}
-
-func Test_groupByPriority(t *testing.T) {
-    type args struct {
-        shutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority
-        pods                             []*v1.Pod
-    }
-    tests := []struct {
-        name string
-        args args
-        want []podShutdownGroup
-    }{
-        {
-            name: "migrate config",
-            args: args{
-                shutdownGracePeriodByPodPriority: migrateConfig(300*time.Second /* shutdownGracePeriodRequested */, 120*time.Second /* shutdownGracePeriodCriticalPods */),
-                pods: []*v1.Pod{
-                    makePod("normal-pod", scheduling.DefaultPriorityWhenNoDefaultClassExists, nil),
-                    makePod("highest-user-definable-pod", scheduling.HighestUserDefinablePriority, nil),
-                    makePod("critical-pod", scheduling.SystemCriticalPriority, nil),
-                },
-            },
-            want: []podShutdownGroup{
-                {
-                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{
-                        Priority:                   scheduling.DefaultPriorityWhenNoDefaultClassExists,
-                        ShutdownGracePeriodSeconds: 180,
-                    },
-                    Pods: []*v1.Pod{
-                        makePod("normal-pod", scheduling.DefaultPriorityWhenNoDefaultClassExists, nil),
-                        makePod("highest-user-definable-pod", scheduling.HighestUserDefinablePriority, nil),
-                    },
-                },
-                {
-                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{
-                        Priority:                   scheduling.SystemCriticalPriority,
-                        ShutdownGracePeriodSeconds: 120,
-                    },
-                    Pods: []*v1.Pod{
-                        makePod("critical-pod", scheduling.SystemCriticalPriority, nil),
-                    },
-                },
-            },
-        },
-        {
-            name: "pod priority",
-            args: args{
-                shutdownGracePeriodByPodPriority: []kubeletconfig.ShutdownGracePeriodByPodPriority{
-                    {
-                        Priority:                   1,
-                        ShutdownGracePeriodSeconds: 10,
-                    },
-                    {
-                        Priority:                   2,
-                        ShutdownGracePeriodSeconds: 20,
-                    },
-                    {
-                        Priority:                   3,
-                        ShutdownGracePeriodSeconds: 30,
-                    },
-                    {
-                        Priority:                   4,
-                        ShutdownGracePeriodSeconds: 40,
-                    },
-                },
-                pods: []*v1.Pod{
-                    makePod("pod-0", 0, nil),
-                    makePod("pod-1", 1, nil),
-                    makePod("pod-2", 2, nil),
-                    makePod("pod-3", 3, nil),
-                    makePod("pod-4", 4, nil),
-                    makePod("pod-5", 5, nil),
-                },
-            },
-            want: []podShutdownGroup{
-                {
-                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{
-                        Priority:                   1,
-                        ShutdownGracePeriodSeconds: 10,
-                    },
-                    Pods: []*v1.Pod{
-                        makePod("pod-0", 0, nil),
-                        makePod("pod-1", 1, nil),
-                    },
-                },
-                {
-                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{
-                        Priority:                   2,
-                        ShutdownGracePeriodSeconds: 20,
-                    },
-                    Pods: []*v1.Pod{
-                        makePod("pod-2", 2, nil),
-                    },
-                },
-                {
-                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{
-                        Priority:                   3,
-                        ShutdownGracePeriodSeconds: 30,
-                    },
-                    Pods: []*v1.Pod{
-                        makePod("pod-3", 3, nil),
-                    },
-                },
-                {
-                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{
-                        Priority:                   4,
-                        ShutdownGracePeriodSeconds: 40,
-                    },
-                    Pods: []*v1.Pod{
-                        makePod("pod-4", 4, nil),
-                        makePod("pod-5", 5, nil),
-                    },
-                },
-            },
-        },
-    }
-    for _, tt := range tests {
-        t.Run(tt.name, func(t *testing.T) {
-            if got := groupByPriority(tt.args.shutdownGracePeriodByPodPriority, tt.args.pods); !assert.Equal(t, tt.want, got) {
-                t.Errorf("groupByPriority() = %v, want %v", got, tt.want)
-            }
-        })
-    }
-}
 
 func Test_managerImpl_processShutdownEvent(t *testing.T) {
     var (
         probeManager = probetest.FakeManager{}
@@ -819,19 +613,22 @@ func Test_managerImpl_processShutdownEvent(t *testing.T) {
             )
             m := &managerImpl{
                 logger:                logger,
-                volumeManager:         tt.fields.volumeManager,
                 recorder:              tt.fields.recorder,
                 nodeRef:               tt.fields.nodeRef,
                 probeManager:          tt.fields.probeManager,
-                shutdownGracePeriodByPodPriority: tt.fields.shutdownGracePeriodByPodPriority,
                 getPods:               tt.fields.getPods,
-                killPodFunc:           tt.fields.killPodFunc,
                 syncNodeStatus:        tt.fields.syncNodeStatus,
                 dbusCon:               tt.fields.dbusCon,
                 inhibitLock:           tt.fields.inhibitLock,
                 nodeShuttingDownMutex: sync.Mutex{},
                 nodeShuttingDownNow:   tt.fields.nodeShuttingDownNow,
+                podManager: &podManager{
+                    logger:                           logger,
+                    volumeManager:                    tt.fields.volumeManager,
+                    shutdownGracePeriodByPodPriority: tt.fields.shutdownGracePeriodByPodPriority,
+                    killPodFunc:                      tt.fields.killPodFunc,
                     clock:                            tt.fields.clock,
+                },
             }
             if err := m.processShutdownEvent(); (err != nil) != tt.wantErr {
                 t.Errorf("managerImpl.processShutdownEvent() error = %v, wantErr %v", err, tt.wantErr)
|
|||||||
logger := ktesting.NewLogger(t, ktesting.NewConfig(ktesting.BufferLogs(true)))
|
logger := ktesting.NewLogger(t, ktesting.NewConfig(ktesting.BufferLogs(true)))
|
||||||
m := &managerImpl{
|
m := &managerImpl{
|
||||||
logger: logger,
|
logger: logger,
|
||||||
volumeManager: fakeVolumeManager,
|
|
||||||
recorder: fakeRecorder,
|
recorder: fakeRecorder,
|
||||||
nodeRef: nodeRef,
|
nodeRef: nodeRef,
|
||||||
probeManager: probeManager,
|
probeManager: probeManager,
|
||||||
|
getPods: func() []*v1.Pod {
|
||||||
|
return []*v1.Pod{
|
||||||
|
makePod("test-pod", 1, nil),
|
||||||
|
}
|
||||||
|
},
|
||||||
|
syncNodeStatus: syncNodeStatus,
|
||||||
|
dbusCon: &fakeDbus{},
|
||||||
|
podManager: &podManager{
|
||||||
|
logger: logger,
|
||||||
|
volumeManager: fakeVolumeManager,
|
||||||
shutdownGracePeriodByPodPriority: []kubeletconfig.ShutdownGracePeriodByPodPriority{
|
shutdownGracePeriodByPodPriority: []kubeletconfig.ShutdownGracePeriodByPodPriority{
|
||||||
{
|
{
|
||||||
Priority: 1,
|
Priority: 1,
|
||||||
ShutdownGracePeriodSeconds: int64(shutdownGracePeriodSeconds),
|
ShutdownGracePeriodSeconds: int64(shutdownGracePeriodSeconds),
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
getPods: func() []*v1.Pod {
|
|
||||||
return []*v1.Pod{
|
|
||||||
makePod("test-pod", 1, nil),
|
|
||||||
}
|
|
||||||
},
|
|
||||||
killPodFunc: func(pod *v1.Pod, isEvicted bool, gracePeriodOverride *int64, fn func(*v1.PodStatus)) error {
|
killPodFunc: func(pod *v1.Pod, isEvicted bool, gracePeriodOverride *int64, fn func(*v1.PodStatus)) error {
|
||||||
return nil
|
return nil
|
||||||
},
|
},
|
||||||
syncNodeStatus: syncNodeStatus,
|
|
||||||
dbusCon: &fakeDbus{},
|
|
||||||
clock: fakeclock,
|
clock: fakeclock,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
start := fakeclock.Now()
|
start := fakeclock.Now()
|
||||||
|
@@ -19,12 +19,8 @@ limitations under the License.
 
 package nodeshutdown
 
-import (
-    "k8s.io/kubernetes/pkg/kubelet/lifecycle"
-)
-
 // NewManager returns a fake node shutdown manager for non linux platforms.
-func NewManager(conf *Config) (Manager, lifecycle.PodAdmitHandler) {
+func NewManager(conf *Config) Manager {
     m := managerStub{}
-    return m, m
+    return m
 }
pkg/kubelet/nodeshutdown/nodeshutdown_manager_test.go (new file, 235 lines)
@@ -0,0 +1,235 @@
+/*
+Copyright 2024 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package nodeshutdown
+
+import (
+    "testing"
+    "time"
+
+    "github.com/stretchr/testify/assert"
+    v1 "k8s.io/api/core/v1"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/types"
+    "k8s.io/kubernetes/pkg/apis/scheduling"
+    kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
+)
+
+func makePod(name string, priority int32, terminationGracePeriod *int64) *v1.Pod {
+    return &v1.Pod{
+        ObjectMeta: metav1.ObjectMeta{
+            Name: name,
+            UID:  types.UID(name),
+        },
+        Spec: v1.PodSpec{
+            Priority:                      &priority,
+            TerminationGracePeriodSeconds: terminationGracePeriod,
+        },
+    }
+}
+
+func Test_migrateConfig(t *testing.T) {
+    type shutdownConfig struct {
+        shutdownGracePeriodRequested    time.Duration
+        shutdownGracePeriodCriticalPods time.Duration
+    }
+    tests := []struct {
+        name string
+        args shutdownConfig
+        want []kubeletconfig.ShutdownGracePeriodByPodPriority
+    }{
+        {
+            name: "both shutdownGracePeriodRequested and shutdownGracePeriodCriticalPods",
+            args: shutdownConfig{
+                shutdownGracePeriodRequested:    300 * time.Second,
+                shutdownGracePeriodCriticalPods: 120 * time.Second,
+            },
+            want: []kubeletconfig.ShutdownGracePeriodByPodPriority{
+                {
+                    Priority:                   scheduling.DefaultPriorityWhenNoDefaultClassExists,
+                    ShutdownGracePeriodSeconds: 180,
+                },
+                {
+                    Priority:                   scheduling.SystemCriticalPriority,
+                    ShutdownGracePeriodSeconds: 120,
+                },
+            },
+        },
+        {
+            name: "only shutdownGracePeriodRequested",
+            args: shutdownConfig{
+                shutdownGracePeriodRequested:    100 * time.Second,
+                shutdownGracePeriodCriticalPods: 0 * time.Second,
+            },
+            want: []kubeletconfig.ShutdownGracePeriodByPodPriority{
+                {
+                    Priority:                   scheduling.DefaultPriorityWhenNoDefaultClassExists,
+                    ShutdownGracePeriodSeconds: 100,
+                },
+                {
+                    Priority:                   scheduling.SystemCriticalPriority,
+                    ShutdownGracePeriodSeconds: 0,
+                },
+            },
+        },
+        {
+            name: "empty configuration",
+            args: shutdownConfig{
+                shutdownGracePeriodRequested:    0 * time.Second,
+                shutdownGracePeriodCriticalPods: 0 * time.Second,
+            },
+            want: nil,
+        },
+        {
+            name: "wrong configuration",
+            args: shutdownConfig{
+                shutdownGracePeriodRequested:    1 * time.Second,
+                shutdownGracePeriodCriticalPods: 100 * time.Second,
+            },
+            want: nil,
+        },
+    }
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            if got := migrateConfig(tt.args.shutdownGracePeriodRequested, tt.args.shutdownGracePeriodCriticalPods); !assert.Equal(t, tt.want, got) {
+                t.Errorf("migrateConfig() = %v, want %v", got, tt.want)
+            }
+        })
+    }
+}
+
+func Test_groupByPriority(t *testing.T) {
+    type args struct {
+        shutdownGracePeriodByPodPriority []kubeletconfig.ShutdownGracePeriodByPodPriority
+        pods                             []*v1.Pod
+    }
+    tests := []struct {
+        name string
+        args args
+        want []podShutdownGroup
+    }{
+        {
+            name: "migrate config",
+            args: args{
+                shutdownGracePeriodByPodPriority: migrateConfig(300*time.Second /* shutdownGracePeriodRequested */, 120*time.Second /* shutdownGracePeriodCriticalPods */),
+                pods: []*v1.Pod{
+                    makePod("normal-pod", scheduling.DefaultPriorityWhenNoDefaultClassExists, nil),
+                    makePod("highest-user-definable-pod", scheduling.HighestUserDefinablePriority, nil),
+                    makePod("critical-pod", scheduling.SystemCriticalPriority, nil),
+                },
+            },
+            want: []podShutdownGroup{
+                {
+                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{
+                        Priority:                   scheduling.DefaultPriorityWhenNoDefaultClassExists,
+                        ShutdownGracePeriodSeconds: 180,
+                    },
+                    Pods: []*v1.Pod{
+                        makePod("normal-pod", scheduling.DefaultPriorityWhenNoDefaultClassExists, nil),
+                        makePod("highest-user-definable-pod", scheduling.HighestUserDefinablePriority, nil),
+                    },
+                },
+                {
+                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{
+                        Priority:                   scheduling.SystemCriticalPriority,
+                        ShutdownGracePeriodSeconds: 120,
+                    },
+                    Pods: []*v1.Pod{
+                        makePod("critical-pod", scheduling.SystemCriticalPriority, nil),
+                    },
+                },
+            },
+        },
+        {
+            name: "pod priority",
+            args: args{
+                shutdownGracePeriodByPodPriority: []kubeletconfig.ShutdownGracePeriodByPodPriority{
+                    {
+                        Priority:                   1,
+                        ShutdownGracePeriodSeconds: 10,
+                    },
+                    {
+                        Priority:                   2,
+                        ShutdownGracePeriodSeconds: 20,
+                    },
+                    {
+                        Priority:                   3,
+                        ShutdownGracePeriodSeconds: 30,
+                    },
+                    {
+                        Priority:                   4,
+                        ShutdownGracePeriodSeconds: 40,
+                    },
+                },
+                pods: []*v1.Pod{
+                    makePod("pod-0", 0, nil),
+                    makePod("pod-1", 1, nil),
+                    makePod("pod-2", 2, nil),
+                    makePod("pod-3", 3, nil),
+                    makePod("pod-4", 4, nil),
+                    makePod("pod-5", 5, nil),
+                },
+            },
+            want: []podShutdownGroup{
+                {
+                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{
+                        Priority:                   1,
+                        ShutdownGracePeriodSeconds: 10,
+                    },
+                    Pods: []*v1.Pod{
+                        makePod("pod-0", 0, nil),
+                        makePod("pod-1", 1, nil),
+                    },
+                },
+                {
+                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{
+                        Priority:                   2,
+                        ShutdownGracePeriodSeconds: 20,
+                    },
+                    Pods: []*v1.Pod{
+                        makePod("pod-2", 2, nil),
+                    },
+                },
+                {
+                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{
+                        Priority:                   3,
+                        ShutdownGracePeriodSeconds: 30,
+                    },
+                    Pods: []*v1.Pod{
+                        makePod("pod-3", 3, nil),
+                    },
+                },
+                {
+                    ShutdownGracePeriodByPodPriority: kubeletconfig.ShutdownGracePeriodByPodPriority{
+                        Priority:                   4,
+                        ShutdownGracePeriodSeconds: 40,
+                    },
+                    Pods: []*v1.Pod{
+                        makePod("pod-4", 4, nil),
+                        makePod("pod-5", 5, nil),
+                    },
+                },
+            },
+        },
+    }
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            if got := groupByPriority(tt.args.shutdownGracePeriodByPodPriority, tt.args.pods); !assert.Equal(t, tt.want, got) {
+                t.Errorf("groupByPriority() = %v, want %v", got, tt.want)
+            }
+        })
+    }
+}