Implement shutdown manager in kubelet
Implements KEP 2000, Graceful Node Shutdown: https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2000-graceful-node-shutdown

* Add new FeatureGate `GracefulNodeShutdown` to control enabling/disabling the feature
* Add two new KubeletConfiguration options: `ShutdownGracePeriod` and `ShutdownGracePeriodCriticalPods`
* Add a new package, `nodeshutdown`, that implements the node shutdown manager
* The node shutdown manager uses the systemd inhibit package to create a systemd inhibitor lock, monitor for node shutdown events, and gracefully terminate pods upon a node shutdown

This commit is contained in:
parent 2343689ce7
commit 16f71c6d47
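The split between the two periods is simple subtraction: critical pods get the final ShutdownGracePeriodCriticalPods of the window, and all other pods get the remainder at the front. A small illustrative sketch of that arithmetic (the helper name here is ours, not part of the kubelet):

package main

import (
	"fmt"
	"time"
)

// splitShutdownWindow illustrates how the kubelet divides the total shutdown
// grace period: critical pods keep the tail of the window, ordinary pods get
// whatever remains at the front.
func splitShutdownWindow(total, critical time.Duration) (normal, crit time.Duration) {
	return total - critical, critical
}

func main() {
	normal, crit := splitShutdownWindow(30*time.Second, 10*time.Second)
	fmt.Printf("normal pods: %v, critical pods: %v\n", normal, crit) // 20s, 10s
}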
@@ -701,6 +701,11 @@ const (
	// Enable kubelet to pass pod's service account token to NodePublishVolume
	// call of CSI driver which is mounting volumes for that pod.
	CSIServiceAccountToken featuregate.Feature = "CSIServiceAccountToken"

	// owner: @bobbypage
	// alpha: v1.20
	// Adds support for kubelet to detect node shutdown and gracefully terminate pods prior to the node being shutdown.
	GracefulNodeShutdown featuregate.Feature = "GracefulNodeShutdown"
)

func init() {
@@ -806,6 +811,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
	LoadBalancerIPMode:         {Default: false, PreRelease: featuregate.Alpha},
	ExecProbeTimeout:           {Default: true, PreRelease: featuregate.GA}, // lock to default in v1.21 and remove in v1.22
	KubeletCredentialProviders: {Default: false, PreRelease: featuregate.Alpha},
	GracefulNodeShutdown:       {Default: false, PreRelease: featuregate.Alpha},

	// inherited features from generic apiserver, relisted here to get a conflict if it is changed
	// unintentionally on either side:
@@ -71,6 +71,7 @@ go_library(
        "//pkg/kubelet/metrics:go_default_library",
        "//pkg/kubelet/metrics/collectors:go_default_library",
        "//pkg/kubelet/network/dns:go_default_library",
        "//pkg/kubelet/nodeshutdown:go_default_library",
        "//pkg/kubelet/nodestatus:go_default_library",
        "//pkg/kubelet/oom:go_default_library",
        "//pkg/kubelet/pleg:go_default_library",
@@ -308,7 +309,7 @@ filegroup(
        "//pkg/kubelet/logs:all-srcs",
        "//pkg/kubelet/metrics:all-srcs",
        "//pkg/kubelet/network:all-srcs",
-       "//pkg/kubelet/nodeshutdown/systemd:all-srcs",
+       "//pkg/kubelet/nodeshutdown:all-srcs",
        "//pkg/kubelet/nodestatus:all-srcs",
        "//pkg/kubelet/oom:all-srcs",
        "//pkg/kubelet/pleg:all-srcs",
@@ -234,5 +234,7 @@ var (
		"TypeMeta.Kind",
		"VolumeStatsAggPeriod.Duration",
		"VolumePluginDir",
		"ShutdownGracePeriod.Duration",
		"ShutdownGracePeriodCriticalPods.Duration",
	)
)
@@ -67,6 +67,8 @@ registryPullQPS: 5
resolvConf: /etc/resolv.conf
runtimeRequestTimeout: 2m0s
serializeImagePulls: true
shutdownGracePeriod: 0s
shutdownGracePeriodCriticalPods: 0s
streamingConnectionIdleTimeout: 4h0m0s
syncFrequency: 1m0s
topologyManagerPolicy: none
@@ -67,6 +67,8 @@ registryPullQPS: 5
resolvConf: /etc/resolv.conf
runtimeRequestTimeout: 2m0s
serializeImagePulls: true
shutdownGracePeriod: 0s
shutdownGracePeriodCriticalPods: 0s
streamingConnectionIdleTimeout: 4h0m0s
syncFrequency: 1m0s
topologyManagerPolicy: none
@@ -375,6 +375,13 @@ type KubeletConfiguration struct {
	Logging componentbaseconfig.LoggingConfiguration
	// EnableSystemLogHandler enables /logs handler.
	EnableSystemLogHandler bool
	// ShutdownGracePeriod specifies the total duration that the node should delay the shutdown and total grace period for pod termination during a node shutdown.
	// Defaults to 30 seconds, requires GracefulNodeShutdown feature gate to be enabled.
	ShutdownGracePeriod metav1.Duration
	// ShutdownGracePeriodCriticalPods specifies the duration used to terminate critical pods during a node shutdown. This should be less than ShutdownGracePeriod.
	// Defaults to 10 seconds, requires GracefulNodeShutdown feature gate to be enabled.
	// For example, if ShutdownGracePeriod=30s, and ShutdownGracePeriodCriticalPods=10s, during a node shutdown the first 20 seconds would be reserved for gracefully terminating normal pods, and the last 10 seconds would be reserved for terminating critical pods.
	ShutdownGracePeriodCriticalPods metav1.Duration
}

// KubeletAuthorizationMode denotes the authorization mode for the kubelet
@@ -350,6 +350,8 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in
	if err := v1.Convert_Pointer_bool_To_bool(&in.EnableSystemLogHandler, &out.EnableSystemLogHandler, s); err != nil {
		return err
	}
	out.ShutdownGracePeriod = in.ShutdownGracePeriod
	out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods
	return nil
}

@@ -501,6 +503,8 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in
	if err := v1.Convert_bool_To_Pointer_bool(&in.EnableSystemLogHandler, &out.EnableSystemLogHandler, s); err != nil {
		return err
	}
	out.ShutdownGracePeriod = in.ShutdownGracePeriod
	out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods
	return nil
}
@@ -140,6 +140,21 @@ func ValidateKubeletConfiguration(kc *kubeletconfig.KubeletConfiguration) error
		allErrors = append(allErrors, fmt.Errorf("invalid configuration: topologyManagerScope non-allowable value: %v", kc.TopologyManagerScope))
	}

	if localFeatureGate.Enabled(features.GracefulNodeShutdown) {
		if kc.ShutdownGracePeriod.Duration < 0 || kc.ShutdownGracePeriodCriticalPods.Duration < 0 || kc.ShutdownGracePeriodCriticalPods.Duration > kc.ShutdownGracePeriod.Duration {
			allErrors = append(allErrors, fmt.Errorf("invalid configuration: ShutdownGracePeriod %v must be >= 0, ShutdownGracePeriodCriticalPods %v must be >= 0, and ShutdownGracePeriodCriticalPods %v must be <= ShutdownGracePeriod %v", kc.ShutdownGracePeriod, kc.ShutdownGracePeriodCriticalPods, kc.ShutdownGracePeriodCriticalPods, kc.ShutdownGracePeriod))
		}
		if kc.ShutdownGracePeriod.Duration > 0 && kc.ShutdownGracePeriod.Duration < time.Duration(time.Second) {
			allErrors = append(allErrors, fmt.Errorf("invalid configuration: ShutdownGracePeriod %v must be either zero or otherwise >= 1 sec", kc.ShutdownGracePeriod))
		}
		if kc.ShutdownGracePeriodCriticalPods.Duration > 0 && kc.ShutdownGracePeriodCriticalPods.Duration < time.Duration(time.Second) {
			allErrors = append(allErrors, fmt.Errorf("invalid configuration: ShutdownGracePeriodCriticalPods %v must be either zero or otherwise >= 1 sec", kc.ShutdownGracePeriodCriticalPods))
		}
	}
	if (kc.ShutdownGracePeriod.Duration > 0 || kc.ShutdownGracePeriodCriticalPods.Duration > 0) && !localFeatureGate.Enabled(features.GracefulNodeShutdown) {
		allErrors = append(allErrors, fmt.Errorf("invalid configuration: Specifying ShutdownGracePeriod or ShutdownGracePeriodCriticalPods requires feature gate GracefulNodeShutdown"))
	}

	for _, val := range kc.EnforceNodeAllocatable {
		switch val {
		case kubetypes.NodeAllocatableEnforcementKey:
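Taken together, the rules above reject negative periods, a critical-pod period longer than the total, non-zero values under one second, and any non-zero setting without the feature gate. A minimal sketch of tripping the last rule (package paths as in this diff; the surrounding zero-valued fields will contribute their own errors to the aggregate):

package main

import (
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
	"k8s.io/kubernetes/pkg/kubelet/apis/config/validation"
)

func main() {
	kc := &kubeletconfig.KubeletConfiguration{
		// Set a shutdown grace period while leaving the GracefulNodeShutdown
		// feature gate disabled, which the validation above rejects.
		ShutdownGracePeriod: metav1.Duration{Duration: 30 * time.Second},
	}
	err := validation.ValidateKubeletConfiguration(kc)
	fmt.Println(err) // aggregate includes the "requires feature gate GracefulNodeShutdown" error
}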
@@ -27,36 +27,39 @@ import (

func TestValidateKubeletConfiguration(t *testing.T) {
	successCase1 := &kubeletconfig.KubeletConfiguration{
		CgroupsPerQOS:                   true,
		EnforceNodeAllocatable:          []string{"pods", "system-reserved", "kube-reserved"},
		SystemReservedCgroup:            "/system.slice",
		KubeReservedCgroup:              "/kubelet.service",
		SystemCgroups:                   "",
		CgroupRoot:                      "",
		EventBurst:                      10,
		EventRecordQPS:                  5,
		HealthzPort:                     10248,
		ImageGCHighThresholdPercent:     85,
		ImageGCLowThresholdPercent:      80,
		IPTablesDropBit:                 15,
		IPTablesMasqueradeBit:           14,
		KubeAPIBurst:                    10,
		KubeAPIQPS:                      5,
		MaxOpenFiles:                    1000000,
		MaxPods:                         110,
		OOMScoreAdj:                     -999,
		PodsPerCore:                     100,
		Port:                            65535,
		ReadOnlyPort:                    0,
		RegistryBurst:                   10,
		RegistryPullQPS:                 5,
		HairpinMode:                     kubeletconfig.PromiscuousBridge,
		NodeLeaseDurationSeconds:        1,
		CPUCFSQuotaPeriod:               metav1.Duration{Duration: 25 * time.Millisecond},
		TopologyManagerScope:            kubeletconfig.PodTopologyManagerScope,
		TopologyManagerPolicy:           kubeletconfig.SingleNumaNodeTopologyManagerPolicy,
		ShutdownGracePeriod:             metav1.Duration{Duration: 30 * time.Second},
		ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
		FeatureGates: map[string]bool{
			"CustomCPUCFSQuotaPeriod": true,
			"GracefulNodeShutdown":    true,
		},
	}
	if allErrors := ValidateKubeletConfiguration(successCase1); allErrors != nil {
@@ -64,37 +67,40 @@ func TestValidateKubeletConfiguration(t *testing.T) {
	}
	successCase2 := &kubeletconfig.KubeletConfiguration{
		CgroupsPerQOS:                   true,
		EnforceNodeAllocatable:          []string{"pods"},
		SystemReservedCgroup:            "",
		KubeReservedCgroup:              "",
		SystemCgroups:                   "",
		CgroupRoot:                      "",
		EventBurst:                      10,
		EventRecordQPS:                  5,
		HealthzPort:                     10248,
		ImageGCHighThresholdPercent:     85,
		ImageGCLowThresholdPercent:      80,
		IPTablesDropBit:                 15,
		IPTablesMasqueradeBit:           14,
		KubeAPIBurst:                    10,
		KubeAPIQPS:                      5,
		MaxOpenFiles:                    1000000,
		MaxPods:                         110,
		OOMScoreAdj:                     -999,
		PodsPerCore:                     100,
		Port:                            65535,
		ReadOnlyPort:                    0,
		RegistryBurst:                   10,
		RegistryPullQPS:                 5,
		HairpinMode:                     kubeletconfig.PromiscuousBridge,
		NodeLeaseDurationSeconds:        1,
		CPUCFSQuotaPeriod:               metav1.Duration{Duration: 50 * time.Millisecond},
		ReservedSystemCPUs:              "0-3",
		TopologyManagerScope:            kubeletconfig.ContainerTopologyManagerScope,
		TopologyManagerPolicy:           kubeletconfig.NoneTopologyManagerPolicy,
		ShutdownGracePeriod:             metav1.Duration{Duration: 10 * time.Minute},
		ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0},
		FeatureGates: map[string]bool{
			"CustomCPUCFSQuotaPeriod": true,
			"GracefulNodeShutdown":    true,
		},
	}
	if allErrors := ValidateKubeletConfiguration(successCase2); allErrors != nil {
@@ -102,68 +108,73 @@ func TestValidateKubeletConfiguration(t *testing.T) {
	}

	errorCase1 := &kubeletconfig.KubeletConfiguration{
		CgroupsPerQOS:                   false,
		EnforceNodeAllocatable:          []string{"pods", "system-reserved", "kube-reserved", "illegal-key"},
		SystemCgroups:                   "/",
		CgroupRoot:                      "",
		EventBurst:                      -10,
		EventRecordQPS:                  -10,
		HealthzPort:                     -10,
		ImageGCHighThresholdPercent:     101,
		ImageGCLowThresholdPercent:      101,
		IPTablesDropBit:                 -10,
		IPTablesMasqueradeBit:           -10,
		KubeAPIBurst:                    -10,
		KubeAPIQPS:                      -10,
		MaxOpenFiles:                    -10,
		MaxPods:                         -10,
		OOMScoreAdj:                     -1001,
		PodsPerCore:                     -10,
		Port:                            0,
		ReadOnlyPort:                    -10,
		RegistryBurst:                   -10,
		RegistryPullQPS:                 -10,
		HairpinMode:                     "foo",
		NodeLeaseDurationSeconds:        -1,
		CPUCFSQuotaPeriod:               metav1.Duration{Duration: 100 * time.Millisecond},
		ShutdownGracePeriod:             metav1.Duration{Duration: 30 * time.Second},
		ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
	}
-	const numErrsErrorCase1 = 27
+	const numErrsErrorCase1 = 28
	if allErrors := ValidateKubeletConfiguration(errorCase1); len(allErrors.(utilerrors.Aggregate).Errors()) != numErrsErrorCase1 {
		t.Errorf("expect %d errors, got %v", numErrsErrorCase1, len(allErrors.(utilerrors.Aggregate).Errors()))
	}
	errorCase2 := &kubeletconfig.KubeletConfiguration{
		CgroupsPerQOS:                   true,
		EnforceNodeAllocatable:          []string{"pods", "system-reserved", "kube-reserved"},
		SystemReservedCgroup:            "/system.slice",
		KubeReservedCgroup:              "/kubelet.service",
		SystemCgroups:                   "",
		CgroupRoot:                      "",
		EventBurst:                      10,
		EventRecordQPS:                  5,
		HealthzPort:                     10248,
		ImageGCHighThresholdPercent:     85,
		ImageGCLowThresholdPercent:      80,
		IPTablesDropBit:                 15,
		IPTablesMasqueradeBit:           14,
		KubeAPIBurst:                    10,
		KubeAPIQPS:                      5,
		MaxOpenFiles:                    1000000,
		MaxPods:                         110,
		OOMScoreAdj:                     -999,
		PodsPerCore:                     100,
		Port:                            65535,
		ReadOnlyPort:                    0,
		RegistryBurst:                   10,
		RegistryPullQPS:                 5,
		HairpinMode:                     kubeletconfig.PromiscuousBridge,
		NodeLeaseDurationSeconds:        1,
		CPUCFSQuotaPeriod:               metav1.Duration{Duration: 50 * time.Millisecond},
		ReservedSystemCPUs:              "0-3",
		TopologyManagerScope:            "invalid",
		TopologyManagerPolicy:           "invalid",
		ShutdownGracePeriod:             metav1.Duration{Duration: 40 * time.Second},
		ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
		FeatureGates: map[string]bool{
			"CustomCPUCFSQuotaPeriod": true,
			"GracefulNodeShutdown":    true,
		},
	}
	const numErrsErrorCase2 = 3
pkg/kubelet/apis/config/zz_generated.deepcopy.go (generated)
@@ -271,6 +271,8 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
		copy(*out, *in)
	}
	out.Logging = in.Logging
	out.ShutdownGracePeriod = in.ShutdownGracePeriod
	out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods
	return
}
@@ -84,6 +84,7 @@ import (
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/kubernetes/pkg/kubelet/metrics/collectors"
	"k8s.io/kubernetes/pkg/kubelet/network/dns"
	"k8s.io/kubernetes/pkg/kubelet/nodeshutdown"
	oomwatcher "k8s.io/kubernetes/pkg/kubelet/oom"
	"k8s.io/kubernetes/pkg/kubelet/pleg"
	"k8s.io/kubernetes/pkg/kubelet/pluginmanager"
@@ -794,6 +795,8 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
		v1.NamespaceNodeLease,
		util.SetNodeOwnerFunc(klet.heartbeatClient, string(klet.nodeName)))

	klet.shutdownManager = nodeshutdown.NewManager(klet.GetActivePods, killPodNow(klet.podWorkers, kubeDeps.Recorder), kubeCfg.ShutdownGracePeriod.Duration, kubeCfg.ShutdownGracePeriodCriticalPods.Duration)

	// Finally, put the most recent version of the config on the Kubelet, so
	// people can see how it was configured.
	klet.kubeletConfiguration = *kubeCfg
@@ -1137,6 +1140,9 @@ type Kubelet struct {

	// Handles RuntimeClass objects for the Kubelet.
	runtimeClassManager *runtimeclass.Manager

	// Handles node shutdown events for the Node.
	shutdownManager *nodeshutdown.Manager
}

// ListPodStats is delegated to StatsProvider, which implements stats.Provider interface
@@ -1353,6 +1359,12 @@ func (kl *Kubelet) initializeRuntimeDependentModules() {
	// Start the plugin manager
	klog.V(4).Infof("starting plugin manager")
	go kl.pluginManager.Run(kl.sourcesReady, wait.NeverStop)

	err = kl.shutdownManager.Start()
	if err != nil {
		// The shutdown manager is not critical for kubelet, so log failure, but don't block Kubelet startup if there was a failure starting it.
		klog.Errorf("Failed to start node shutdown manager: %v", err)
	}
}

// Run starts the kubelet reacting to config updates
@@ -600,7 +600,7 @@ func (kl *Kubelet) defaultNodeStatusFuncs() []func(*v1.Node) error {
		nodestatus.MemoryPressureCondition(kl.clock.Now, kl.evictionManager.IsUnderMemoryPressure, kl.recordNodeStatusEvent),
		nodestatus.DiskPressureCondition(kl.clock.Now, kl.evictionManager.IsUnderDiskPressure, kl.recordNodeStatusEvent),
		nodestatus.PIDPressureCondition(kl.clock.Now, kl.evictionManager.IsUnderPIDPressure, kl.recordNodeStatusEvent),
-		nodestatus.ReadyCondition(kl.clock.Now, kl.runtimeState.runtimeErrors, kl.runtimeState.networkErrors, kl.runtimeState.storageErrors, validateHostFunc, kl.containerManager.Status, kl.recordNodeStatusEvent),
+		nodestatus.ReadyCondition(kl.clock.Now, kl.runtimeState.runtimeErrors, kl.runtimeState.networkErrors, kl.runtimeState.storageErrors, validateHostFunc, kl.containerManager.Status, kl.shutdownManager.ShutdownStatus, kl.recordNodeStatusEvent),
		nodestatus.VolumesInUse(kl.volumeManager.ReconcilerStatesHasBeenSynced, kl.volumeManager.GetVolumesInUse),
		// TODO(mtaufen): I decided not to move this setter for now, since all it does is send an event
		// and record state back to the Kubelet runtime object. In the future, I'd like to isolate
pkg/kubelet/nodeshutdown/BUILD (new file, 127 lines)
@@ -0,0 +1,127 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")

go_library(
    name = "go_default_library",
    srcs = [
        "nodeshutdown_manager_linux.go",
        "nodeshutdown_manager_others.go",
    ],
    importpath = "k8s.io/kubernetes/pkg/kubelet/nodeshutdown",
    visibility = ["//visibility:public"],
    deps = select({
        "@io_bazel_rules_go//go/platform:aix": [
            "//pkg/kubelet/eviction:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:android": [
            "//pkg/features:go_default_library",
            "//pkg/kubelet/eviction:go_default_library",
            "//pkg/kubelet/nodeshutdown/systemd:go_default_library",
            "//pkg/kubelet/types:go_default_library",
            "//pkg/kubelet/util/format:go_default_library",
            "//staging/src/k8s.io/api/core/v1:go_default_library",
            "//staging/src/k8s.io/apimachinery/pkg/util/clock:go_default_library",
            "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
            "//vendor/github.com/godbus/dbus/v5:go_default_library",
            "//vendor/k8s.io/klog/v2:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:darwin": [
            "//pkg/kubelet/eviction:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:dragonfly": [
            "//pkg/kubelet/eviction:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:freebsd": [
            "//pkg/kubelet/eviction:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:illumos": [
            "//pkg/kubelet/eviction:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:ios": [
            "//pkg/kubelet/eviction:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:js": [
            "//pkg/kubelet/eviction:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:linux": [
            "//pkg/features:go_default_library",
            "//pkg/kubelet/eviction:go_default_library",
            "//pkg/kubelet/nodeshutdown/systemd:go_default_library",
            "//pkg/kubelet/types:go_default_library",
            "//pkg/kubelet/util/format:go_default_library",
            "//staging/src/k8s.io/api/core/v1:go_default_library",
            "//staging/src/k8s.io/apimachinery/pkg/util/clock:go_default_library",
            "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
            "//vendor/github.com/godbus/dbus/v5:go_default_library",
            "//vendor/k8s.io/klog/v2:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:nacl": [
            "//pkg/kubelet/eviction:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:netbsd": [
            "//pkg/kubelet/eviction:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:openbsd": [
            "//pkg/kubelet/eviction:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:plan9": [
            "//pkg/kubelet/eviction:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:solaris": [
            "//pkg/kubelet/eviction:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:windows": [
            "//pkg/kubelet/eviction:go_default_library",
        ],
        "//conditions:default": [],
    }),
)

filegroup(
    name = "package-srcs",
    srcs = glob(["**"]),
    tags = ["automanaged"],
    visibility = ["//visibility:private"],
)

filegroup(
    name = "all-srcs",
    srcs = [
        ":package-srcs",
        "//pkg/kubelet/nodeshutdown/systemd:all-srcs",
    ],
    tags = ["automanaged"],
    visibility = ["//visibility:public"],
)

go_test(
    name = "go_default_test",
    srcs = ["nodeshutdown_manager_linux_test.go"],
    embed = [":go_default_library"],
    deps = select({
        "@io_bazel_rules_go//go/platform:android": [
            "//pkg/apis/scheduling:go_default_library",
            "//pkg/features:go_default_library",
            "//pkg/kubelet/nodeshutdown/systemd:go_default_library",
            "//staging/src/k8s.io/api/core/v1:go_default_library",
            "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
            "//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
            "//staging/src/k8s.io/apimachinery/pkg/util/clock:go_default_library",
            "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
            "//staging/src/k8s.io/component-base/featuregate/testing:go_default_library",
            "//vendor/github.com/stretchr/testify/assert:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:linux": [
            "//pkg/apis/scheduling:go_default_library",
            "//pkg/features:go_default_library",
            "//pkg/kubelet/nodeshutdown/systemd:go_default_library",
            "//staging/src/k8s.io/api/core/v1:go_default_library",
            "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
            "//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
            "//staging/src/k8s.io/apimachinery/pkg/util/clock:go_default_library",
            "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
            "//staging/src/k8s.io/component-base/featuregate/testing:go_default_library",
            "//vendor/github.com/stretchr/testify/assert:go_default_library",
        ],
        "//conditions:default": [],
    }),
)
pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go (new file, 255 lines)
@@ -0,0 +1,255 @@
// +build linux

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package nodeshutdown can watch for node level shutdown events and trigger graceful termination of pods running on the node prior to a system shutdown.
package nodeshutdown

import (
	"fmt"
	"sync"
	"time"

	"github.com/godbus/dbus/v5"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/clock"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/kubelet/eviction"
	"k8s.io/kubernetes/pkg/kubelet/nodeshutdown/systemd"
	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
	"k8s.io/kubernetes/pkg/kubelet/util/format"
)
const (
	nodeShutdownReason  = "Shutdown"
	nodeShutdownMessage = "Node is shutting, evicting pods"
)

var systemDbus = func() (dbusInhibiter, error) {
	bus, err := dbus.SystemBus()
	if err != nil {
		return nil, err
	}
	return &systemd.DBusCon{SystemBus: bus}, nil
}

type dbusInhibiter interface {
	CurrentInhibitDelay() (time.Duration, error)
	InhibitShutdown() (systemd.InhibitLock, error)
	ReleaseInhibitLock(lock systemd.InhibitLock) error
	ReloadLogindConf() error
	MonitorShutdown() (<-chan bool, error)
	OverrideInhibitDelay(inhibitDelayMax time.Duration) error
}
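The systemd package behind this interface talks to logind over D-Bus. As a rough sketch of what satisfying InhibitShutdown involves (illustrative only, not the systemd package from this commit): logind's Inhibit method hands back a file descriptor that holds the lock until it is closed.

package main

import (
	"github.com/godbus/dbus/v5"
)

// inhibitShutdown takes a logind "delay" inhibitor lock for shutdown events.
// The returned file descriptor keeps the lock alive until it is closed.
func inhibitShutdown(conn *dbus.Conn) (dbus.UnixFD, error) {
	obj := conn.Object("org.freedesktop.login1", "/org/freedesktop/login1")
	call := obj.Call("org.freedesktop.login1.Manager.Inhibit", 0,
		"shutdown", // what: the events to delay
		"kubelet",  // who: a human-readable lock holder
		"Kubelet needs time to terminate pods", // why
		"delay", // mode: delay the shutdown rather than block it
	)
	if call.Err != nil {
		return 0, call.Err
	}
	var fd dbus.UnixFD
	if err := call.Store(&fd); err != nil {
		return 0, err
	}
	return fd, nil
}

func main() {
	conn, err := dbus.SystemBus()
	if err != nil {
		panic(err)
	}
	if _, err := inhibitShutdown(conn); err != nil {
		panic(err)
	}
}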
// Manager has functions that can be used to interact with the Node Shutdown Manager.
type Manager struct {
	shutdownGracePeriodRequested    time.Duration
	shutdownGracePeriodCriticalPods time.Duration

	getPods eviction.ActivePodsFunc
	killPod eviction.KillPodFunc

	dbusCon     dbusInhibiter
	inhibitLock systemd.InhibitLock

	nodeShuttingDownMutex sync.Mutex
	nodeShuttingDownNow   bool

	clock clock.Clock
}

// NewManager returns a new node shutdown manager.
func NewManager(getPodsFunc eviction.ActivePodsFunc, killPodFunc eviction.KillPodFunc, shutdownGracePeriodRequested, shutdownGracePeriodCriticalPods time.Duration) *Manager {
	return &Manager{
		getPods:                         getPodsFunc,
		killPod:                         killPodFunc,
		shutdownGracePeriodRequested:    shutdownGracePeriodRequested,
		shutdownGracePeriodCriticalPods: shutdownGracePeriodCriticalPods,
		clock:                           clock.RealClock{},
	}
}
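NewMainKubelet wires this constructor up with the kubelet's own pod accessor and pod killer (see the kubelet.go hunk above). A minimal standalone sketch of the same wiring, with stand-in callbacks in place of the kubelet's:

package main

import (
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/kubelet/nodeshutdown"
)

func main() {
	// Stand-ins for klet.GetActivePods and killPodNow(...) in NewMainKubelet.
	getPods := func() []*v1.Pod { return nil }
	killPod := func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int64) error { return nil }

	m := nodeshutdown.NewManager(getPods, killPod, 30*time.Second, 10*time.Second)
	if err := m.Start(); err != nil {
		// Mirrors the kubelet: a failed start is logged, not fatal.
		klog.Errorf("Failed to start node shutdown manager: %v", err)
	}
}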
// Start starts the node shutdown manager and will start watching the node for shutdown events.
func (m *Manager) Start() error {
	if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdown) {
		return nil
	}
	if m.shutdownGracePeriodRequested == 0 {
		return nil
	}

	systemBus, err := systemDbus()
	if err != nil {
		return err
	}
	m.dbusCon = systemBus

	currentInhibitDelay, err := m.dbusCon.CurrentInhibitDelay()
	if err != nil {
		return err
	}

	// If logind's InhibitDelayMaxUSec (as configured in logind.conf) is less than shutdownGracePeriodRequested, attempt to update the value to shutdownGracePeriodRequested.
	if m.shutdownGracePeriodRequested > currentInhibitDelay {
		err := m.dbusCon.OverrideInhibitDelay(m.shutdownGracePeriodRequested)
		if err != nil {
			return fmt.Errorf("unable to override inhibit delay by shutdown manager: %v", err)
		}

		err = m.dbusCon.ReloadLogindConf()
		if err != nil {
			return err
		}

		// Read the current inhibitDelay again; if the override was successful, currentInhibitDelay will be equal to shutdownGracePeriodRequested.
		updatedInhibitDelay, err := m.dbusCon.CurrentInhibitDelay()
		if err != nil {
			return err
		}

		if updatedInhibitDelay != m.shutdownGracePeriodRequested {
			return fmt.Errorf("node shutdown manager was unable to update logind InhibitDelayMaxSec to %v (ShutdownGracePeriod), current value of InhibitDelayMaxSec (%v) is less than requested ShutdownGracePeriod", m.shutdownGracePeriodRequested, updatedInhibitDelay)
		}
	}

	err = m.aquireInhibitLock()
	if err != nil {
		return err
	}

	events, err := m.dbusCon.MonitorShutdown()
	if err != nil {
		releaseErr := m.dbusCon.ReleaseInhibitLock(m.inhibitLock)
		if releaseErr != nil {
			return fmt.Errorf("failed releasing inhibitLock: %v and failed monitoring shutdown: %v", releaseErr, err)
		}
		return fmt.Errorf("failed to monitor shutdown: %v", err)
	}

	go func() {
		// Monitor for shutdown events. This follows the logind Inhibit Delay pattern described on https://www.freedesktop.org/wiki/Software/systemd/inhibit/
		// 1. When the shutdown manager starts, an inhibit lock is taken.
		// 2. When a shutdown(true) event is received, process the shutdown and release the inhibit lock.
		// 3. When a shutdown(false) event is received, this indicates a previous shutdown was cancelled. In this case, acquire the inhibit lock again.
		for {
			select {
			case isShuttingDown := <-events:
				klog.V(1).Infof("Shutdown manager detected new shutdown event, isNodeShuttingDownNow: %t", isShuttingDown)

				m.nodeShuttingDownMutex.Lock()
				m.nodeShuttingDownNow = isShuttingDown
				m.nodeShuttingDownMutex.Unlock()

				if isShuttingDown {
					m.processShutdownEvent()
				} else {
					m.aquireInhibitLock()
				}
			}
		}
	}()
	return nil
}
func (m *Manager) aquireInhibitLock() error {
	lock, err := m.dbusCon.InhibitShutdown()
	if err != nil {
		return err
	}
	m.inhibitLock = lock
	return nil
}
// ShutdownStatus will return an error if the node is currently shutting down.
func (m *Manager) ShutdownStatus() error {
	if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdown) {
		return nil
	}

	m.nodeShuttingDownMutex.Lock()
	defer m.nodeShuttingDownMutex.Unlock()

	if m.nodeShuttingDownNow {
		return fmt.Errorf("node is shutting down")
	}
	return nil
}
func (m *Manager) processShutdownEvent() error {
	klog.V(1).Infof("Shutdown manager processing shutdown event")
	activePods := m.getPods()

	nonCriticalPodGracePeriod := m.shutdownGracePeriodRequested - m.shutdownGracePeriodCriticalPods

	var wg sync.WaitGroup
	wg.Add(len(activePods))
	for _, pod := range activePods {
		go func(pod *v1.Pod) {
			defer wg.Done()

			var gracePeriodOverride int64
			if kubelettypes.IsCriticalPod(pod) {
				gracePeriodOverride = int64(m.shutdownGracePeriodCriticalPods.Seconds())
				m.clock.Sleep(nonCriticalPodGracePeriod)
			} else {
				gracePeriodOverride = int64(nonCriticalPodGracePeriod.Seconds())
			}

			// If the pod's spec specifies a termination gracePeriod which is less than the gracePeriodOverride calculated, use the pod spec termination gracePeriod.
			if pod.Spec.TerminationGracePeriodSeconds != nil && *pod.Spec.TerminationGracePeriodSeconds <= gracePeriodOverride {
				gracePeriodOverride = *pod.Spec.TerminationGracePeriodSeconds
			}

			klog.V(1).Infof("Shutdown manager killing pod %q with gracePeriod: %v seconds", format.Pod(pod), gracePeriodOverride)

			status := v1.PodStatus{
				Phase:   v1.PodFailed,
				Message: nodeShutdownMessage,
				Reason:  nodeShutdownReason,
			}

			err := m.killPod(pod, status, &gracePeriodOverride)
			if err != nil {
				klog.V(1).Infof("Shutdown manager failed killing pod %q: %v", format.Pod(pod), err)
			} else {
				klog.V(1).Infof("Shutdown manager finished killing pod %q", format.Pod(pod))
			}
		}(pod)
	}

	c := make(chan struct{})
	go func() {
		defer close(c)
		wg.Wait()
	}()

	// We want to ensure that inhibitLock is released, so only wait up to the shutdownGracePeriodRequested timeout.
	select {
	case <-c:
		break
	case <-time.After(m.shutdownGracePeriodRequested):
		klog.V(1).Infof("Shutdown manager pod killing did not complete in %v", m.shutdownGracePeriodRequested)
	}

	m.dbusCon.ReleaseInhibitLock(m.inhibitLock)
	klog.V(1).Infof("Shutdown manager completed processing shutdown event, node will shutdown shortly")

	return nil
}
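The bounded wait at the end of processShutdownEvent is a reusable Go idiom: close a channel when the WaitGroup drains, then select between it and a timer. In isolation it looks like this (helper name is ours):

package main

import (
	"fmt"
	"sync"
	"time"
)

// waitTimeout waits for wg up to d; it returns false on timeout. This is the
// same close-a-channel-when-done idiom processShutdownEvent uses above.
func waitTimeout(wg *sync.WaitGroup, d time.Duration) bool {
	c := make(chan struct{})
	go func() {
		defer close(c)
		wg.Wait()
	}()
	select {
	case <-c:
		return true
	case <-time.After(d):
		return false
	}
}

func main() {
	var wg sync.WaitGroup
	wg.Add(1)
	go func() { defer wg.Done(); time.Sleep(10 * time.Millisecond) }()
	fmt.Println(waitTimeout(&wg, time.Second)) // true
}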
pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go (new file, 261 lines)
@@ -0,0 +1,261 @@
// +build linux

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nodeshutdown

import (
	"fmt"
	"strings"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/clock"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	featuregatetesting "k8s.io/component-base/featuregate/testing"
	"k8s.io/kubernetes/pkg/apis/scheduling"
	pkgfeatures "k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/kubelet/nodeshutdown/systemd"
)

type fakeDbus struct {
	currentInhibitDelay        time.Duration
	overrideSystemInhibitDelay time.Duration
	shutdownChan               chan bool

	didInhibitShutdown      bool
	didOverrideInhibitDelay bool
}

func (f *fakeDbus) CurrentInhibitDelay() (time.Duration, error) {
	if f.didOverrideInhibitDelay {
		return f.overrideSystemInhibitDelay, nil
	}
	return f.currentInhibitDelay, nil
}

func (f *fakeDbus) InhibitShutdown() (systemd.InhibitLock, error) {
	f.didInhibitShutdown = true
	return systemd.InhibitLock(0), nil
}

func (f *fakeDbus) ReleaseInhibitLock(lock systemd.InhibitLock) error {
	return nil
}

func (f *fakeDbus) ReloadLogindConf() error {
	return nil
}

func (f *fakeDbus) MonitorShutdown() (<-chan bool, error) {
	return f.shutdownChan, nil
}

func (f *fakeDbus) OverrideInhibitDelay(inhibitDelayMax time.Duration) error {
	f.didOverrideInhibitDelay = true
	return nil
}

func makePod(name string, criticalPod bool, terminationGracePeriod *int64) *v1.Pod {
	var priority int32
	if criticalPod {
		priority = scheduling.SystemCriticalPriority
	}

	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: name,
			UID:  types.UID(name),
		},
		Spec: v1.PodSpec{
			Priority:                      &priority,
			TerminationGracePeriodSeconds: terminationGracePeriod,
		},
	}
}

func TestManager(t *testing.T) {
	normalPodNoGracePeriod := makePod("normal-pod-nil-grace-period", false /* criticalPod */, nil /* terminationGracePeriod */)
	criticalPodNoGracePeriod := makePod("critical-pod-nil-grace-period", true /* criticalPod */, nil /* terminationGracePeriod */)

	shortGracePeriod := int64(2)
	normalPodGracePeriod := makePod("normal-pod-grace-period", false /* criticalPod */, &shortGracePeriod /* terminationGracePeriod */)
	criticalPodGracePeriod := makePod("critical-pod-grace-period", true /* criticalPod */, &shortGracePeriod /* terminationGracePeriod */)

	longGracePeriod := int64(1000)
	normalPodLongGracePeriod := makePod("normal-pod-long-grace-period", false /* criticalPod */, &longGracePeriod /* terminationGracePeriod */)

	var tests = []struct {
		desc                             string
		activePods                       []*v1.Pod
		shutdownGracePeriodRequested     time.Duration
		shutdownGracePeriodCriticalPods  time.Duration
		systemInhibitDelay               time.Duration
		overrideSystemInhibitDelay       time.Duration
		expectedDidOverrideInhibitDelay  bool
		expectedPodToGracePeriodOverride map[string]int64
		expectedError                    error
	}{
		{
			desc:                             "no override (total=30s, critical=10s)",
			activePods:                       []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod},
			shutdownGracePeriodRequested:     time.Duration(30 * time.Second),
			shutdownGracePeriodCriticalPods:  time.Duration(10 * time.Second),
			systemInhibitDelay:               time.Duration(40 * time.Second),
			overrideSystemInhibitDelay:       time.Duration(40 * time.Second),
			expectedDidOverrideInhibitDelay:  false,
			expectedPodToGracePeriodOverride: map[string]int64{"normal-pod-nil-grace-period": 20, "critical-pod-nil-grace-period": 10},
		},
		{
			desc:                             "no override (total=30s, critical=10s) pods with terminationGracePeriod and without",
			activePods:                       []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod, normalPodGracePeriod, criticalPodGracePeriod},
			shutdownGracePeriodRequested:     time.Duration(30 * time.Second),
			shutdownGracePeriodCriticalPods:  time.Duration(10 * time.Second),
			systemInhibitDelay:               time.Duration(40 * time.Second),
			overrideSystemInhibitDelay:       time.Duration(40 * time.Second),
			expectedDidOverrideInhibitDelay:  false,
			expectedPodToGracePeriodOverride: map[string]int64{"normal-pod-nil-grace-period": 20, "critical-pod-nil-grace-period": 10, "normal-pod-grace-period": 2, "critical-pod-grace-period": 2},
		},
		{
			desc:                             "no override (total=30s, critical=10s) pod with long terminationGracePeriod is overridden",
			activePods:                       []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod, normalPodGracePeriod, criticalPodGracePeriod, normalPodLongGracePeriod},
			shutdownGracePeriodRequested:     time.Duration(30 * time.Second),
			shutdownGracePeriodCriticalPods:  time.Duration(10 * time.Second),
			systemInhibitDelay:               time.Duration(40 * time.Second),
			overrideSystemInhibitDelay:       time.Duration(40 * time.Second),
			expectedDidOverrideInhibitDelay:  false,
			expectedPodToGracePeriodOverride: map[string]int64{"normal-pod-nil-grace-period": 20, "critical-pod-nil-grace-period": 10, "normal-pod-grace-period": 2, "critical-pod-grace-period": 2, "normal-pod-long-grace-period": 20},
		},
		{
			desc:                             "no override (total=30, critical=0)",
			activePods:                       []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod},
			shutdownGracePeriodRequested:     time.Duration(30 * time.Second),
			shutdownGracePeriodCriticalPods:  time.Duration(0 * time.Second),
			systemInhibitDelay:               time.Duration(40 * time.Second),
			overrideSystemInhibitDelay:       time.Duration(40 * time.Second),
			expectedDidOverrideInhibitDelay:  false,
			expectedPodToGracePeriodOverride: map[string]int64{"normal-pod-nil-grace-period": 30, "critical-pod-nil-grace-period": 0},
		},
		{
			desc:                             "override successful (total=30, critical=10)",
			activePods:                       []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod},
			shutdownGracePeriodRequested:     time.Duration(30 * time.Second),
			shutdownGracePeriodCriticalPods:  time.Duration(10 * time.Second),
			systemInhibitDelay:               time.Duration(5 * time.Second),
			overrideSystemInhibitDelay:       time.Duration(30 * time.Second),
			expectedDidOverrideInhibitDelay:  true,
			expectedPodToGracePeriodOverride: map[string]int64{"normal-pod-nil-grace-period": 20, "critical-pod-nil-grace-period": 10},
		},
		{
			desc:                             "override unsuccessful",
			activePods:                       []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod},
			shutdownGracePeriodRequested:     time.Duration(30 * time.Second),
			shutdownGracePeriodCriticalPods:  time.Duration(10 * time.Second),
			systemInhibitDelay:               time.Duration(5 * time.Second),
			overrideSystemInhibitDelay:       time.Duration(5 * time.Second),
			expectedDidOverrideInhibitDelay:  true,
			expectedPodToGracePeriodOverride: map[string]int64{"normal-pod-nil-grace-period": 5, "critical-pod-nil-grace-period": 0},
			expectedError:                    fmt.Errorf("unable to update logind InhibitDelayMaxSec to 30s (ShutdownGracePeriod), current value of InhibitDelayMaxSec (5s) is less than requested ShutdownGracePeriod"),
		},
		{
			desc:                            "override unsuccessful, zero time",
			activePods:                      []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod},
			shutdownGracePeriodRequested:    time.Duration(5 * time.Second),
			shutdownGracePeriodCriticalPods: time.Duration(5 * time.Second),
			systemInhibitDelay:              time.Duration(0 * time.Second),
			overrideSystemInhibitDelay:      time.Duration(0 * time.Second),
			expectedError:                   fmt.Errorf("unable to update logind InhibitDelayMaxSec to 5s (ShutdownGracePeriod), current value of InhibitDelayMaxSec (0s) is less than requested ShutdownGracePeriod"),
		},
		{
			desc:                             "no override, all time to critical pods",
			activePods:                       []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod},
			shutdownGracePeriodRequested:     time.Duration(5 * time.Second),
			shutdownGracePeriodCriticalPods:  time.Duration(5 * time.Second),
			systemInhibitDelay:               time.Duration(5 * time.Second),
			overrideSystemInhibitDelay:       time.Duration(5 * time.Second),
			expectedDidOverrideInhibitDelay:  false,
			expectedPodToGracePeriodOverride: map[string]int64{"normal-pod-nil-grace-period": 0, "critical-pod-nil-grace-period": 5},
		},
	}

	for _, tc := range tests {
		t.Run(tc.desc, func(t *testing.T) {
			activePodsFunc := func() []*v1.Pod {
				return tc.activePods
			}

			type PodKillInfo struct {
				Name        string
				GracePeriod int64
			}

			podKillChan := make(chan PodKillInfo)
			killPodsFunc := func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int64) error {
				var gracePeriod int64
				if gracePeriodOverride != nil {
					gracePeriod = *gracePeriodOverride
				}
				podKillChan <- PodKillInfo{Name: pod.Name, GracePeriod: gracePeriod}
				return nil
			}

			fakeShutdownChan := make(chan bool)
			fakeDbus := &fakeDbus{currentInhibitDelay: tc.systemInhibitDelay, shutdownChan: fakeShutdownChan, overrideSystemInhibitDelay: tc.overrideSystemInhibitDelay}
			systemDbus = func() (dbusInhibiter, error) {
				return fakeDbus, nil
			}
			defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, pkgfeatures.GracefulNodeShutdown, true)()

			manager := NewManager(activePodsFunc, killPodsFunc, tc.shutdownGracePeriodRequested, tc.shutdownGracePeriodCriticalPods)
			manager.clock = clock.NewFakeClock(time.Now())

			err := manager.Start()
			if tc.expectedError != nil {
				if !strings.Contains(err.Error(), tc.expectedError.Error()) {
					t.Errorf("unexpected error message. Got: %s want %s", err.Error(), tc.expectedError.Error())
				}
			} else {
				assert.NoError(t, err, "expected manager.Start() to not return error")
				assert.True(t, fakeDbus.didInhibitShutdown, "expected that manager inhibited shutdown")
				assert.NoError(t, manager.ShutdownStatus(), "expected that manager does not return error since shutdown is not active")

				// Send fake shutdown event
				fakeShutdownChan <- true

				// Wait for all the pods to be killed
				killedPodsToGracePeriods := map[string]int64{}
				for i := 0; i < len(tc.activePods); i++ {
					select {
					case podKillInfo := <-podKillChan:
						killedPodsToGracePeriods[podKillInfo.Name] = podKillInfo.GracePeriod
						continue
					case <-time.After(1 * time.Second):
						t.Fatal()
					}
				}

				assert.Error(t, manager.ShutdownStatus(), "expected that manager returns error since shutdown is active")
				assert.Equal(t, tc.expectedPodToGracePeriodOverride, killedPodsToGracePeriods)
				assert.Equal(t, tc.expectedDidOverrideInhibitDelay, fakeDbus.didOverrideInhibitDelay, "override system inhibit delay differs")
			}
		})
	}
}
pkg/kubelet/nodeshutdown/nodeshutdown_manager_others.go (new file, 43 lines)
@@ -0,0 +1,43 @@
// +build !linux

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nodeshutdown

import (
	"time"

	"k8s.io/kubernetes/pkg/kubelet/eviction"
)

// Manager is a fake node shutdown manager for non-Linux platforms.
type Manager struct{}

// NewManager returns a fake node shutdown manager for non-Linux platforms.
func NewManager(getPodsFunc eviction.ActivePodsFunc, killPodFunc eviction.KillPodFunc, shutdownGracePeriodRequested, shutdownGracePeriodCriticalPods time.Duration) *Manager {
	return &Manager{}
}

// Start is a no-op, always returning nil for non-Linux platforms.
func (m *Manager) Start() error {
	return nil
}

// ShutdownStatus is a no-op, always returning nil for non-Linux platforms.
func (m *Manager) ShutdownStatus() error {
	return nil
}
@@ -498,6 +498,7 @@ func ReadyCondition(
	storageErrorsFunc func() error, // typically Kubelet.runtimeState.storageErrors
	appArmorValidateHostFunc func() error, // typically Kubelet.appArmorValidator.ValidateHost, might be nil depending on whether there was an appArmorValidator
	cmStatusFunc func() cm.Status, // typically Kubelet.containerManager.Status
	nodeShutdownManagerErrorsFunc func() error, // typically kubelet.shutdownManager.errors.
	recordEventFunc func(eventType, event string), // typically Kubelet.recordNodeStatusEvent
) Setter {
	return func(node *v1.Node) error {
@@ -512,7 +513,7 @@ func ReadyCondition(
			Message:           "kubelet is posting ready status",
			LastHeartbeatTime: currentTime,
		}
-		errs := []error{runtimeErrorsFunc(), networkErrorsFunc(), storageErrorsFunc()}
+		errs := []error{runtimeErrorsFunc(), networkErrorsFunc(), storageErrorsFunc(), nodeShutdownManagerErrorsFunc()}
		requiredCapacities := []v1.ResourceName{v1.ResourceCPU, v1.ResourceMemory, v1.ResourcePods}
		if utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
			requiredCapacities = append(requiredCapacities, v1.ResourceEphemeralStorage)
@@ -1109,15 +1109,16 @@ func TestReadyCondition(t *testing.T) {
	}

	cases := []struct {
		desc                      string
		node                      *v1.Node
		runtimeErrors             error
		networkErrors             error
		storageErrors             error
		appArmorValidateHostFunc  func() error
		cmStatus                  cm.Status
		nodeShutdownManagerErrors error
		expectConditions          []v1.NodeCondition
		expectEvents              []testEvent
	}{
		{
			desc: "new, ready",
@@ -1154,6 +1155,12 @@ func TestReadyCondition(t *testing.T) {
			storageErrors:    errors.New("some storage error"),
			expectConditions: []v1.NodeCondition{*makeReadyCondition(false, "some storage error", now, now)},
		},
		{
			desc:                      "new, not ready: shutdown active",
			node:                      withCapacity.DeepCopy(),
			nodeShutdownManagerErrors: errors.New("node is shutting down"),
			expectConditions:          []v1.NodeCondition{*makeReadyCondition(false, "node is shutting down", now, now)},
		},
		{
			desc: "new, not ready: runtime and network errors",
			node: withCapacity.DeepCopy(),
@@ -1234,6 +1241,9 @@ func TestReadyCondition(t *testing.T) {
			cmStatusFunc := func() cm.Status {
				return tc.cmStatus
			}
			nodeShutdownErrorsFunc := func() error {
				return tc.nodeShutdownManagerErrors
			}
			events := []testEvent{}
			recordEventFunc := func(eventType, event string) {
				events = append(events, testEvent{
@@ -1242,7 +1252,7 @@ func TestReadyCondition(t *testing.T) {
				})
			}
			// construct setter
-			setter := ReadyCondition(nowFunc, runtimeErrorsFunc, networkErrorsFunc, storageErrorsFunc, tc.appArmorValidateHostFunc, cmStatusFunc, recordEventFunc)
+			setter := ReadyCondition(nowFunc, runtimeErrorsFunc, networkErrorsFunc, storageErrorsFunc, tc.appArmorValidateHostFunc, cmStatusFunc, nodeShutdownErrorsFunc, recordEventFunc)
			// call setter on node
			if err := setter(tc.node); err != nil {
				t.Fatalf("unexpected error: %v", err)
@@ -815,6 +815,15 @@ type KubeletConfiguration struct {
	// Default: true
	// +optional
	EnableSystemLogHandler *bool `json:"enableSystemLogHandler,omitempty"`
	// ShutdownGracePeriod specifies the total duration that the node should delay the shutdown and total grace period for pod termination during a node shutdown.
	// Default: "30s"
	// +optional
	ShutdownGracePeriod metav1.Duration `json:"shutdownGracePeriod,omitempty"`
	// ShutdownGracePeriodCriticalPods specifies the duration used to terminate critical pods during a node shutdown. This should be less than ShutdownGracePeriod.
	// For example, if ShutdownGracePeriod=30s, and ShutdownGracePeriodCriticalPods=10s, during a node shutdown the first 20 seconds would be reserved for gracefully terminating normal pods, and the last 10 seconds would be reserved for terminating critical pods.
	// Default: "10s"
	// +optional
	ShutdownGracePeriodCriticalPods metav1.Duration `json:"shutdownGracePeriodCriticalPods,omitempty"`
}

type KubeletAuthorizationMode string

@@ -301,6 +301,8 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
		*out = new(bool)
		**out = **in
	}
	out.ShutdownGracePeriod = in.ShutdownGracePeriod
	out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods
	return
}
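For operators, the v1beta1 fields above surface in the kubelet config file as `shutdownGracePeriod` and `shutdownGracePeriodCriticalPods`. A hedged sketch of producing such a config from Go (the staging import path k8s.io/kubelet/config/v1beta1 and the use of sigs.k8s.io/yaml are assumptions about the build environment, not part of this commit):

package main

import (
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	kubeletv1beta1 "k8s.io/kubelet/config/v1beta1"
	"sigs.k8s.io/yaml"
)

func main() {
	cfg := kubeletv1beta1.KubeletConfiguration{
		// The feature gate must be enabled for the two durations to take effect.
		FeatureGates:                    map[string]bool{"GracefulNodeShutdown": true},
		ShutdownGracePeriod:             metav1.Duration{Duration: 30 * time.Second},
		ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
	}
	cfg.APIVersion = "kubelet.config.k8s.io/v1beta1"
	cfg.Kind = "KubeletConfiguration"

	out, err := yaml.Marshal(cfg)
	if err != nil {
		panic(err)
	}
	fmt.Print(string(out)) // includes shutdownGracePeriod: 30s, shutdownGracePeriodCriticalPods: 10s
}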