From 16f71c6d47843c359e78c0eea2f34814f4cf055b Mon Sep 17 00:00:00 2001
From: David Porter
Date: Mon, 2 Nov 2020 23:18:36 +0000
Subject: [PATCH] Implement shutdown manager in kubelet

Implements KEP 2000, Graceful Node Shutdown:
https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2000-graceful-node-shutdown

* Add new FeatureGate `GracefulNodeShutdown` to control enabling/disabling
  the feature
* Add two new KubeletConfiguration options
  * `ShutdownGracePeriod` and `ShutdownGracePeriodCriticalPods`
* Add new package, `nodeshutdown`, that implements the node shutdown manager
* The node shutdown manager uses the systemd inhibit package to create a
  systemd inhibitor, monitor for node shutdown events, and gracefully
  terminate pods upon a node shutdown.
---
 pkg/features/kube_features.go                 |   6 +
 pkg/kubelet/BUILD                             |   3 +-
 pkg/kubelet/apis/config/helpers_test.go       |   2 +
 .../KubeletConfiguration/after/v1beta1.yaml   |   2 +
 .../roundtrip/default/v1beta1.yaml            |   2 +
 pkg/kubelet/apis/config/types.go              |   7 +
 .../config/v1beta1/zz_generated.conversion.go |   4 +
 .../apis/config/validation/validation.go      |  15 +
 .../apis/config/validation/validation_test.go | 233 ++++++++--------
 .../apis/config/zz_generated.deepcopy.go      |   2 +
 pkg/kubelet/kubelet.go                        |  12 +
 pkg/kubelet/kubelet_node_status.go            |   2 +-
 pkg/kubelet/nodeshutdown/BUILD                | 127 +++++++++
 .../nodeshutdown_manager_linux.go             | 255 +++++++++++++++++
 .../nodeshutdown_manager_linux_test.go        | 261 ++++++++++++++++++
 .../nodeshutdown_manager_others.go            |  43 +++
 pkg/kubelet/nodestatus/setters.go             |   3 +-
 pkg/kubelet/nodestatus/setters_test.go        |  30 +-
 .../k8s.io/kubelet/config/v1beta1/types.go    |   9 +
 .../config/v1beta1/zz_generated.deepcopy.go   |   2 +
 20 files changed, 896 insertions(+), 124 deletions(-)
 create mode 100644 pkg/kubelet/nodeshutdown/BUILD
 create mode 100644 pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go
 create mode 100644 pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go
 create mode 100644 pkg/kubelet/nodeshutdown/nodeshutdown_manager_others.go

diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go
index f79c8d40193..fd1b1d742d9 100644
--- a/pkg/features/kube_features.go
+++ b/pkg/features/kube_features.go
@@ -701,6 +701,11 @@ const (
 	// Enable kubelet to pass pod's service account token to NodePublishVolume
 	// call of CSI driver which is mounting volumes for that pod.
 	CSIServiceAccountToken featuregate.Feature = "CSIServiceAccountToken"
+
+	// owner: @bobbypage
+	// alpha: v1.20
+	// Adds support for kubelet to detect node shutdown and gracefully terminate pods prior to the node being shut down.
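+	// The feature is configured via the new ShutdownGracePeriod and
+	// ShutdownGracePeriodCriticalPods kubelet configuration options and is
+	// implemented by the new pkg/kubelet/nodeshutdown manager added in this patch.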
+ GracefulNodeShutdown featuregate.Feature = "GracefulNodeShutdown" ) func init() { @@ -806,6 +811,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS LoadBalancerIPMode: {Default: false, PreRelease: featuregate.Alpha}, ExecProbeTimeout: {Default: true, PreRelease: featuregate.GA}, // lock to default in v1.21 and remove in v1.22 KubeletCredentialProviders: {Default: false, PreRelease: featuregate.Alpha}, + GracefulNodeShutdown: {Default: false, PreRelease: featuregate.Alpha}, // inherited features from generic apiserver, relisted here to get a conflict if it is changed // unintentionally on either side: diff --git a/pkg/kubelet/BUILD b/pkg/kubelet/BUILD index 15395f6994b..54f6f0cbd8d 100644 --- a/pkg/kubelet/BUILD +++ b/pkg/kubelet/BUILD @@ -71,6 +71,7 @@ go_library( "//pkg/kubelet/metrics:go_default_library", "//pkg/kubelet/metrics/collectors:go_default_library", "//pkg/kubelet/network/dns:go_default_library", + "//pkg/kubelet/nodeshutdown:go_default_library", "//pkg/kubelet/nodestatus:go_default_library", "//pkg/kubelet/oom:go_default_library", "//pkg/kubelet/pleg:go_default_library", @@ -308,7 +309,7 @@ filegroup( "//pkg/kubelet/logs:all-srcs", "//pkg/kubelet/metrics:all-srcs", "//pkg/kubelet/network:all-srcs", - "//pkg/kubelet/nodeshutdown/systemd:all-srcs", + "//pkg/kubelet/nodeshutdown:all-srcs", "//pkg/kubelet/nodestatus:all-srcs", "//pkg/kubelet/oom:all-srcs", "//pkg/kubelet/pleg:all-srcs", diff --git a/pkg/kubelet/apis/config/helpers_test.go b/pkg/kubelet/apis/config/helpers_test.go index 560311aa4f9..aee4ba83ff5 100644 --- a/pkg/kubelet/apis/config/helpers_test.go +++ b/pkg/kubelet/apis/config/helpers_test.go @@ -234,5 +234,7 @@ var ( "TypeMeta.Kind", "VolumeStatsAggPeriod.Duration", "VolumePluginDir", + "ShutdownGracePeriod.Duration", + "ShutdownGracePeriodCriticalPods.Duration", ) ) diff --git a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml index a82e874a1e3..364c43416ae 100644 --- a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml +++ b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/after/v1beta1.yaml @@ -67,6 +67,8 @@ registryPullQPS: 5 resolvConf: /etc/resolv.conf runtimeRequestTimeout: 2m0s serializeImagePulls: true +shutdownGracePeriod: 0s +shutdownGracePeriodCriticalPods: 0s streamingConnectionIdleTimeout: 4h0m0s syncFrequency: 1m0s topologyManagerPolicy: none diff --git a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml index a82e874a1e3..364c43416ae 100644 --- a/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml +++ b/pkg/kubelet/apis/config/scheme/testdata/KubeletConfiguration/roundtrip/default/v1beta1.yaml @@ -67,6 +67,8 @@ registryPullQPS: 5 resolvConf: /etc/resolv.conf runtimeRequestTimeout: 2m0s serializeImagePulls: true +shutdownGracePeriod: 0s +shutdownGracePeriodCriticalPods: 0s streamingConnectionIdleTimeout: 4h0m0s syncFrequency: 1m0s topologyManagerPolicy: none diff --git a/pkg/kubelet/apis/config/types.go b/pkg/kubelet/apis/config/types.go index c36aec6a563..d518a6cf412 100644 --- a/pkg/kubelet/apis/config/types.go +++ b/pkg/kubelet/apis/config/types.go @@ -375,6 +375,13 @@ type KubeletConfiguration struct { Logging componentbaseconfig.LoggingConfiguration // EnableSystemLogHandler enables 
/logs handler. EnableSystemLogHandler bool + // ShutdownGracePeriod specifies the total duration that the node should delay the shutdown and total grace period for pod termination during a node shutdown. + // Defaults to 30 seconds, requires GracefulNodeShutdown feature gate to be enabled. + ShutdownGracePeriod metav1.Duration + // ShutdownGracePeriodCriticalPods specifies the duration used to terminate critical pods during a node shutdown. This should be less than ShutdownGracePeriod. + // Defaults to 10 seconds, requires GracefulNodeShutdown feature gate to be enabled. + // For example, if ShutdownGracePeriod=30s, and ShutdownGracePeriodCriticalPods=10s, during a node shutdown the first 20 seconds would be reserved for gracefully terminating normal pods, and the last 10 seconds would be reserved for terminating critical pods. + ShutdownGracePeriodCriticalPods metav1.Duration } // KubeletAuthorizationMode denotes the authorization mode for the kubelet diff --git a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go index cf33ba227c6..09aae527ecc 100644 --- a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go +++ b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go @@ -350,6 +350,8 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in if err := v1.Convert_Pointer_bool_To_bool(&in.EnableSystemLogHandler, &out.EnableSystemLogHandler, s); err != nil { return err } + out.ShutdownGracePeriod = in.ShutdownGracePeriod + out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods return nil } @@ -501,6 +503,8 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in if err := v1.Convert_bool_To_Pointer_bool(&in.EnableSystemLogHandler, &out.EnableSystemLogHandler, s); err != nil { return err } + out.ShutdownGracePeriod = in.ShutdownGracePeriod + out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods return nil } diff --git a/pkg/kubelet/apis/config/validation/validation.go b/pkg/kubelet/apis/config/validation/validation.go index f2db970ee15..56069cae10b 100644 --- a/pkg/kubelet/apis/config/validation/validation.go +++ b/pkg/kubelet/apis/config/validation/validation.go @@ -140,6 +140,21 @@ func ValidateKubeletConfiguration(kc *kubeletconfig.KubeletConfiguration) error allErrors = append(allErrors, fmt.Errorf("invalid configuration: topologyManagerScope non-allowable value: %v", kc.TopologyManagerScope)) } + if localFeatureGate.Enabled(features.GracefulNodeShutdown) { + if kc.ShutdownGracePeriod.Duration < 0 || kc.ShutdownGracePeriodCriticalPods.Duration < 0 || kc.ShutdownGracePeriodCriticalPods.Duration > kc.ShutdownGracePeriod.Duration { + allErrors = append(allErrors, fmt.Errorf("invalid configuration: ShutdownGracePeriod %v must be >= 0, ShutdownGracePeriodCriticalPods %v must be >= 0, and ShutdownGracePeriodCriticalPods %v must be <= ShutdownGracePeriod %v", kc.ShutdownGracePeriod, kc.ShutdownGracePeriodCriticalPods, kc.ShutdownGracePeriodCriticalPods, kc.ShutdownGracePeriod)) + } + if kc.ShutdownGracePeriod.Duration > 0 && kc.ShutdownGracePeriod.Duration < time.Duration(time.Second) { + allErrors = append(allErrors, fmt.Errorf("invalid configuration: ShutdownGracePeriod %v must be either zero or otherwise >= 1 sec", kc.ShutdownGracePeriod)) + } + if kc.ShutdownGracePeriodCriticalPods.Duration > 0 && kc.ShutdownGracePeriodCriticalPods.Duration < time.Duration(time.Second) { + allErrors = append(allErrors, fmt.Errorf("invalid 
configuration: ShutdownGracePeriodCriticalPods %v must be either zero or otherwise >= 1 sec", kc.ShutdownGracePeriodCriticalPods)) + } + } + if (kc.ShutdownGracePeriod.Duration > 0 || kc.ShutdownGracePeriodCriticalPods.Duration > 0) && !localFeatureGate.Enabled(features.GracefulNodeShutdown) { + allErrors = append(allErrors, fmt.Errorf("invalid configuration: Specifying ShutdownGracePeriod or ShutdownGracePeriodCriticalPods requires feature gate GracefulNodeShutdown")) + } + for _, val := range kc.EnforceNodeAllocatable { switch val { case kubetypes.NodeAllocatableEnforcementKey: diff --git a/pkg/kubelet/apis/config/validation/validation_test.go b/pkg/kubelet/apis/config/validation/validation_test.go index 72b53e971fc..9a00bda6e32 100644 --- a/pkg/kubelet/apis/config/validation/validation_test.go +++ b/pkg/kubelet/apis/config/validation/validation_test.go @@ -27,36 +27,39 @@ import ( func TestValidateKubeletConfiguration(t *testing.T) { successCase1 := &kubeletconfig.KubeletConfiguration{ - CgroupsPerQOS: true, - EnforceNodeAllocatable: []string{"pods", "system-reserved", "kube-reserved"}, - SystemReservedCgroup: "/system.slice", - KubeReservedCgroup: "/kubelet.service", - SystemCgroups: "", - CgroupRoot: "", - EventBurst: 10, - EventRecordQPS: 5, - HealthzPort: 10248, - ImageGCHighThresholdPercent: 85, - ImageGCLowThresholdPercent: 80, - IPTablesDropBit: 15, - IPTablesMasqueradeBit: 14, - KubeAPIBurst: 10, - KubeAPIQPS: 5, - MaxOpenFiles: 1000000, - MaxPods: 110, - OOMScoreAdj: -999, - PodsPerCore: 100, - Port: 65535, - ReadOnlyPort: 0, - RegistryBurst: 10, - RegistryPullQPS: 5, - HairpinMode: kubeletconfig.PromiscuousBridge, - NodeLeaseDurationSeconds: 1, - CPUCFSQuotaPeriod: metav1.Duration{Duration: 25 * time.Millisecond}, - TopologyManagerScope: kubeletconfig.PodTopologyManagerScope, - TopologyManagerPolicy: kubeletconfig.SingleNumaNodeTopologyManagerPolicy, + CgroupsPerQOS: true, + EnforceNodeAllocatable: []string{"pods", "system-reserved", "kube-reserved"}, + SystemReservedCgroup: "/system.slice", + KubeReservedCgroup: "/kubelet.service", + SystemCgroups: "", + CgroupRoot: "", + EventBurst: 10, + EventRecordQPS: 5, + HealthzPort: 10248, + ImageGCHighThresholdPercent: 85, + ImageGCLowThresholdPercent: 80, + IPTablesDropBit: 15, + IPTablesMasqueradeBit: 14, + KubeAPIBurst: 10, + KubeAPIQPS: 5, + MaxOpenFiles: 1000000, + MaxPods: 110, + OOMScoreAdj: -999, + PodsPerCore: 100, + Port: 65535, + ReadOnlyPort: 0, + RegistryBurst: 10, + RegistryPullQPS: 5, + HairpinMode: kubeletconfig.PromiscuousBridge, + NodeLeaseDurationSeconds: 1, + CPUCFSQuotaPeriod: metav1.Duration{Duration: 25 * time.Millisecond}, + TopologyManagerScope: kubeletconfig.PodTopologyManagerScope, + TopologyManagerPolicy: kubeletconfig.SingleNumaNodeTopologyManagerPolicy, + ShutdownGracePeriod: metav1.Duration{Duration: 30 * time.Second}, + ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second}, FeatureGates: map[string]bool{ "CustomCPUCFSQuotaPeriod": true, + "GracefulNodeShutdown": true, }, } if allErrors := ValidateKubeletConfiguration(successCase1); allErrors != nil { @@ -64,37 +67,40 @@ func TestValidateKubeletConfiguration(t *testing.T) { } successCase2 := &kubeletconfig.KubeletConfiguration{ - CgroupsPerQOS: true, - EnforceNodeAllocatable: []string{"pods"}, - SystemReservedCgroup: "", - KubeReservedCgroup: "", - SystemCgroups: "", - CgroupRoot: "", - EventBurst: 10, - EventRecordQPS: 5, - HealthzPort: 10248, - ImageGCHighThresholdPercent: 85, - ImageGCLowThresholdPercent: 80, - IPTablesDropBit: 
15, - IPTablesMasqueradeBit: 14, - KubeAPIBurst: 10, - KubeAPIQPS: 5, - MaxOpenFiles: 1000000, - MaxPods: 110, - OOMScoreAdj: -999, - PodsPerCore: 100, - Port: 65535, - ReadOnlyPort: 0, - RegistryBurst: 10, - RegistryPullQPS: 5, - HairpinMode: kubeletconfig.PromiscuousBridge, - NodeLeaseDurationSeconds: 1, - CPUCFSQuotaPeriod: metav1.Duration{Duration: 50 * time.Millisecond}, - ReservedSystemCPUs: "0-3", - TopologyManagerScope: kubeletconfig.ContainerTopologyManagerScope, - TopologyManagerPolicy: kubeletconfig.NoneTopologyManagerPolicy, + CgroupsPerQOS: true, + EnforceNodeAllocatable: []string{"pods"}, + SystemReservedCgroup: "", + KubeReservedCgroup: "", + SystemCgroups: "", + CgroupRoot: "", + EventBurst: 10, + EventRecordQPS: 5, + HealthzPort: 10248, + ImageGCHighThresholdPercent: 85, + ImageGCLowThresholdPercent: 80, + IPTablesDropBit: 15, + IPTablesMasqueradeBit: 14, + KubeAPIBurst: 10, + KubeAPIQPS: 5, + MaxOpenFiles: 1000000, + MaxPods: 110, + OOMScoreAdj: -999, + PodsPerCore: 100, + Port: 65535, + ReadOnlyPort: 0, + RegistryBurst: 10, + RegistryPullQPS: 5, + HairpinMode: kubeletconfig.PromiscuousBridge, + NodeLeaseDurationSeconds: 1, + CPUCFSQuotaPeriod: metav1.Duration{Duration: 50 * time.Millisecond}, + ReservedSystemCPUs: "0-3", + TopologyManagerScope: kubeletconfig.ContainerTopologyManagerScope, + TopologyManagerPolicy: kubeletconfig.NoneTopologyManagerPolicy, + ShutdownGracePeriod: metav1.Duration{Duration: 10 * time.Minute}, + ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0}, FeatureGates: map[string]bool{ "CustomCPUCFSQuotaPeriod": true, + "GracefulNodeShutdown": true, }, } if allErrors := ValidateKubeletConfiguration(successCase2); allErrors != nil { @@ -102,68 +108,73 @@ func TestValidateKubeletConfiguration(t *testing.T) { } errorCase1 := &kubeletconfig.KubeletConfiguration{ - CgroupsPerQOS: false, - EnforceNodeAllocatable: []string{"pods", "system-reserved", "kube-reserved", "illegal-key"}, - SystemCgroups: "/", - CgroupRoot: "", - EventBurst: -10, - EventRecordQPS: -10, - HealthzPort: -10, - ImageGCHighThresholdPercent: 101, - ImageGCLowThresholdPercent: 101, - IPTablesDropBit: -10, - IPTablesMasqueradeBit: -10, - KubeAPIBurst: -10, - KubeAPIQPS: -10, - MaxOpenFiles: -10, - MaxPods: -10, - OOMScoreAdj: -1001, - PodsPerCore: -10, - Port: 0, - ReadOnlyPort: -10, - RegistryBurst: -10, - RegistryPullQPS: -10, - HairpinMode: "foo", - NodeLeaseDurationSeconds: -1, - CPUCFSQuotaPeriod: metav1.Duration{Duration: 100 * time.Millisecond}, + CgroupsPerQOS: false, + EnforceNodeAllocatable: []string{"pods", "system-reserved", "kube-reserved", "illegal-key"}, + SystemCgroups: "/", + CgroupRoot: "", + EventBurst: -10, + EventRecordQPS: -10, + HealthzPort: -10, + ImageGCHighThresholdPercent: 101, + ImageGCLowThresholdPercent: 101, + IPTablesDropBit: -10, + IPTablesMasqueradeBit: -10, + KubeAPIBurst: -10, + KubeAPIQPS: -10, + MaxOpenFiles: -10, + MaxPods: -10, + OOMScoreAdj: -1001, + PodsPerCore: -10, + Port: 0, + ReadOnlyPort: -10, + RegistryBurst: -10, + RegistryPullQPS: -10, + HairpinMode: "foo", + NodeLeaseDurationSeconds: -1, + CPUCFSQuotaPeriod: metav1.Duration{Duration: 100 * time.Millisecond}, + ShutdownGracePeriod: metav1.Duration{Duration: 30 * time.Second}, + ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second}, } - const numErrsErrorCase1 = 27 + const numErrsErrorCase1 = 28 if allErrors := ValidateKubeletConfiguration(errorCase1); len(allErrors.(utilerrors.Aggregate).Errors()) != numErrsErrorCase1 { t.Errorf("expect %d errors, got %v", 
numErrsErrorCase1, len(allErrors.(utilerrors.Aggregate).Errors())) } errorCase2 := &kubeletconfig.KubeletConfiguration{ - CgroupsPerQOS: true, - EnforceNodeAllocatable: []string{"pods", "system-reserved", "kube-reserved"}, - SystemReservedCgroup: "/system.slice", - KubeReservedCgroup: "/kubelet.service", - SystemCgroups: "", - CgroupRoot: "", - EventBurst: 10, - EventRecordQPS: 5, - HealthzPort: 10248, - ImageGCHighThresholdPercent: 85, - ImageGCLowThresholdPercent: 80, - IPTablesDropBit: 15, - IPTablesMasqueradeBit: 14, - KubeAPIBurst: 10, - KubeAPIQPS: 5, - MaxOpenFiles: 1000000, - MaxPods: 110, - OOMScoreAdj: -999, - PodsPerCore: 100, - Port: 65535, - ReadOnlyPort: 0, - RegistryBurst: 10, - RegistryPullQPS: 5, - HairpinMode: kubeletconfig.PromiscuousBridge, - NodeLeaseDurationSeconds: 1, - CPUCFSQuotaPeriod: metav1.Duration{Duration: 50 * time.Millisecond}, - ReservedSystemCPUs: "0-3", - TopologyManagerScope: "invalid", - TopologyManagerPolicy: "invalid", + CgroupsPerQOS: true, + EnforceNodeAllocatable: []string{"pods", "system-reserved", "kube-reserved"}, + SystemReservedCgroup: "/system.slice", + KubeReservedCgroup: "/kubelet.service", + SystemCgroups: "", + CgroupRoot: "", + EventBurst: 10, + EventRecordQPS: 5, + HealthzPort: 10248, + ImageGCHighThresholdPercent: 85, + ImageGCLowThresholdPercent: 80, + IPTablesDropBit: 15, + IPTablesMasqueradeBit: 14, + KubeAPIBurst: 10, + KubeAPIQPS: 5, + MaxOpenFiles: 1000000, + MaxPods: 110, + OOMScoreAdj: -999, + PodsPerCore: 100, + Port: 65535, + ReadOnlyPort: 0, + RegistryBurst: 10, + RegistryPullQPS: 5, + HairpinMode: kubeletconfig.PromiscuousBridge, + NodeLeaseDurationSeconds: 1, + CPUCFSQuotaPeriod: metav1.Duration{Duration: 50 * time.Millisecond}, + ReservedSystemCPUs: "0-3", + TopologyManagerScope: "invalid", + TopologyManagerPolicy: "invalid", + ShutdownGracePeriod: metav1.Duration{Duration: 40 * time.Second}, + ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second}, FeatureGates: map[string]bool{ "CustomCPUCFSQuotaPeriod": true, + "GracefulNodeShutdown": true, }, } const numErrsErrorCase2 = 3 diff --git a/pkg/kubelet/apis/config/zz_generated.deepcopy.go b/pkg/kubelet/apis/config/zz_generated.deepcopy.go index b54cad7e5e3..e458d832294 100644 --- a/pkg/kubelet/apis/config/zz_generated.deepcopy.go +++ b/pkg/kubelet/apis/config/zz_generated.deepcopy.go @@ -271,6 +271,8 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { copy(*out, *in) } out.Logging = in.Logging + out.ShutdownGracePeriod = in.ShutdownGracePeriod + out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods return } diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 646521a30b3..c5c6f0c9e4b 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -84,6 +84,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/metrics" "k8s.io/kubernetes/pkg/kubelet/metrics/collectors" "k8s.io/kubernetes/pkg/kubelet/network/dns" + "k8s.io/kubernetes/pkg/kubelet/nodeshutdown" oomwatcher "k8s.io/kubernetes/pkg/kubelet/oom" "k8s.io/kubernetes/pkg/kubelet/pleg" "k8s.io/kubernetes/pkg/kubelet/pluginmanager" @@ -794,6 +795,8 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, v1.NamespaceNodeLease, util.SetNodeOwnerFunc(klet.heartbeatClient, string(klet.nodeName))) + klet.shutdownManager = nodeshutdown.NewManager(klet.GetActivePods, killPodNow(klet.podWorkers, kubeDeps.Recorder), kubeCfg.ShutdownGracePeriod.Duration, kubeCfg.ShutdownGracePeriodCriticalPods.Duration) + // Finally, put the most recent 
version of the config on the Kubelet, so // people can see how it was configured. klet.kubeletConfiguration = *kubeCfg @@ -1137,6 +1140,9 @@ type Kubelet struct { // Handles RuntimeClass objects for the Kubelet. runtimeClassManager *runtimeclass.Manager + + // Handles node shutdown events for the Node. + shutdownManager *nodeshutdown.Manager } // ListPodStats is delegated to StatsProvider, which implements stats.Provider interface @@ -1353,6 +1359,12 @@ func (kl *Kubelet) initializeRuntimeDependentModules() { // Start the plugin manager klog.V(4).Infof("starting plugin manager") go kl.pluginManager.Run(kl.sourcesReady, wait.NeverStop) + + err = kl.shutdownManager.Start() + if err != nil { + // The shutdown manager is not critical for kubelet, so log failure, but don't block Kubelet startup if there was a failure starting it. + klog.Errorf("Failed to start node shutdown manager: %v", err) + } } // Run starts the kubelet reacting to config updates diff --git a/pkg/kubelet/kubelet_node_status.go b/pkg/kubelet/kubelet_node_status.go index 531633f02ae..cfc78297b0e 100644 --- a/pkg/kubelet/kubelet_node_status.go +++ b/pkg/kubelet/kubelet_node_status.go @@ -600,7 +600,7 @@ func (kl *Kubelet) defaultNodeStatusFuncs() []func(*v1.Node) error { nodestatus.MemoryPressureCondition(kl.clock.Now, kl.evictionManager.IsUnderMemoryPressure, kl.recordNodeStatusEvent), nodestatus.DiskPressureCondition(kl.clock.Now, kl.evictionManager.IsUnderDiskPressure, kl.recordNodeStatusEvent), nodestatus.PIDPressureCondition(kl.clock.Now, kl.evictionManager.IsUnderPIDPressure, kl.recordNodeStatusEvent), - nodestatus.ReadyCondition(kl.clock.Now, kl.runtimeState.runtimeErrors, kl.runtimeState.networkErrors, kl.runtimeState.storageErrors, validateHostFunc, kl.containerManager.Status, kl.recordNodeStatusEvent), + nodestatus.ReadyCondition(kl.clock.Now, kl.runtimeState.runtimeErrors, kl.runtimeState.networkErrors, kl.runtimeState.storageErrors, validateHostFunc, kl.containerManager.Status, kl.shutdownManager.ShutdownStatus, kl.recordNodeStatusEvent), nodestatus.VolumesInUse(kl.volumeManager.ReconcilerStatesHasBeenSynced, kl.volumeManager.GetVolumesInUse), // TODO(mtaufen): I decided not to move this setter for now, since all it does is send an event // and record state back to the Kubelet runtime object. 
In the future, I'd like to isolate diff --git a/pkg/kubelet/nodeshutdown/BUILD b/pkg/kubelet/nodeshutdown/BUILD new file mode 100644 index 00000000000..125c4c43c82 --- /dev/null +++ b/pkg/kubelet/nodeshutdown/BUILD @@ -0,0 +1,127 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "go_default_library", + srcs = [ + "nodeshutdown_manager_linux.go", + "nodeshutdown_manager_others.go", + ], + importpath = "k8s.io/kubernetes/pkg/kubelet/nodeshutdown", + visibility = ["//visibility:public"], + deps = select({ + "@io_bazel_rules_go//go/platform:aix": [ + "//pkg/kubelet/eviction:go_default_library", + ], + "@io_bazel_rules_go//go/platform:android": [ + "//pkg/features:go_default_library", + "//pkg/kubelet/eviction:go_default_library", + "//pkg/kubelet/nodeshutdown/systemd:go_default_library", + "//pkg/kubelet/types:go_default_library", + "//pkg/kubelet/util/format:go_default_library", + "//staging/src/k8s.io/api/core/v1:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/util/clock:go_default_library", + "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", + "//vendor/github.com/godbus/dbus/v5:go_default_library", + "//vendor/k8s.io/klog/v2:go_default_library", + ], + "@io_bazel_rules_go//go/platform:darwin": [ + "//pkg/kubelet/eviction:go_default_library", + ], + "@io_bazel_rules_go//go/platform:dragonfly": [ + "//pkg/kubelet/eviction:go_default_library", + ], + "@io_bazel_rules_go//go/platform:freebsd": [ + "//pkg/kubelet/eviction:go_default_library", + ], + "@io_bazel_rules_go//go/platform:illumos": [ + "//pkg/kubelet/eviction:go_default_library", + ], + "@io_bazel_rules_go//go/platform:ios": [ + "//pkg/kubelet/eviction:go_default_library", + ], + "@io_bazel_rules_go//go/platform:js": [ + "//pkg/kubelet/eviction:go_default_library", + ], + "@io_bazel_rules_go//go/platform:linux": [ + "//pkg/features:go_default_library", + "//pkg/kubelet/eviction:go_default_library", + "//pkg/kubelet/nodeshutdown/systemd:go_default_library", + "//pkg/kubelet/types:go_default_library", + "//pkg/kubelet/util/format:go_default_library", + "//staging/src/k8s.io/api/core/v1:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/util/clock:go_default_library", + "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", + "//vendor/github.com/godbus/dbus/v5:go_default_library", + "//vendor/k8s.io/klog/v2:go_default_library", + ], + "@io_bazel_rules_go//go/platform:nacl": [ + "//pkg/kubelet/eviction:go_default_library", + ], + "@io_bazel_rules_go//go/platform:netbsd": [ + "//pkg/kubelet/eviction:go_default_library", + ], + "@io_bazel_rules_go//go/platform:openbsd": [ + "//pkg/kubelet/eviction:go_default_library", + ], + "@io_bazel_rules_go//go/platform:plan9": [ + "//pkg/kubelet/eviction:go_default_library", + ], + "@io_bazel_rules_go//go/platform:solaris": [ + "//pkg/kubelet/eviction:go_default_library", + ], + "@io_bazel_rules_go//go/platform:windows": [ + "//pkg/kubelet/eviction:go_default_library", + ], + "//conditions:default": [], + }), +) + +filegroup( + name = "package-srcs", + srcs = glob(["**"]), + tags = ["automanaged"], + visibility = ["//visibility:private"], +) + +filegroup( + name = "all-srcs", + srcs = [ + ":package-srcs", + "//pkg/kubelet/nodeshutdown/systemd:all-srcs", + ], + tags = ["automanaged"], + visibility = ["//visibility:public"], +) + +go_test( + name = "go_default_test", + srcs = ["nodeshutdown_manager_linux_test.go"], + embed = [":go_default_library"], + deps = select({ + 
"@io_bazel_rules_go//go/platform:android": [ + "//pkg/apis/scheduling:go_default_library", + "//pkg/features:go_default_library", + "//pkg/kubelet/nodeshutdown/systemd:go_default_library", + "//staging/src/k8s.io/api/core/v1:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/types:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/util/clock:go_default_library", + "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", + "//staging/src/k8s.io/component-base/featuregate/testing:go_default_library", + "//vendor/github.com/stretchr/testify/assert:go_default_library", + ], + "@io_bazel_rules_go//go/platform:linux": [ + "//pkg/apis/scheduling:go_default_library", + "//pkg/features:go_default_library", + "//pkg/kubelet/nodeshutdown/systemd:go_default_library", + "//staging/src/k8s.io/api/core/v1:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/types:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/util/clock:go_default_library", + "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", + "//staging/src/k8s.io/component-base/featuregate/testing:go_default_library", + "//vendor/github.com/stretchr/testify/assert:go_default_library", + ], + "//conditions:default": [], + }), +) diff --git a/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go new file mode 100644 index 00000000000..43adac645fc --- /dev/null +++ b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux.go @@ -0,0 +1,255 @@ +// +build linux + +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package nodeshutdown can watch for node level shutdown events and trigger graceful termination of pods running on the node prior to a system shutdown. 
+package nodeshutdown
+
+import (
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/godbus/dbus/v5"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/util/clock"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
+	"k8s.io/klog/v2"
+	"k8s.io/kubernetes/pkg/features"
+	"k8s.io/kubernetes/pkg/kubelet/eviction"
+	"k8s.io/kubernetes/pkg/kubelet/nodeshutdown/systemd"
+	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
+	"k8s.io/kubernetes/pkg/kubelet/util/format"
+)
+
+const (
+	nodeShutdownReason  = "Shutdown"
+	nodeShutdownMessage = "Node is shutting down, evicting pods"
+)
+
+var systemDbus = func() (dbusInhibiter, error) {
+	bus, err := dbus.SystemBus()
+	if err != nil {
+		return nil, err
+	}
+	return &systemd.DBusCon{SystemBus: bus}, nil
+}
+
+type dbusInhibiter interface {
+	CurrentInhibitDelay() (time.Duration, error)
+	InhibitShutdown() (systemd.InhibitLock, error)
+	ReleaseInhibitLock(lock systemd.InhibitLock) error
+	ReloadLogindConf() error
+	MonitorShutdown() (<-chan bool, error)
+	OverrideInhibitDelay(inhibitDelayMax time.Duration) error
+}
+
+// Manager has functions that can be used to interact with the Node Shutdown Manager.
+type Manager struct {
+	shutdownGracePeriodRequested    time.Duration
+	shutdownGracePeriodCriticalPods time.Duration
+
+	getPods eviction.ActivePodsFunc
+	killPod eviction.KillPodFunc
+
+	dbusCon     dbusInhibiter
+	inhibitLock systemd.InhibitLock
+
+	nodeShuttingDownMutex sync.Mutex
+	nodeShuttingDownNow   bool
+
+	clock clock.Clock
+}
+
+// NewManager returns a new node shutdown manager.
+func NewManager(getPodsFunc eviction.ActivePodsFunc, killPodFunc eviction.KillPodFunc, shutdownGracePeriodRequested, shutdownGracePeriodCriticalPods time.Duration) *Manager {
+	return &Manager{
+		getPods:                         getPodsFunc,
+		killPod:                         killPodFunc,
+		shutdownGracePeriodRequested:    shutdownGracePeriodRequested,
+		shutdownGracePeriodCriticalPods: shutdownGracePeriodCriticalPods,
+		clock:                           clock.RealClock{},
+	}
+}
+
+// Start starts the node shutdown manager and begins watching the node for shutdown events.
+func (m *Manager) Start() error {
+	if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdown) {
+		return nil
+	}
+	if m.shutdownGracePeriodRequested == 0 {
+		return nil
+	}
+
+	systemBus, err := systemDbus()
+	if err != nil {
+		return err
+	}
+	m.dbusCon = systemBus
+
+	currentInhibitDelay, err := m.dbusCon.CurrentInhibitDelay()
+	if err != nil {
+		return err
+	}
+
+	// If logind's InhibitDelayMaxSec (as configured in logind.conf) is less than shutdownGracePeriodRequested, attempt to update the value to shutdownGracePeriodRequested.
+	if m.shutdownGracePeriodRequested > currentInhibitDelay {
+		err := m.dbusCon.OverrideInhibitDelay(m.shutdownGracePeriodRequested)
+		if err != nil {
+			return fmt.Errorf("unable to override inhibit delay by shutdown manager: %v", err)
+		}
+
+		err = m.dbusCon.ReloadLogindConf()
+		if err != nil {
+			return err
+		}
+
+		// Read the current inhibit delay again; if the override was successful, currentInhibitDelay will be equal to shutdownGracePeriodRequested.
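+		// Re-reading guards against the override silently failing to take
+		// effect (for example, if logind could not be updated); in that case
+		// Start() returns an error below instead of assuming the full grace
+		// period is available.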
+		updatedInhibitDelay, err := m.dbusCon.CurrentInhibitDelay()
+		if err != nil {
+			return err
+		}
+
+		if updatedInhibitDelay != m.shutdownGracePeriodRequested {
+			return fmt.Errorf("node shutdown manager was unable to update logind InhibitDelayMaxSec to %v (ShutdownGracePeriod), current value of InhibitDelayMaxSec (%v) is less than requested ShutdownGracePeriod", m.shutdownGracePeriodRequested, updatedInhibitDelay)
+		}
+	}
+
+	err = m.acquireInhibitLock()
+	if err != nil {
+		return err
+	}
+
+	events, err := m.dbusCon.MonitorShutdown()
+	if err != nil {
+		releaseErr := m.dbusCon.ReleaseInhibitLock(m.inhibitLock)
+		if releaseErr != nil {
+			return fmt.Errorf("failed releasing inhibitLock: %v and failed monitoring shutdown: %v", releaseErr, err)
+		}
+		return fmt.Errorf("failed to monitor shutdown: %v", err)
+	}
+
+	go func() {
+		// Monitor for shutdown events. This follows the logind Inhibit Delay pattern described on https://www.freedesktop.org/wiki/Software/systemd/inhibit/
+		// 1. When the shutdown manager starts, an inhibit lock is taken.
+		// 2. When a shutdown(true) event is received, process the shutdown and release the inhibit lock.
+		// 3. When a shutdown(false) event is received, a previous shutdown was cancelled; in this case, acquire the inhibit lock again.
+		for {
+			select {
+			case isShuttingDown := <-events:
+				klog.V(1).Infof("Shutdown manager detected new shutdown event, isNodeShuttingDownNow: %t", isShuttingDown)
+
+				m.nodeShuttingDownMutex.Lock()
+				m.nodeShuttingDownNow = isShuttingDown
+				m.nodeShuttingDownMutex.Unlock()
+
+				if isShuttingDown {
+					m.processShutdownEvent()
+				} else {
+					m.acquireInhibitLock()
+				}
+			}
+		}
+	}()
+	return nil
+}
+
+func (m *Manager) acquireInhibitLock() error {
+	lock, err := m.dbusCon.InhibitShutdown()
+	if err != nil {
+		return err
+	}
+	m.inhibitLock = lock
+	return nil
+}
+
+// ShutdownStatus will return an error if the node is currently shutting down.
+func (m *Manager) ShutdownStatus() error {
+	if !utilfeature.DefaultFeatureGate.Enabled(features.GracefulNodeShutdown) {
+		return nil
+	}
+
+	m.nodeShuttingDownMutex.Lock()
+	defer m.nodeShuttingDownMutex.Unlock()
+
+	if m.nodeShuttingDownNow {
+		return fmt.Errorf("node is shutting down")
+	}
+	return nil
+}
+
+func (m *Manager) processShutdownEvent() error {
+	klog.V(1).Infof("Shutdown manager processing shutdown event")
+	activePods := m.getPods()
+
+	nonCriticalPodGracePeriod := m.shutdownGracePeriodRequested - m.shutdownGracePeriodCriticalPods
+
+	var wg sync.WaitGroup
+	wg.Add(len(activePods))
+	for _, pod := range activePods {
+		go func(pod *v1.Pod) {
+			defer wg.Done()
+
+			var gracePeriodOverride int64
+			if kubelettypes.IsCriticalPod(pod) {
+				gracePeriodOverride = int64(m.shutdownGracePeriodCriticalPods.Seconds())
+				m.clock.Sleep(nonCriticalPodGracePeriod)
+			} else {
+				gracePeriodOverride = int64(nonCriticalPodGracePeriod.Seconds())
+			}
+
+			// If the pod's spec specifies a termination gracePeriod which is less than or equal to the calculated gracePeriodOverride, use the pod spec termination gracePeriod.
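+			// For example, with shutdownGracePeriodRequested=30s and
+			// shutdownGracePeriodCriticalPods=10s, a normal pod is killed
+			// immediately with a 20s grace period, while a critical pod first
+			// waits out those 20s and is then killed with a 10s grace period.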
+			if pod.Spec.TerminationGracePeriodSeconds != nil && *pod.Spec.TerminationGracePeriodSeconds <= gracePeriodOverride {
+				gracePeriodOverride = *pod.Spec.TerminationGracePeriodSeconds
+			}
+
+			klog.V(1).Infof("Shutdown manager killing pod %q with gracePeriod: %v seconds", format.Pod(pod), gracePeriodOverride)
+
+			status := v1.PodStatus{
+				Phase:   v1.PodFailed,
+				Message: nodeShutdownMessage,
+				Reason:  nodeShutdownReason,
+			}
+
+			err := m.killPod(pod, status, &gracePeriodOverride)
+			if err != nil {
+				klog.V(1).Infof("Shutdown manager failed killing pod %q: %v", format.Pod(pod), err)
+			} else {
+				klog.V(1).Infof("Shutdown manager finished killing pod %q", format.Pod(pod))
+			}
+		}(pod)
+	}
+
+	c := make(chan struct{})
+	go func() {
+		defer close(c)
+		wg.Wait()
+	}()
+
+	// We want to ensure that inhibitLock is released, so only wait up to the shutdownGracePeriodRequested timeout.
+	select {
+	case <-c:
+		break
+	case <-time.After(m.shutdownGracePeriodRequested):
+		klog.V(1).Infof("Shutdown manager pod killing did not complete in %v", m.shutdownGracePeriodRequested)
+	}
+
+	m.dbusCon.ReleaseInhibitLock(m.inhibitLock)
+	klog.V(1).Infof("Shutdown manager completed processing shutdown event, node will shut down shortly")
+
+	return nil
+}
diff --git a/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go
new file mode 100644
index 00000000000..ba3512791b6
--- /dev/null
+++ b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_linux_test.go
@@ -0,0 +1,261 @@
+// +build linux
+
+/*
+Copyright 2020 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package nodeshutdown + +import ( + "fmt" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/clock" + utilfeature "k8s.io/apiserver/pkg/util/feature" + featuregatetesting "k8s.io/component-base/featuregate/testing" + "k8s.io/kubernetes/pkg/apis/scheduling" + pkgfeatures "k8s.io/kubernetes/pkg/features" + "k8s.io/kubernetes/pkg/kubelet/nodeshutdown/systemd" +) + +type fakeDbus struct { + currentInhibitDelay time.Duration + overrideSystemInhibitDelay time.Duration + shutdownChan chan bool + + didInhibitShutdown bool + didOverrideInhibitDelay bool +} + +func (f *fakeDbus) CurrentInhibitDelay() (time.Duration, error) { + if f.didOverrideInhibitDelay { + return f.overrideSystemInhibitDelay, nil + } + return f.currentInhibitDelay, nil +} + +func (f *fakeDbus) InhibitShutdown() (systemd.InhibitLock, error) { + f.didInhibitShutdown = true + return systemd.InhibitLock(0), nil +} + +func (f *fakeDbus) ReleaseInhibitLock(lock systemd.InhibitLock) error { + return nil +} + +func (f *fakeDbus) ReloadLogindConf() error { + return nil +} + +func (f *fakeDbus) MonitorShutdown() (<-chan bool, error) { + return f.shutdownChan, nil +} + +func (f *fakeDbus) OverrideInhibitDelay(inhibitDelayMax time.Duration) error { + f.didOverrideInhibitDelay = true + return nil +} + +func makePod(name string, criticalPod bool, terminationGracePeriod *int64) *v1.Pod { + var priority int32 + if criticalPod { + priority = scheduling.SystemCriticalPriority + } + + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + UID: types.UID(name), + }, + Spec: v1.PodSpec{ + Priority: &priority, + TerminationGracePeriodSeconds: terminationGracePeriod, + }, + } +} + +func TestManager(t *testing.T) { + normalPodNoGracePeriod := makePod("normal-pod-nil-grace-period", false /* criticalPod */, nil /* terminationGracePeriod */) + criticalPodNoGracePeriod := makePod("critical-pod-nil-grace-period", true /* criticalPod */, nil /* terminationGracePeriod */) + + shortGracePeriod := int64(2) + normalPodGracePeriod := makePod("normal-pod-grace-period", false /* criticalPod */, &shortGracePeriod /* terminationGracePeriod */) + criticalPodGracePeriod := makePod("critical-pod-grace-period", true /* criticalPod */, &shortGracePeriod /* terminationGracePeriod */) + + longGracePeriod := int64(1000) + normalPodLongGracePeriod := makePod("normal-pod-long-grace-period", false /* criticalPod */, &longGracePeriod /* terminationGracePeriod */) + + var tests = []struct { + desc string + activePods []*v1.Pod + shutdownGracePeriodRequested time.Duration + shutdownGracePeriodCriticalPods time.Duration + systemInhibitDelay time.Duration + overrideSystemInhibitDelay time.Duration + expectedDidOverrideInhibitDelay bool + expectedPodToGracePeriodOverride map[string]int64 + expectedError error + }{ + { + desc: "no override (total=30s, critical=10s)", + activePods: []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod}, + shutdownGracePeriodRequested: time.Duration(30 * time.Second), + shutdownGracePeriodCriticalPods: time.Duration(10 * time.Second), + systemInhibitDelay: time.Duration(40 * time.Second), + overrideSystemInhibitDelay: time.Duration(40 * time.Second), + expectedDidOverrideInhibitDelay: false, + expectedPodToGracePeriodOverride: map[string]int64{"normal-pod-nil-grace-period": 20, "critical-pod-nil-grace-period": 10}, + }, + { + desc: "no override (total=30s, critical=10s) 
pods with terminationGracePeriod and without", + activePods: []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod, normalPodGracePeriod, criticalPodGracePeriod}, + shutdownGracePeriodRequested: time.Duration(30 * time.Second), + shutdownGracePeriodCriticalPods: time.Duration(10 * time.Second), + systemInhibitDelay: time.Duration(40 * time.Second), + overrideSystemInhibitDelay: time.Duration(40 * time.Second), + expectedDidOverrideInhibitDelay: false, + expectedPodToGracePeriodOverride: map[string]int64{"normal-pod-nil-grace-period": 20, "critical-pod-nil-grace-period": 10, "normal-pod-grace-period": 2, "critical-pod-grace-period": 2}, + }, + { + desc: "no override (total=30s, critical=10s) pod with long terminationGracePeriod is overridden", + activePods: []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod, normalPodGracePeriod, criticalPodGracePeriod, normalPodLongGracePeriod}, + shutdownGracePeriodRequested: time.Duration(30 * time.Second), + shutdownGracePeriodCriticalPods: time.Duration(10 * time.Second), + systemInhibitDelay: time.Duration(40 * time.Second), + overrideSystemInhibitDelay: time.Duration(40 * time.Second), + expectedDidOverrideInhibitDelay: false, + expectedPodToGracePeriodOverride: map[string]int64{"normal-pod-nil-grace-period": 20, "critical-pod-nil-grace-period": 10, "normal-pod-grace-period": 2, "critical-pod-grace-period": 2, "normal-pod-long-grace-period": 20}, + }, + { + desc: "no override (total=30, critical=0)", + activePods: []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod}, + shutdownGracePeriodRequested: time.Duration(30 * time.Second), + shutdownGracePeriodCriticalPods: time.Duration(0 * time.Second), + systemInhibitDelay: time.Duration(40 * time.Second), + overrideSystemInhibitDelay: time.Duration(40 * time.Second), + expectedDidOverrideInhibitDelay: false, + expectedPodToGracePeriodOverride: map[string]int64{"normal-pod-nil-grace-period": 30, "critical-pod-nil-grace-period": 0}, + }, + { + desc: "override successful (total=30, critical=10)", + activePods: []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod}, + shutdownGracePeriodRequested: time.Duration(30 * time.Second), + shutdownGracePeriodCriticalPods: time.Duration(10 * time.Second), + systemInhibitDelay: time.Duration(5 * time.Second), + overrideSystemInhibitDelay: time.Duration(30 * time.Second), + expectedDidOverrideInhibitDelay: true, + expectedPodToGracePeriodOverride: map[string]int64{"normal-pod-nil-grace-period": 20, "critical-pod-nil-grace-period": 10}, + }, + { + desc: "override unsuccessful", + activePods: []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod}, + shutdownGracePeriodRequested: time.Duration(30 * time.Second), + shutdownGracePeriodCriticalPods: time.Duration(10 * time.Second), + systemInhibitDelay: time.Duration(5 * time.Second), + overrideSystemInhibitDelay: time.Duration(5 * time.Second), + expectedDidOverrideInhibitDelay: true, + expectedPodToGracePeriodOverride: map[string]int64{"normal-pod-nil-grace-period": 5, "critical-pod-nil-grace-period": 0}, + expectedError: fmt.Errorf("unable to update logind InhibitDelayMaxSec to 30s (ShutdownGracePeriod), current value of InhibitDelayMaxSec (5s) is less than requested ShutdownGracePeriod"), + }, + { + desc: "override unsuccessful, zero time", + activePods: []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod}, + shutdownGracePeriodRequested: time.Duration(5 * time.Second), + shutdownGracePeriodCriticalPods: time.Duration(5 * time.Second), + systemInhibitDelay: time.Duration(0 * 
time.Second), + overrideSystemInhibitDelay: time.Duration(0 * time.Second), + expectedError: fmt.Errorf("unable to update logind InhibitDelayMaxSec to 5s (ShutdownGracePeriod), current value of InhibitDelayMaxSec (0s) is less than requested ShutdownGracePeriod"), + }, + { + desc: "no override, all time to critical pods", + activePods: []*v1.Pod{normalPodNoGracePeriod, criticalPodNoGracePeriod}, + shutdownGracePeriodRequested: time.Duration(5 * time.Second), + shutdownGracePeriodCriticalPods: time.Duration(5 * time.Second), + systemInhibitDelay: time.Duration(5 * time.Second), + overrideSystemInhibitDelay: time.Duration(5 * time.Second), + expectedDidOverrideInhibitDelay: false, + expectedPodToGracePeriodOverride: map[string]int64{"normal-pod-nil-grace-period": 0, "critical-pod-nil-grace-period": 5}, + }, + } + + for _, tc := range tests { + t.Run(tc.desc, func(t *testing.T) { + activePodsFunc := func() []*v1.Pod { + return tc.activePods + } + + type PodKillInfo struct { + Name string + GracePeriod int64 + } + + podKillChan := make(chan PodKillInfo) + killPodsFunc := func(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int64) error { + var gracePeriod int64 + if gracePeriodOverride != nil { + gracePeriod = *gracePeriodOverride + } + podKillChan <- PodKillInfo{Name: pod.Name, GracePeriod: gracePeriod} + return nil + } + + fakeShutdownChan := make(chan bool) + fakeDbus := &fakeDbus{currentInhibitDelay: tc.systemInhibitDelay, shutdownChan: fakeShutdownChan, overrideSystemInhibitDelay: tc.overrideSystemInhibitDelay} + systemDbus = func() (dbusInhibiter, error) { + return fakeDbus, nil + } + defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, pkgfeatures.GracefulNodeShutdown, true)() + + manager := NewManager(activePodsFunc, killPodsFunc, tc.shutdownGracePeriodRequested, tc.shutdownGracePeriodCriticalPods) + manager.clock = clock.NewFakeClock(time.Now()) + + err := manager.Start() + if tc.expectedError != nil { + if !strings.Contains(err.Error(), tc.expectedError.Error()) { + t.Errorf("unexpected error message. Got: %s want %s", err.Error(), tc.expectedError.Error()) + } + } else { + assert.NoError(t, err, "expected manager.Start() to not return error") + assert.True(t, fakeDbus.didInhibitShutdown, "expected that manager inhibited shutdown") + assert.NoError(t, manager.ShutdownStatus(), "expected that manager does not return error since shutdown is not active") + + // Send fake shutdown event + fakeShutdownChan <- true + + // Wait for all the pods to be killed + killedPodsToGracePeriods := map[string]int64{} + for i := 0; i < len(tc.activePods); i++ { + select { + case podKillInfo := <-podKillChan: + killedPodsToGracePeriods[podKillInfo.Name] = podKillInfo.GracePeriod + continue + case <-time.After(1 * time.Second): + t.Fatal() + } + } + + assert.Error(t, manager.ShutdownStatus(), "expected that manager returns error since shutdown is active") + assert.Equal(t, tc.expectedPodToGracePeriodOverride, killedPodsToGracePeriods) + assert.Equal(t, tc.expectedDidOverrideInhibitDelay, fakeDbus.didOverrideInhibitDelay, "override system inhibit delay differs") + } + }) + } +} diff --git a/pkg/kubelet/nodeshutdown/nodeshutdown_manager_others.go b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_others.go new file mode 100644 index 00000000000..42ea8ba83f3 --- /dev/null +++ b/pkg/kubelet/nodeshutdown/nodeshutdown_manager_others.go @@ -0,0 +1,43 @@ +// +build !linux + +/* +Copyright 2020 The Kubernetes Authors. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package nodeshutdown
+
+import (
+	"time"
+
+	"k8s.io/kubernetes/pkg/kubelet/eviction"
+)
+
+// Manager is a fake node shutdown manager for non-Linux platforms.
+type Manager struct{}
+
+// NewManager returns a fake node shutdown manager for non-Linux platforms.
+func NewManager(getPodsFunc eviction.ActivePodsFunc, killPodFunc eviction.KillPodFunc, shutdownGracePeriodRequested, shutdownGracePeriodCriticalPods time.Duration) *Manager {
+	return &Manager{}
+}
+
+// Start is a no-op, always returning nil on non-Linux platforms.
+func (m *Manager) Start() error {
+	return nil
+}
+
+// ShutdownStatus is a no-op, always returning nil on non-Linux platforms.
+func (m *Manager) ShutdownStatus() error {
+	return nil
+}
diff --git a/pkg/kubelet/nodestatus/setters.go b/pkg/kubelet/nodestatus/setters.go
index ad824d638ee..71275f1510b 100644
--- a/pkg/kubelet/nodestatus/setters.go
+++ b/pkg/kubelet/nodestatus/setters.go
@@ -498,6 +498,7 @@ func ReadyCondition(
 	storageErrorsFunc func() error, // typically Kubelet.runtimeState.storageErrors
 	appArmorValidateHostFunc func() error, // typically Kubelet.appArmorValidator.ValidateHost, might be nil depending on whether there was an appArmorValidator
 	cmStatusFunc func() cm.Status, // typically Kubelet.containerManager.Status
+	nodeShutdownManagerErrorsFunc func() error, // typically Kubelet.shutdownManager.ShutdownStatus
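+	// (a non-nil error from the shutdown manager marks the node NotReady while it is shutting down)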
recordEventFunc func(eventType, event string), // typically Kubelet.recordNodeStatusEvent ) Setter { return func(node *v1.Node) error { @@ -512,7 +513,7 @@ func ReadyCondition( Message: "kubelet is posting ready status", LastHeartbeatTime: currentTime, } - errs := []error{runtimeErrorsFunc(), networkErrorsFunc(), storageErrorsFunc()} + errs := []error{runtimeErrorsFunc(), networkErrorsFunc(), storageErrorsFunc(), nodeShutdownManagerErrorsFunc()} requiredCapacities := []v1.ResourceName{v1.ResourceCPU, v1.ResourceMemory, v1.ResourcePods} if utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) { requiredCapacities = append(requiredCapacities, v1.ResourceEphemeralStorage) diff --git a/pkg/kubelet/nodestatus/setters_test.go b/pkg/kubelet/nodestatus/setters_test.go index 80d3223904c..0547446adba 100644 --- a/pkg/kubelet/nodestatus/setters_test.go +++ b/pkg/kubelet/nodestatus/setters_test.go @@ -1109,15 +1109,16 @@ func TestReadyCondition(t *testing.T) { } cases := []struct { - desc string - node *v1.Node - runtimeErrors error - networkErrors error - storageErrors error - appArmorValidateHostFunc func() error - cmStatus cm.Status - expectConditions []v1.NodeCondition - expectEvents []testEvent + desc string + node *v1.Node + runtimeErrors error + networkErrors error + storageErrors error + appArmorValidateHostFunc func() error + cmStatus cm.Status + nodeShutdownManagerErrors error + expectConditions []v1.NodeCondition + expectEvents []testEvent }{ { desc: "new, ready", @@ -1154,6 +1155,12 @@ func TestReadyCondition(t *testing.T) { storageErrors: errors.New("some storage error"), expectConditions: []v1.NodeCondition{*makeReadyCondition(false, "some storage error", now, now)}, }, + { + desc: "new, not ready: shutdown active", + node: withCapacity.DeepCopy(), + nodeShutdownManagerErrors: errors.New("node is shutting down"), + expectConditions: []v1.NodeCondition{*makeReadyCondition(false, "node is shutting down", now, now)}, + }, { desc: "new, not ready: runtime and network errors", node: withCapacity.DeepCopy(), @@ -1234,6 +1241,9 @@ func TestReadyCondition(t *testing.T) { cmStatusFunc := func() cm.Status { return tc.cmStatus } + nodeShutdownErrorsFunc := func() error { + return tc.nodeShutdownManagerErrors + } events := []testEvent{} recordEventFunc := func(eventType, event string) { events = append(events, testEvent{ @@ -1242,7 +1252,7 @@ func TestReadyCondition(t *testing.T) { }) } // construct setter - setter := ReadyCondition(nowFunc, runtimeErrorsFunc, networkErrorsFunc, storageErrorsFunc, tc.appArmorValidateHostFunc, cmStatusFunc, recordEventFunc) + setter := ReadyCondition(nowFunc, runtimeErrorsFunc, networkErrorsFunc, storageErrorsFunc, tc.appArmorValidateHostFunc, cmStatusFunc, nodeShutdownErrorsFunc, recordEventFunc) // call setter on node if err := setter(tc.node); err != nil { t.Fatalf("unexpected error: %v", err) diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/types.go b/staging/src/k8s.io/kubelet/config/v1beta1/types.go index ff3dc464f44..2c35b1f2403 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/types.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/types.go @@ -815,6 +815,15 @@ type KubeletConfiguration struct { // Default: true // +optional EnableSystemLogHandler *bool `json:"enableSystemLogHandler,omitempty"` + // ShutdownGracePeriod specifies the total duration that the node should delay the shutdown and total grace period for pod termination during a node shutdown. 
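+	// The value must be either zero or at least 1s, and non-zero values are
+	// rejected by kubelet config validation unless the GracefulNodeShutdown
+	// feature gate is enabled.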
+	// Default: "0s"
+	// +optional
+	ShutdownGracePeriod metav1.Duration `json:"shutdownGracePeriod,omitempty"`
+	// ShutdownGracePeriodCriticalPods specifies the duration used to terminate critical pods during a node shutdown. This should be less than ShutdownGracePeriod.
+	// For example, if ShutdownGracePeriod=30s, and ShutdownGracePeriodCriticalPods=10s, during a node shutdown the first 20 seconds would be reserved for gracefully terminating normal pods, and the last 10 seconds would be reserved for terminating critical pods.
+	// Default: "0s"
+	// +optional
+	ShutdownGracePeriodCriticalPods metav1.Duration `json:"shutdownGracePeriodCriticalPods,omitempty"`
 }
 
 type KubeletAuthorizationMode string
diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go b/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go
index fd478f82050..a6ad075c9ad 100644
--- a/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go
+++ b/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go
@@ -301,6 +301,8 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
 		*out = new(bool)
 		**out = **in
 	}
+	out.ShutdownGracePeriod = in.ShutdownGracePeriod
+	out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods
 	return
 }
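
As a sketch for reviewers, a minimal KubeletConfiguration enabling the feature might look like the following (the feature gate and field names come from this patch; the values are illustrative, and follow the same format as the v1beta1 testdata above):

    apiVersion: kubelet.config.k8s.io/v1beta1
    kind: KubeletConfiguration
    featureGates:
      GracefulNodeShutdown: true
    # Total time the node delays shutdown; this is the overall pod termination budget.
    shutdownGracePeriod: 30s
    # Of the 30s, the final 10s is reserved for critical pods; normal pods get the first 20s.
    shutdownGracePeriodCriticalPods: 10s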