Implement shutdown manager in kubelet

Implements KEP 2000, Graceful Node Shutdown:
https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2000-graceful-node-shutdown

* Add new FeatureGate `GracefulNodeShutdown` to control
enabling/disabling the feature
* Add two new KubeletConfiguration options
  * `ShutdownGracePeriod` and `ShutdownGracePeriodCriticalPods`
* Add new package, `nodeshutdown` that implements the Node shutdown
manager
  * The node shutdown manager uses the systemd inhibit package, to
  create an system inhibitor, monitor for node shutdown events, and
  gracefully terminate pods upon a node shutdown.
This commit is contained in:
David Porter
2020-11-02 23:18:36 +00:00
parent 2343689ce7
commit 16f71c6d47
20 changed files with 896 additions and 124 deletions

View File

@@ -234,5 +234,7 @@ var (
"TypeMeta.Kind",
"VolumeStatsAggPeriod.Duration",
"VolumePluginDir",
"ShutdownGracePeriod.Duration",
"ShutdownGracePeriodCriticalPods.Duration",
)
)

View File

@@ -67,6 +67,8 @@ registryPullQPS: 5
resolvConf: /etc/resolv.conf
runtimeRequestTimeout: 2m0s
serializeImagePulls: true
shutdownGracePeriod: 0s
shutdownGracePeriodCriticalPods: 0s
streamingConnectionIdleTimeout: 4h0m0s
syncFrequency: 1m0s
topologyManagerPolicy: none

View File

@@ -67,6 +67,8 @@ registryPullQPS: 5
resolvConf: /etc/resolv.conf
runtimeRequestTimeout: 2m0s
serializeImagePulls: true
shutdownGracePeriod: 0s
shutdownGracePeriodCriticalPods: 0s
streamingConnectionIdleTimeout: 4h0m0s
syncFrequency: 1m0s
topologyManagerPolicy: none

View File

@@ -375,6 +375,13 @@ type KubeletConfiguration struct {
Logging componentbaseconfig.LoggingConfiguration
// EnableSystemLogHandler enables /logs handler.
EnableSystemLogHandler bool
// ShutdownGracePeriod specifies the total duration that the node should delay the shutdown and total grace period for pod termination during a node shutdown.
// Defaults to 30 seconds, requires GracefulNodeShutdown feature gate to be enabled.
ShutdownGracePeriod metav1.Duration
// ShutdownGracePeriodCriticalPods specifies the duration used to terminate critical pods during a node shutdown. This should be less than ShutdownGracePeriod.
// Defaults to 10 seconds, requires GracefulNodeShutdown feature gate to be enabled.
// For example, if ShutdownGracePeriod=30s, and ShutdownGracePeriodCriticalPods=10s, during a node shutdown the first 20 seconds would be reserved for gracefully terminating normal pods, and the last 10 seconds would be reserved for terminating critical pods.
ShutdownGracePeriodCriticalPods metav1.Duration
}
// KubeletAuthorizationMode denotes the authorization mode for the kubelet

View File

@@ -350,6 +350,8 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in
if err := v1.Convert_Pointer_bool_To_bool(&in.EnableSystemLogHandler, &out.EnableSystemLogHandler, s); err != nil {
return err
}
out.ShutdownGracePeriod = in.ShutdownGracePeriod
out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods
return nil
}
@@ -501,6 +503,8 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in
if err := v1.Convert_bool_To_Pointer_bool(&in.EnableSystemLogHandler, &out.EnableSystemLogHandler, s); err != nil {
return err
}
out.ShutdownGracePeriod = in.ShutdownGracePeriod
out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods
return nil
}

View File

@@ -140,6 +140,21 @@ func ValidateKubeletConfiguration(kc *kubeletconfig.KubeletConfiguration) error
allErrors = append(allErrors, fmt.Errorf("invalid configuration: topologyManagerScope non-allowable value: %v", kc.TopologyManagerScope))
}
if localFeatureGate.Enabled(features.GracefulNodeShutdown) {
if kc.ShutdownGracePeriod.Duration < 0 || kc.ShutdownGracePeriodCriticalPods.Duration < 0 || kc.ShutdownGracePeriodCriticalPods.Duration > kc.ShutdownGracePeriod.Duration {
allErrors = append(allErrors, fmt.Errorf("invalid configuration: ShutdownGracePeriod %v must be >= 0, ShutdownGracePeriodCriticalPods %v must be >= 0, and ShutdownGracePeriodCriticalPods %v must be <= ShutdownGracePeriod %v", kc.ShutdownGracePeriod, kc.ShutdownGracePeriodCriticalPods, kc.ShutdownGracePeriodCriticalPods, kc.ShutdownGracePeriod))
}
if kc.ShutdownGracePeriod.Duration > 0 && kc.ShutdownGracePeriod.Duration < time.Duration(time.Second) {
allErrors = append(allErrors, fmt.Errorf("invalid configuration: ShutdownGracePeriod %v must be either zero or otherwise >= 1 sec", kc.ShutdownGracePeriod))
}
if kc.ShutdownGracePeriodCriticalPods.Duration > 0 && kc.ShutdownGracePeriodCriticalPods.Duration < time.Duration(time.Second) {
allErrors = append(allErrors, fmt.Errorf("invalid configuration: ShutdownGracePeriodCriticalPods %v must be either zero or otherwise >= 1 sec", kc.ShutdownGracePeriodCriticalPods))
}
}
if (kc.ShutdownGracePeriod.Duration > 0 || kc.ShutdownGracePeriodCriticalPods.Duration > 0) && !localFeatureGate.Enabled(features.GracefulNodeShutdown) {
allErrors = append(allErrors, fmt.Errorf("invalid configuration: Specifying ShutdownGracePeriod or ShutdownGracePeriodCriticalPods requires feature gate GracefulNodeShutdown"))
}
for _, val := range kc.EnforceNodeAllocatable {
switch val {
case kubetypes.NodeAllocatableEnforcementKey:

View File

@@ -27,36 +27,39 @@ import (
func TestValidateKubeletConfiguration(t *testing.T) {
successCase1 := &kubeletconfig.KubeletConfiguration{
CgroupsPerQOS: true,
EnforceNodeAllocatable: []string{"pods", "system-reserved", "kube-reserved"},
SystemReservedCgroup: "/system.slice",
KubeReservedCgroup: "/kubelet.service",
SystemCgroups: "",
CgroupRoot: "",
EventBurst: 10,
EventRecordQPS: 5,
HealthzPort: 10248,
ImageGCHighThresholdPercent: 85,
ImageGCLowThresholdPercent: 80,
IPTablesDropBit: 15,
IPTablesMasqueradeBit: 14,
KubeAPIBurst: 10,
KubeAPIQPS: 5,
MaxOpenFiles: 1000000,
MaxPods: 110,
OOMScoreAdj: -999,
PodsPerCore: 100,
Port: 65535,
ReadOnlyPort: 0,
RegistryBurst: 10,
RegistryPullQPS: 5,
HairpinMode: kubeletconfig.PromiscuousBridge,
NodeLeaseDurationSeconds: 1,
CPUCFSQuotaPeriod: metav1.Duration{Duration: 25 * time.Millisecond},
TopologyManagerScope: kubeletconfig.PodTopologyManagerScope,
TopologyManagerPolicy: kubeletconfig.SingleNumaNodeTopologyManagerPolicy,
CgroupsPerQOS: true,
EnforceNodeAllocatable: []string{"pods", "system-reserved", "kube-reserved"},
SystemReservedCgroup: "/system.slice",
KubeReservedCgroup: "/kubelet.service",
SystemCgroups: "",
CgroupRoot: "",
EventBurst: 10,
EventRecordQPS: 5,
HealthzPort: 10248,
ImageGCHighThresholdPercent: 85,
ImageGCLowThresholdPercent: 80,
IPTablesDropBit: 15,
IPTablesMasqueradeBit: 14,
KubeAPIBurst: 10,
KubeAPIQPS: 5,
MaxOpenFiles: 1000000,
MaxPods: 110,
OOMScoreAdj: -999,
PodsPerCore: 100,
Port: 65535,
ReadOnlyPort: 0,
RegistryBurst: 10,
RegistryPullQPS: 5,
HairpinMode: kubeletconfig.PromiscuousBridge,
NodeLeaseDurationSeconds: 1,
CPUCFSQuotaPeriod: metav1.Duration{Duration: 25 * time.Millisecond},
TopologyManagerScope: kubeletconfig.PodTopologyManagerScope,
TopologyManagerPolicy: kubeletconfig.SingleNumaNodeTopologyManagerPolicy,
ShutdownGracePeriod: metav1.Duration{Duration: 30 * time.Second},
ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
FeatureGates: map[string]bool{
"CustomCPUCFSQuotaPeriod": true,
"GracefulNodeShutdown": true,
},
}
if allErrors := ValidateKubeletConfiguration(successCase1); allErrors != nil {
@@ -64,37 +67,40 @@ func TestValidateKubeletConfiguration(t *testing.T) {
}
successCase2 := &kubeletconfig.KubeletConfiguration{
CgroupsPerQOS: true,
EnforceNodeAllocatable: []string{"pods"},
SystemReservedCgroup: "",
KubeReservedCgroup: "",
SystemCgroups: "",
CgroupRoot: "",
EventBurst: 10,
EventRecordQPS: 5,
HealthzPort: 10248,
ImageGCHighThresholdPercent: 85,
ImageGCLowThresholdPercent: 80,
IPTablesDropBit: 15,
IPTablesMasqueradeBit: 14,
KubeAPIBurst: 10,
KubeAPIQPS: 5,
MaxOpenFiles: 1000000,
MaxPods: 110,
OOMScoreAdj: -999,
PodsPerCore: 100,
Port: 65535,
ReadOnlyPort: 0,
RegistryBurst: 10,
RegistryPullQPS: 5,
HairpinMode: kubeletconfig.PromiscuousBridge,
NodeLeaseDurationSeconds: 1,
CPUCFSQuotaPeriod: metav1.Duration{Duration: 50 * time.Millisecond},
ReservedSystemCPUs: "0-3",
TopologyManagerScope: kubeletconfig.ContainerTopologyManagerScope,
TopologyManagerPolicy: kubeletconfig.NoneTopologyManagerPolicy,
CgroupsPerQOS: true,
EnforceNodeAllocatable: []string{"pods"},
SystemReservedCgroup: "",
KubeReservedCgroup: "",
SystemCgroups: "",
CgroupRoot: "",
EventBurst: 10,
EventRecordQPS: 5,
HealthzPort: 10248,
ImageGCHighThresholdPercent: 85,
ImageGCLowThresholdPercent: 80,
IPTablesDropBit: 15,
IPTablesMasqueradeBit: 14,
KubeAPIBurst: 10,
KubeAPIQPS: 5,
MaxOpenFiles: 1000000,
MaxPods: 110,
OOMScoreAdj: -999,
PodsPerCore: 100,
Port: 65535,
ReadOnlyPort: 0,
RegistryBurst: 10,
RegistryPullQPS: 5,
HairpinMode: kubeletconfig.PromiscuousBridge,
NodeLeaseDurationSeconds: 1,
CPUCFSQuotaPeriod: metav1.Duration{Duration: 50 * time.Millisecond},
ReservedSystemCPUs: "0-3",
TopologyManagerScope: kubeletconfig.ContainerTopologyManagerScope,
TopologyManagerPolicy: kubeletconfig.NoneTopologyManagerPolicy,
ShutdownGracePeriod: metav1.Duration{Duration: 10 * time.Minute},
ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 0},
FeatureGates: map[string]bool{
"CustomCPUCFSQuotaPeriod": true,
"GracefulNodeShutdown": true,
},
}
if allErrors := ValidateKubeletConfiguration(successCase2); allErrors != nil {
@@ -102,68 +108,73 @@ func TestValidateKubeletConfiguration(t *testing.T) {
}
errorCase1 := &kubeletconfig.KubeletConfiguration{
CgroupsPerQOS: false,
EnforceNodeAllocatable: []string{"pods", "system-reserved", "kube-reserved", "illegal-key"},
SystemCgroups: "/",
CgroupRoot: "",
EventBurst: -10,
EventRecordQPS: -10,
HealthzPort: -10,
ImageGCHighThresholdPercent: 101,
ImageGCLowThresholdPercent: 101,
IPTablesDropBit: -10,
IPTablesMasqueradeBit: -10,
KubeAPIBurst: -10,
KubeAPIQPS: -10,
MaxOpenFiles: -10,
MaxPods: -10,
OOMScoreAdj: -1001,
PodsPerCore: -10,
Port: 0,
ReadOnlyPort: -10,
RegistryBurst: -10,
RegistryPullQPS: -10,
HairpinMode: "foo",
NodeLeaseDurationSeconds: -1,
CPUCFSQuotaPeriod: metav1.Duration{Duration: 100 * time.Millisecond},
CgroupsPerQOS: false,
EnforceNodeAllocatable: []string{"pods", "system-reserved", "kube-reserved", "illegal-key"},
SystemCgroups: "/",
CgroupRoot: "",
EventBurst: -10,
EventRecordQPS: -10,
HealthzPort: -10,
ImageGCHighThresholdPercent: 101,
ImageGCLowThresholdPercent: 101,
IPTablesDropBit: -10,
IPTablesMasqueradeBit: -10,
KubeAPIBurst: -10,
KubeAPIQPS: -10,
MaxOpenFiles: -10,
MaxPods: -10,
OOMScoreAdj: -1001,
PodsPerCore: -10,
Port: 0,
ReadOnlyPort: -10,
RegistryBurst: -10,
RegistryPullQPS: -10,
HairpinMode: "foo",
NodeLeaseDurationSeconds: -1,
CPUCFSQuotaPeriod: metav1.Duration{Duration: 100 * time.Millisecond},
ShutdownGracePeriod: metav1.Duration{Duration: 30 * time.Second},
ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
}
const numErrsErrorCase1 = 27
const numErrsErrorCase1 = 28
if allErrors := ValidateKubeletConfiguration(errorCase1); len(allErrors.(utilerrors.Aggregate).Errors()) != numErrsErrorCase1 {
t.Errorf("expect %d errors, got %v", numErrsErrorCase1, len(allErrors.(utilerrors.Aggregate).Errors()))
}
errorCase2 := &kubeletconfig.KubeletConfiguration{
CgroupsPerQOS: true,
EnforceNodeAllocatable: []string{"pods", "system-reserved", "kube-reserved"},
SystemReservedCgroup: "/system.slice",
KubeReservedCgroup: "/kubelet.service",
SystemCgroups: "",
CgroupRoot: "",
EventBurst: 10,
EventRecordQPS: 5,
HealthzPort: 10248,
ImageGCHighThresholdPercent: 85,
ImageGCLowThresholdPercent: 80,
IPTablesDropBit: 15,
IPTablesMasqueradeBit: 14,
KubeAPIBurst: 10,
KubeAPIQPS: 5,
MaxOpenFiles: 1000000,
MaxPods: 110,
OOMScoreAdj: -999,
PodsPerCore: 100,
Port: 65535,
ReadOnlyPort: 0,
RegistryBurst: 10,
RegistryPullQPS: 5,
HairpinMode: kubeletconfig.PromiscuousBridge,
NodeLeaseDurationSeconds: 1,
CPUCFSQuotaPeriod: metav1.Duration{Duration: 50 * time.Millisecond},
ReservedSystemCPUs: "0-3",
TopologyManagerScope: "invalid",
TopologyManagerPolicy: "invalid",
CgroupsPerQOS: true,
EnforceNodeAllocatable: []string{"pods", "system-reserved", "kube-reserved"},
SystemReservedCgroup: "/system.slice",
KubeReservedCgroup: "/kubelet.service",
SystemCgroups: "",
CgroupRoot: "",
EventBurst: 10,
EventRecordQPS: 5,
HealthzPort: 10248,
ImageGCHighThresholdPercent: 85,
ImageGCLowThresholdPercent: 80,
IPTablesDropBit: 15,
IPTablesMasqueradeBit: 14,
KubeAPIBurst: 10,
KubeAPIQPS: 5,
MaxOpenFiles: 1000000,
MaxPods: 110,
OOMScoreAdj: -999,
PodsPerCore: 100,
Port: 65535,
ReadOnlyPort: 0,
RegistryBurst: 10,
RegistryPullQPS: 5,
HairpinMode: kubeletconfig.PromiscuousBridge,
NodeLeaseDurationSeconds: 1,
CPUCFSQuotaPeriod: metav1.Duration{Duration: 50 * time.Millisecond},
ReservedSystemCPUs: "0-3",
TopologyManagerScope: "invalid",
TopologyManagerPolicy: "invalid",
ShutdownGracePeriod: metav1.Duration{Duration: 40 * time.Second},
ShutdownGracePeriodCriticalPods: metav1.Duration{Duration: 10 * time.Second},
FeatureGates: map[string]bool{
"CustomCPUCFSQuotaPeriod": true,
"GracefulNodeShutdown": true,
},
}
const numErrsErrorCase2 = 3

View File

@@ -271,6 +271,8 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
copy(*out, *in)
}
out.Logging = in.Logging
out.ShutdownGracePeriod = in.ShutdownGracePeriod
out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods
return
}