mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-23 19:56:01 +00:00
Merge pull request #37228 from sjenning/teardown-terminated-volumes
Automatic merge from submit-queue (batch tested with PRs 37228, 40146, 40075, 38789, 40189) kubelet: storage: teardown terminated pod volumes This is a continuation of the work done in https://github.com/kubernetes/kubernetes/pull/36779 There really is no reason to keep volumes for terminated pods attached on the node. This PR extends the removal of volumes on the node from memory-backed volumes (the current policy) to all volumes. @pmorie raised a concern about the impact on debugging volume-related issues if terminated pod volumes are removed. To address this concern, the PR adds a `--keep-terminated-pod-volumes` flag to the kubelet and sets it for `hack/local-up-cluster.sh`. For consideration in 1.6. Fixes #35406 @derekwaynecarr @vishh @dashpole ```release-note kubelet tears down pod volumes on pod termination rather than pod deletion ```
This commit is contained in:
commit
dcf14add92
@ -251,6 +251,7 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
|
|||||||
fs.BoolVar(&s.ExperimentalKernelMemcgNotification, "experimental-kernel-memcg-notification", s.ExperimentalKernelMemcgNotification, "If enabled, the kubelet will integrate with the kernel memcg notification to determine if memory eviction thresholds are crossed rather than polling.")
|
fs.BoolVar(&s.ExperimentalKernelMemcgNotification, "experimental-kernel-memcg-notification", s.ExperimentalKernelMemcgNotification, "If enabled, the kubelet will integrate with the kernel memcg notification to determine if memory eviction thresholds are crossed rather than polling.")
|
||||||
fs.Int32Var(&s.PodsPerCore, "pods-per-core", s.PodsPerCore, "Number of Pods per core that can run on this Kubelet. The total number of Pods on this Kubelet cannot exceed max-pods, so max-pods will be used if this calculation results in a larger number of Pods allowed on the Kubelet. A value of 0 disables this limit.")
|
fs.Int32Var(&s.PodsPerCore, "pods-per-core", s.PodsPerCore, "Number of Pods per core that can run on this Kubelet. The total number of Pods on this Kubelet cannot exceed max-pods, so max-pods will be used if this calculation results in a larger number of Pods allowed on the Kubelet. A value of 0 disables this limit.")
|
||||||
fs.BoolVar(&s.ProtectKernelDefaults, "protect-kernel-defaults", s.ProtectKernelDefaults, "Default kubelet behaviour for kernel tuning. If set, kubelet errors if any of kernel tunables is different than kubelet defaults.")
|
fs.BoolVar(&s.ProtectKernelDefaults, "protect-kernel-defaults", s.ProtectKernelDefaults, "Default kubelet behaviour for kernel tuning. If set, kubelet errors if any of kernel tunables is different than kubelet defaults.")
|
||||||
|
fs.BoolVar(&s.KeepTerminatedPodVolumes, "keep-terminated-pod-volumes", s.KeepTerminatedPodVolumes, "Keep terminated pod volumes mounted to the node after the pod terminates. Can be useful for debugging volume related issues.")
|
||||||
|
|
||||||
// CRI flags.
|
// CRI flags.
|
||||||
fs.BoolVar(&s.EnableCRI, "experimental-cri", s.EnableCRI, "[Experimental] Enable the Container Runtime Interface (CRI) integration. If --container-runtime is set to \"remote\", Kubelet will communicate with the runtime/image CRI server listening on the endpoint specified by --remote-runtime-endpoint/--remote-image-endpoint. If --container-runtime is set to \"docker\", Kubelet will launch a in-process CRI server on behalf of docker, and communicate over a default endpoint.")
|
fs.BoolVar(&s.EnableCRI, "experimental-cri", s.EnableCRI, "[Experimental] Enable the Container Runtime Interface (CRI) integration. If --container-runtime is set to \"remote\", Kubelet will communicate with the runtime/image CRI server listening on the endpoint specified by --remote-runtime-endpoint/--remote-image-endpoint. If --container-runtime is set to \"docker\", Kubelet will launch a in-process CRI server on behalf of docker, and communicate over a default endpoint.")
|
||||||
|
@ -536,6 +536,7 @@ function start_kubelet {
|
|||||||
--experimental-cgroups-per-qos=${EXPERIMENTAL_CGROUPS_PER_QOS} \
|
--experimental-cgroups-per-qos=${EXPERIMENTAL_CGROUPS_PER_QOS} \
|
||||||
--cgroup-driver=${CGROUP_DRIVER} \
|
--cgroup-driver=${CGROUP_DRIVER} \
|
||||||
--cgroup-root=${CGROUP_ROOT} \
|
--cgroup-root=${CGROUP_ROOT} \
|
||||||
|
--keep-terminated-pod-volumes=true \
|
||||||
${auth_args} \
|
${auth_args} \
|
||||||
${dns_args} \
|
${dns_args} \
|
||||||
${net_plugin_dir_args} \
|
${net_plugin_dir_args} \
|
||||||
|
@ -308,6 +308,7 @@ junit-file-number
|
|||||||
k8s-bin-dir
|
k8s-bin-dir
|
||||||
k8s-build-output
|
k8s-build-output
|
||||||
keep-gogoproto
|
keep-gogoproto
|
||||||
|
keep-terminated-pod-volumes
|
||||||
km-path
|
km-path
|
||||||
kops-admin-access
|
kops-admin-access
|
||||||
kops-cluster
|
kops-cluster
|
||||||
|
@ -481,6 +481,9 @@ type KubeletConfiguration struct {
|
|||||||
// (binaries, etc.) to mount the volume are available on the underlying node. If the check is enabled
|
// (binaries, etc.) to mount the volume are available on the underlying node. If the check is enabled
|
||||||
// and fails the mount operation fails.
|
// and fails the mount operation fails.
|
||||||
ExperimentalCheckNodeCapabilitiesBeforeMount bool
|
ExperimentalCheckNodeCapabilitiesBeforeMount bool
|
||||||
|
// This flag, if set, instructs the kubelet to keep volumes from terminated pods mounted to the node.
|
||||||
|
// This can be useful for debugging volume related issues.
|
||||||
|
KeepTerminatedPodVolumes bool
|
||||||
}
|
}
|
||||||
|
|
||||||
type KubeletAuthorizationMode string
|
type KubeletAuthorizationMode string
|
||||||
|
@ -521,6 +521,9 @@ type KubeletConfiguration struct {
|
|||||||
// (binaries, etc.) to mount the volume are available on the underlying node. If the check is enabled
|
// (binaries, etc.) to mount the volume are available on the underlying node. If the check is enabled
|
||||||
// and fails the mount operation fails.
|
// and fails the mount operation fails.
|
||||||
ExperimentalCheckNodeCapabilitiesBeforeMount bool `json:"experimentalCheckNodeCapabilitiesBeforeMount,omitempty"`
|
ExperimentalCheckNodeCapabilitiesBeforeMount bool `json:"experimentalCheckNodeCapabilitiesBeforeMount,omitempty"`
|
||||||
|
// This flag, if set, instructs the kubelet to keep volumes from terminated pods mounted to the node.
|
||||||
|
// This can be useful for debugging volume related issues.
|
||||||
|
KeepTerminatedPodVolumes bool `json:"keepTerminatedPodVolumes,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type KubeletAuthorizationMode string
|
type KubeletAuthorizationMode string
|
||||||
|
@ -481,6 +481,7 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_componentconfig_KubeletConfigu
|
|||||||
out.EnableCRI = in.EnableCRI
|
out.EnableCRI = in.EnableCRI
|
||||||
out.ExperimentalFailSwapOn = in.ExperimentalFailSwapOn
|
out.ExperimentalFailSwapOn = in.ExperimentalFailSwapOn
|
||||||
out.ExperimentalCheckNodeCapabilitiesBeforeMount = in.ExperimentalCheckNodeCapabilitiesBeforeMount
|
out.ExperimentalCheckNodeCapabilitiesBeforeMount = in.ExperimentalCheckNodeCapabilitiesBeforeMount
|
||||||
|
out.KeepTerminatedPodVolumes = in.KeepTerminatedPodVolumes
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -652,6 +653,7 @@ func autoConvert_componentconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigu
|
|||||||
out.EnableCRI = in.EnableCRI
|
out.EnableCRI = in.EnableCRI
|
||||||
out.ExperimentalFailSwapOn = in.ExperimentalFailSwapOn
|
out.ExperimentalFailSwapOn = in.ExperimentalFailSwapOn
|
||||||
out.ExperimentalCheckNodeCapabilitiesBeforeMount = in.ExperimentalCheckNodeCapabilitiesBeforeMount
|
out.ExperimentalCheckNodeCapabilitiesBeforeMount = in.ExperimentalCheckNodeCapabilitiesBeforeMount
|
||||||
|
out.KeepTerminatedPodVolumes = in.KeepTerminatedPodVolumes
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -10431,6 +10431,13 @@ var OpenAPIDefinitions *openapi.OpenAPIDefinitions = &openapi.OpenAPIDefinitions
|
|||||||
Format: "",
|
Format: "",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
"keepTerminatedPodVolumes": {
|
||||||
|
SchemaProps: spec.SchemaProps{
|
||||||
|
Description: "This flag, if set, instructs the kubelet to keep volumes from terminated pods mounted to the node. This can be useful for debugging volume related issues.",
|
||||||
|
Type: []string{"boolean"},
|
||||||
|
Format: "",
|
||||||
|
},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "nvidiaGPUs", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "systemReserved", "kubeReserved", 
"protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit"},
|
Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "nvidiaGPUs", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "systemReserved", "kubeReserved", 
"protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit"},
|
||||||
},
|
},
|
||||||
|
@ -747,7 +747,8 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
|
|||||||
kubeDeps.Mounter,
|
kubeDeps.Mounter,
|
||||||
klet.getPodsDir(),
|
klet.getPodsDir(),
|
||||||
kubeDeps.Recorder,
|
kubeDeps.Recorder,
|
||||||
kubeCfg.ExperimentalCheckNodeCapabilitiesBeforeMount)
|
kubeCfg.ExperimentalCheckNodeCapabilitiesBeforeMount,
|
||||||
|
kubeCfg.KeepTerminatedPodVolumes)
|
||||||
|
|
||||||
runtimeCache, err := kubecontainer.NewRuntimeCache(klet.containerRuntime)
|
runtimeCache, err := kubecontainer.NewRuntimeCache(klet.containerRuntime)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -257,7 +257,8 @@ func newTestKubeletWithImageList(
|
|||||||
kubelet.mounter,
|
kubelet.mounter,
|
||||||
kubelet.getPodsDir(),
|
kubelet.getPodsDir(),
|
||||||
kubelet.recorder,
|
kubelet.recorder,
|
||||||
false /* experimentalCheckNodeCapabilitiesBeforeMount*/)
|
false, /* experimentalCheckNodeCapabilitiesBeforeMount*/
|
||||||
|
false /* keepTerminatedPodVolumes */)
|
||||||
require.NoError(t, err, "Failed to initialize volume manager")
|
require.NoError(t, err, "Failed to initialize volume manager")
|
||||||
|
|
||||||
// enable active deadline handler
|
// enable active deadline handler
|
||||||
|
@ -106,7 +106,8 @@ func TestRunOnce(t *testing.T) {
|
|||||||
kb.mounter,
|
kb.mounter,
|
||||||
kb.getPodsDir(),
|
kb.getPodsDir(),
|
||||||
kb.recorder,
|
kb.recorder,
|
||||||
false /* experimentalCheckNodeCapabilitiesBeforeMount */)
|
false, /* experimentalCheckNodeCapabilitiesBeforeMount */
|
||||||
|
false /* keepTerminatedPodVolumes */)
|
||||||
|
|
||||||
kb.networkPlugin, _ = network.InitNetworkPlugin([]network.NetworkPlugin{}, "", nettest.NewFakeHost(nil), componentconfig.HairpinNone, kb.nonMasqueradeCIDR, network.UseDefaultMTU)
|
kb.networkPlugin, _ = network.InitNetworkPlugin([]network.NetworkPlugin{}, "", nettest.NewFakeHost(nil), componentconfig.HairpinNone, kb.nonMasqueradeCIDR, network.UseDefaultMTU)
|
||||||
// TODO: Factor out "StatsProvider" from Kubelet so we don't have a cyclic dependency
|
// TODO: Factor out "StatsProvider" from Kubelet so we don't have a cyclic dependency
|
||||||
|
@ -71,7 +71,8 @@ func NewDesiredStateOfWorldPopulator(
|
|||||||
getPodStatusRetryDuration time.Duration,
|
getPodStatusRetryDuration time.Duration,
|
||||||
podManager pod.Manager,
|
podManager pod.Manager,
|
||||||
desiredStateOfWorld cache.DesiredStateOfWorld,
|
desiredStateOfWorld cache.DesiredStateOfWorld,
|
||||||
kubeContainerRuntime kubecontainer.Runtime) DesiredStateOfWorldPopulator {
|
kubeContainerRuntime kubecontainer.Runtime,
|
||||||
|
keepTerminatedPodVolumes bool) DesiredStateOfWorldPopulator {
|
||||||
return &desiredStateOfWorldPopulator{
|
return &desiredStateOfWorldPopulator{
|
||||||
kubeClient: kubeClient,
|
kubeClient: kubeClient,
|
||||||
loopSleepDuration: loopSleepDuration,
|
loopSleepDuration: loopSleepDuration,
|
||||||
@ -80,7 +81,8 @@ func NewDesiredStateOfWorldPopulator(
|
|||||||
desiredStateOfWorld: desiredStateOfWorld,
|
desiredStateOfWorld: desiredStateOfWorld,
|
||||||
pods: processedPods{
|
pods: processedPods{
|
||||||
processedPods: make(map[volumetypes.UniquePodName]bool)},
|
processedPods: make(map[volumetypes.UniquePodName]bool)},
|
||||||
kubeContainerRuntime: kubeContainerRuntime,
|
kubeContainerRuntime: kubeContainerRuntime,
|
||||||
|
keepTerminatedPodVolumes: keepTerminatedPodVolumes,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -93,6 +95,7 @@ type desiredStateOfWorldPopulator struct {
|
|||||||
pods processedPods
|
pods processedPods
|
||||||
kubeContainerRuntime kubecontainer.Runtime
|
kubeContainerRuntime kubecontainer.Runtime
|
||||||
timeOfLastGetPodStatus time.Time
|
timeOfLastGetPodStatus time.Time
|
||||||
|
keepTerminatedPodVolumes bool
|
||||||
}
|
}
|
||||||
|
|
||||||
type processedPods struct {
|
type processedPods struct {
|
||||||
@ -160,13 +163,7 @@ func (dswp *desiredStateOfWorldPopulator) findAndRemoveDeletedPods() {
|
|||||||
if !isPodTerminated(pod) {
|
if !isPodTerminated(pod) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// Skip non-memory backed volumes belonging to terminated pods
|
if dswp.keepTerminatedPodVolumes {
|
||||||
volume := volumeToMount.VolumeSpec.Volume
|
|
||||||
if volume == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if (volume.EmptyDir == nil || volume.EmptyDir.Medium != v1.StorageMediumMemory) &&
|
|
||||||
volume.ConfigMap == nil && volume.Secret == nil {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -157,7 +157,8 @@ func NewVolumeManager(
|
|||||||
mounter mount.Interface,
|
mounter mount.Interface,
|
||||||
kubeletPodsDir string,
|
kubeletPodsDir string,
|
||||||
recorder record.EventRecorder,
|
recorder record.EventRecorder,
|
||||||
checkNodeCapabilitiesBeforeMount bool) (VolumeManager, error) {
|
checkNodeCapabilitiesBeforeMount bool,
|
||||||
|
keepTerminatedPodVolumes bool) (VolumeManager, error) {
|
||||||
|
|
||||||
vm := &volumeManager{
|
vm := &volumeManager{
|
||||||
kubeClient: kubeClient,
|
kubeClient: kubeClient,
|
||||||
@ -191,7 +192,8 @@ func NewVolumeManager(
|
|||||||
desiredStateOfWorldPopulatorGetPodStatusRetryDuration,
|
desiredStateOfWorldPopulatorGetPodStatusRetryDuration,
|
||||||
podManager,
|
podManager,
|
||||||
vm.desiredStateOfWorld,
|
vm.desiredStateOfWorld,
|
||||||
kubeContainerRuntime)
|
kubeContainerRuntime,
|
||||||
|
keepTerminatedPodVolumes)
|
||||||
|
|
||||||
return vm, nil
|
return vm, nil
|
||||||
}
|
}
|
||||||
|
@ -198,7 +198,8 @@ func newTestVolumeManager(
|
|||||||
&mount.FakeMounter{},
|
&mount.FakeMounter{},
|
||||||
"",
|
"",
|
||||||
fakeRecorder,
|
fakeRecorder,
|
||||||
false /* experimentalCheckNodeCapabilitiesBeforeMount */)
|
false, /* experimentalCheckNodeCapabilitiesBeforeMount */
|
||||||
|
false /* keepTerminatedPodVolumes */)
|
||||||
|
|
||||||
return vm, err
|
return vm, err
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user