mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-22 03:11:40 +00:00
Merge pull request #115972 from jsafrane/add-orphan-pod-metrics
Add metric for failed orphan pod cleanup
This commit is contained in:
commit
a408be817f
@ -28,6 +28,7 @@ import (
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/klog/v2"
|
||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||
"k8s.io/kubernetes/pkg/util/removeall"
|
||||
"k8s.io/kubernetes/pkg/volume"
|
||||
volumetypes "k8s.io/kubernetes/pkg/volume/util/types"
|
||||
@ -181,16 +182,21 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
|
||||
|
||||
orphanRemovalErrors := []error{}
|
||||
orphanVolumeErrors := []error{}
|
||||
var totalPods, errorPods int
|
||||
|
||||
for _, uid := range found {
|
||||
if allPods.Has(string(uid)) {
|
||||
continue
|
||||
}
|
||||
|
||||
totalPods++
|
||||
|
||||
// If volumes have not been unmounted/detached, do not delete directory.
|
||||
// Doing so may result in corruption of data.
|
||||
// TODO: getMountedVolumePathListFromDisk() call may be redundant with
|
||||
// kl.getPodVolumePathListFromDisk(). Can this be cleaned up?
|
||||
if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist {
|
||||
errorPods++
|
||||
klog.V(3).InfoS("Orphaned pod found, but volumes are not cleaned up", "podUID", uid)
|
||||
continue
|
||||
}
|
||||
@ -198,6 +204,7 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
|
||||
// Attempt to remove the pod volumes directory and its subdirs
|
||||
podVolumeErrors := kl.removeOrphanedPodVolumeDirs(uid)
|
||||
if len(podVolumeErrors) > 0 {
|
||||
errorPods++
|
||||
orphanVolumeErrors = append(orphanVolumeErrors, podVolumeErrors...)
|
||||
// Not all volumes were removed, so don't clean up the pod directory yet. It is likely
|
||||
// that there are still mountpoints or files left which could cause removal of the pod
|
||||
@ -211,10 +218,13 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
|
||||
podDir := kl.getPodDir(uid)
|
||||
podSubdirs, err := os.ReadDir(podDir)
|
||||
if err != nil {
|
||||
errorPods++
|
||||
klog.ErrorS(err, "Could not read directory", "path", podDir)
|
||||
orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred during reading the pod dir from disk: %v", uid, err))
|
||||
continue
|
||||
}
|
||||
|
||||
var cleanupFailed bool
|
||||
for _, podSubdir := range podSubdirs {
|
||||
podSubdirName := podSubdir.Name()
|
||||
podSubdirPath := filepath.Join(podDir, podSubdirName)
|
||||
@ -222,11 +232,13 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
|
||||
// as this could lead to data loss in some situations. The volumes
|
||||
// directory should have been removed by removeOrphanedPodVolumeDirs.
|
||||
if podSubdirName == "volumes" {
|
||||
cleanupFailed = true
|
||||
err := fmt.Errorf("volumes subdir was found after it was removed")
|
||||
klog.ErrorS(err, "Orphaned pod found, but failed to remove volumes subdir", "podUID", uid, "path", podSubdirPath)
|
||||
continue
|
||||
}
|
||||
if err := removeall.RemoveAllOneFilesystem(kl.mounter, podSubdirPath); err != nil {
|
||||
cleanupFailed = true
|
||||
klog.ErrorS(err, "Failed to remove orphaned pod subdir", "podUID", uid, "path", podSubdirPath)
|
||||
orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred when trying to remove subdir %q: %v", uid, podSubdirPath, err))
|
||||
}
|
||||
@ -235,9 +247,13 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
|
||||
// Rmdir the pod dir, which should be empty if everything above was successful
|
||||
klog.V(3).InfoS("Orphaned pod found, removing", "podUID", uid)
|
||||
if err := syscall.Rmdir(podDir); err != nil {
|
||||
cleanupFailed = true
|
||||
klog.ErrorS(err, "Failed to remove orphaned pod dir", "podUID", uid)
|
||||
orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred when trying to remove the pod directory: %v", uid, err))
|
||||
}
|
||||
if cleanupFailed {
|
||||
errorPods++
|
||||
}
|
||||
}
|
||||
|
||||
logSpew := func(errs []error) {
|
||||
@ -250,5 +266,7 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
|
||||
}
|
||||
logSpew(orphanVolumeErrors)
|
||||
logSpew(orphanRemovalErrors)
|
||||
metrics.OrphanPodCleanedVolumes.Set(float64(totalPods))
|
||||
metrics.OrphanPodCleanedVolumesErrors.Set(float64(errorPods))
|
||||
return utilerrors.NewAggregate(orphanRemovalErrors)
|
||||
}
|
||||
|
@ -102,6 +102,10 @@ const (
|
||||
TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total"
|
||||
TopologyManagerAdmissionDurationKey = "topology_manager_admission_duration_ms"
|
||||
|
||||
// Metrics to track orphan pod cleanup
|
||||
orphanPodCleanedVolumesKey = "orphan_pod_cleaned_volumes"
|
||||
orphanPodCleanedVolumesErrorsKey = "orphan_pod_cleaned_volumes_errors"
|
||||
|
||||
// Values used in metric labels
|
||||
Container = "container"
|
||||
InitContainer = "init_container"
|
||||
@ -649,6 +653,25 @@ var (
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
|
||||
// OrphanPodCleanedVolumes is number of orphaned Pods that times that removeOrphanedPodVolumeDirs was called during the last sweep.
|
||||
OrphanPodCleanedVolumes = metrics.NewGauge(
|
||||
&metrics.GaugeOpts{
|
||||
Subsystem: KubeletSubsystem,
|
||||
Name: orphanPodCleanedVolumesKey,
|
||||
Help: "The total number of orphaned Pods whose volumes were cleaned in the last periodic sweep.",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
// OrphanPodCleanedVolumes is number of times that removeOrphanedPodVolumeDirs failed.
|
||||
OrphanPodCleanedVolumesErrors = metrics.NewGauge(
|
||||
&metrics.GaugeOpts{
|
||||
Subsystem: KubeletSubsystem,
|
||||
Name: orphanPodCleanedVolumesErrorsKey,
|
||||
Help: "The number of orphaned Pods whose volumes failed to be cleaned in the last periodic sweep.",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
var registerMetrics sync.Once
|
||||
@ -709,6 +732,8 @@ func Register(collectors ...metrics.StableCollector) {
|
||||
legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
|
||||
legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal)
|
||||
legacyregistry.MustRegister(TopologyManagerAdmissionDuration)
|
||||
legacyregistry.MustRegister(OrphanPodCleanedVolumes)
|
||||
legacyregistry.MustRegister(OrphanPodCleanedVolumesErrors)
|
||||
|
||||
for _, collector := range collectors {
|
||||
legacyregistry.CustomMustRegister(collector)
|
||||
|
Loading…
Reference in New Issue
Block a user