diff --git a/pkg/kubelet/kubelet_volumes.go b/pkg/kubelet/kubelet_volumes.go
index 85d5dd41d5f..2eb91338e24 100644
--- a/pkg/kubelet/kubelet_volumes.go
+++ b/pkg/kubelet/kubelet_volumes.go
@@ -28,6 +28,7 @@ import (
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/klog/v2"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
+	"k8s.io/kubernetes/pkg/kubelet/metrics"
 	"k8s.io/kubernetes/pkg/util/removeall"
 	"k8s.io/kubernetes/pkg/volume"
 	volumetypes "k8s.io/kubernetes/pkg/volume/util/types"
@@ -181,16 +182,21 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
 	orphanRemovalErrors := []error{}
 	orphanVolumeErrors := []error{}
+	var totalPods, errorPods int
 	for _, uid := range found {
 		if allPods.Has(string(uid)) {
 			continue
 		}
+
+		totalPods++
+
 		// If volumes have not been unmounted/detached, do not delete directory.
 		// Doing so may result in corruption of data.
 		// TODO: getMountedVolumePathListFromDisk() call may be redundant with
 		// kl.getPodVolumePathListFromDisk(). Can this be cleaned up?
 		if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist {
+			errorPods++
 			klog.V(3).InfoS("Orphaned pod found, but volumes are not cleaned up", "podUID", uid)
 			continue
 		}
@@ -198,6 +204,7 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
 		// Attempt to remove the pod volumes directory and its subdirs
 		podVolumeErrors := kl.removeOrphanedPodVolumeDirs(uid)
 		if len(podVolumeErrors) > 0 {
+			errorPods++
 			orphanVolumeErrors = append(orphanVolumeErrors, podVolumeErrors...)
 			// Not all volumes were removed, so don't clean up the pod directory yet. It is likely
 			// that there are still mountpoints or files left which could cause removal of the pod
@@ -211,10 +218,13 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
 		podDir := kl.getPodDir(uid)
 		podSubdirs, err := os.ReadDir(podDir)
 		if err != nil {
+			errorPods++
 			klog.ErrorS(err, "Could not read directory", "path", podDir)
 			orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred during reading the pod dir from disk: %v", uid, err))
 			continue
 		}
+
+		var cleanupFailed bool
 		for _, podSubdir := range podSubdirs {
 			podSubdirName := podSubdir.Name()
 			podSubdirPath := filepath.Join(podDir, podSubdirName)
@@ -222,11 +232,13 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
 			// as this could lead to data loss in some situations. The volumes
 			// directory should have been removed by removeOrphanedPodVolumeDirs.
 			if podSubdirName == "volumes" {
+				cleanupFailed = true
 				err := fmt.Errorf("volumes subdir was found after it was removed")
 				klog.ErrorS(err, "Orphaned pod found, but failed to remove volumes subdir", "podUID", uid, "path", podSubdirPath)
 				continue
 			}
 			if err := removeall.RemoveAllOneFilesystem(kl.mounter, podSubdirPath); err != nil {
+				cleanupFailed = true
 				klog.ErrorS(err, "Failed to remove orphaned pod subdir", "podUID", uid, "path", podSubdirPath)
 				orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred when trying to remove subdir %q: %v", uid, podSubdirPath, err))
 			}
@@ -235,9 +247,13 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
 		// Rmdir the pod dir, which should be empty if everything above was successful
 		klog.V(3).InfoS("Orphaned pod found, removing", "podUID", uid)
 		if err := syscall.Rmdir(podDir); err != nil {
+			cleanupFailed = true
 			klog.ErrorS(err, "Failed to remove orphaned pod dir", "podUID", uid)
 			orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred when trying to remove the pod directory: %v", uid, err))
 		}
+		if cleanupFailed {
+			errorPods++
+		}
 	}
 	logSpew := func(errs []error) {
@@ -250,5 +266,7 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
 	}
 	logSpew(orphanVolumeErrors)
 	logSpew(orphanRemovalErrors)
+	metrics.OrphanPodCleanedVolumes.Set(float64(totalPods))
+	metrics.OrphanPodCleanedVolumesErrors.Set(float64(errorPods))
 	return utilerrors.NewAggregate(orphanRemovalErrors)
 }
diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go
index 5db31d5d213..a3eca5a37e5 100644
--- a/pkg/kubelet/metrics/metrics.go
+++ b/pkg/kubelet/metrics/metrics.go
@@ -102,6 +102,10 @@ const (
 	TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total"
 	TopologyManagerAdmissionDurationKey    = "topology_manager_admission_duration_ms"
+	// Metrics to track orphan pod cleanup
+	orphanPodCleanedVolumesKey       = "orphan_pod_cleaned_volumes"
+	orphanPodCleanedVolumesErrorsKey = "orphan_pod_cleaned_volumes_errors"
+
 	// Values used in metric labels
 	Container     = "container"
 	InitContainer = "init_container"
@@ -649,6 +653,25 @@ var (
 			StabilityLevel: metrics.ALPHA,
 		},
 	)
+
+	// OrphanPodCleanedVolumes is the number of orphaned Pods whose volumes were cleaned in the last periodic sweep.
+	OrphanPodCleanedVolumes = metrics.NewGauge(
+		&metrics.GaugeOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           orphanPodCleanedVolumesKey,
+			Help:           "The total number of orphaned Pods whose volumes were cleaned in the last periodic sweep.",
+			StabilityLevel: metrics.ALPHA,
+		},
+	)
+	// OrphanPodCleanedVolumesErrors is the number of orphaned Pods whose volumes failed to be cleaned in the last periodic sweep.
+	OrphanPodCleanedVolumesErrors = metrics.NewGauge(
+		&metrics.GaugeOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           orphanPodCleanedVolumesErrorsKey,
+			Help:           "The number of orphaned Pods whose volumes failed to be cleaned in the last periodic sweep.",
+			StabilityLevel: metrics.ALPHA,
+		},
+	)
 )
 var registerMetrics sync.Once
@@ -709,6 +732,8 @@ func Register(collectors ...metrics.StableCollector) {
 		legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
 		legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal)
 		legacyregistry.MustRegister(TopologyManagerAdmissionDuration)
+		legacyregistry.MustRegister(OrphanPodCleanedVolumes)
+		legacyregistry.MustRegister(OrphanPodCleanedVolumesErrors)
 		for _, collector := range collectors {
 			legacyregistry.CustomMustRegister(collector)
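
Reviewer note (not part of the diff): a minimal sketch of how the two new gauges could be exercised in a standalone unit test. It assumes the testutil.GetGaugeMetricValue helper from k8s.io/component-base/metrics/testutil; the test name, package, and the direct Set() calls are illustrative only — in a real kubelet test the values would be driven through cleanupOrphanedPodDirs itself.

package metrics_test

import (
	"testing"

	"k8s.io/component-base/metrics/testutil"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
)

// Hypothetical test: registers the kubelet metrics, simulates one sweep, and
// reads the gauge values back the way a scrape of /metrics would see them.
func TestOrphanPodCleanupGauges(t *testing.T) {
	metrics.Register()

	// Simulate a sweep that found 3 orphaned pods, 1 of which failed cleanup.
	metrics.OrphanPodCleanedVolumes.Set(3)
	metrics.OrphanPodCleanedVolumesErrors.Set(1)

	total, err := testutil.GetGaugeMetricValue(metrics.OrphanPodCleanedVolumes)
	if err != nil {
		t.Fatalf("reading kubelet_orphan_pod_cleaned_volumes: %v", err)
	}
	failed, err := testutil.GetGaugeMetricValue(metrics.OrphanPodCleanedVolumesErrors)
	if err != nil {
		t.Fatalf("reading kubelet_orphan_pod_cleaned_volumes_errors: %v", err)
	}
	if total != 3 || failed != 1 {
		t.Errorf("got total=%v errors=%v, want total=3 errors=1", total, failed)
	}
}

Because both gauges use Subsystem: KubeletSubsystem, they are exposed on the kubelet's /metrics endpoint as kubelet_orphan_pod_cleaned_volumes and kubelet_orphan_pod_cleaned_volumes_errors.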