Merge pull request #115972 from jsafrane/add-orphan-pod-metrics

Add metric for failed orphan pod cleanup
commit a408be817f, authored by Kubernetes Prow Robot on 2023-03-09 22:43:26 -08:00, committed by GitHub
2 changed files with 43 additions and 0 deletions
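Both gauges land in the kubelet metrics subsystem, so they are exported as kubelet_orphan_pod_cleaned_volumes and kubelet_orphan_pod_cleaned_volumes_errors. A minimal sketch of a spot-check against a node follows; the endpoint address and lack of auth are assumptions for illustration (real clusters typically require the authenticated kubelet port or the API server's node proxy):

// Sketch only: scrape a kubelet /metrics endpoint and print the two new gauges.
// The address below is an assumption (the legacy read-only port); substitute the
// authenticated endpoint or the API server node proxy in a real cluster.
package main

import (
    "bufio"
    "fmt"
    "log"
    "net/http"
    "strings"
)

func main() {
    resp, err := http.Get("http://127.0.0.1:10255/metrics")
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()

    scanner := bufio.NewScanner(resp.Body)
    for scanner.Scan() {
        // Matches both kubelet_orphan_pod_cleaned_volumes and ..._errors.
        if strings.HasPrefix(scanner.Text(), "kubelet_orphan_pod_cleaned_volumes") {
            fmt.Println(scanner.Text())
        }
    }
    if err := scanner.Err(); err != nil {
        log.Fatal(err)
    }
}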

pkg/kubelet/kubelet_volumes.go

@@ -28,6 +28,7 @@ import (
    "k8s.io/apimachinery/pkg/util/sets"
    "k8s.io/klog/v2"
    kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
    "k8s.io/kubernetes/pkg/kubelet/metrics"
    "k8s.io/kubernetes/pkg/util/removeall"
    "k8s.io/kubernetes/pkg/volume"
    volumetypes "k8s.io/kubernetes/pkg/volume/util/types"
@@ -181,16 +182,21 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecontainer.Pod) error {
    orphanRemovalErrors := []error{}
    orphanVolumeErrors := []error{}
    var totalPods, errorPods int
    for _, uid := range found {
        if allPods.Has(string(uid)) {
            continue
        }
        totalPods++
        // If volumes have not been unmounted/detached, do not delete directory.
        // Doing so may result in corruption of data.
        // TODO: getMountedVolumePathListFromDisk() call may be redundant with
        // kl.getPodVolumePathListFromDisk(). Can this be cleaned up?
        if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist {
            errorPods++
            klog.V(3).InfoS("Orphaned pod found, but volumes are not cleaned up", "podUID", uid)
            continue
        }
@@ -198,6 +204,7 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecontainer.Pod) error {
        // Attempt to remove the pod volumes directory and its subdirs
        podVolumeErrors := kl.removeOrphanedPodVolumeDirs(uid)
        if len(podVolumeErrors) > 0 {
            errorPods++
            orphanVolumeErrors = append(orphanVolumeErrors, podVolumeErrors...)
            // Not all volumes were removed, so don't clean up the pod directory yet. It is likely
            // that there are still mountpoints or files left which could cause removal of the pod
@@ -211,10 +218,13 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecontainer.Pod) error {
        podDir := kl.getPodDir(uid)
        podSubdirs, err := os.ReadDir(podDir)
        if err != nil {
            errorPods++
            klog.ErrorS(err, "Could not read directory", "path", podDir)
            orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred during reading the pod dir from disk: %v", uid, err))
            continue
        }
        var cleanupFailed bool
        for _, podSubdir := range podSubdirs {
            podSubdirName := podSubdir.Name()
            podSubdirPath := filepath.Join(podDir, podSubdirName)
@@ -222,11 +232,13 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecontainer.Pod) error {
            // as this could lead to data loss in some situations. The volumes
            // directory should have been removed by removeOrphanedPodVolumeDirs.
            if podSubdirName == "volumes" {
                cleanupFailed = true
                err := fmt.Errorf("volumes subdir was found after it was removed")
                klog.ErrorS(err, "Orphaned pod found, but failed to remove volumes subdir", "podUID", uid, "path", podSubdirPath)
                continue
            }
            if err := removeall.RemoveAllOneFilesystem(kl.mounter, podSubdirPath); err != nil {
                cleanupFailed = true
                klog.ErrorS(err, "Failed to remove orphaned pod subdir", "podUID", uid, "path", podSubdirPath)
                orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred when trying to remove subdir %q: %v", uid, podSubdirPath, err))
            }
@@ -235,9 +247,13 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecontainer.Pod) error {
        // Rmdir the pod dir, which should be empty if everything above was successful
        klog.V(3).InfoS("Orphaned pod found, removing", "podUID", uid)
        if err := syscall.Rmdir(podDir); err != nil {
            cleanupFailed = true
            klog.ErrorS(err, "Failed to remove orphaned pod dir", "podUID", uid)
            orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred when trying to remove the pod directory: %v", uid, err))
        }
        if cleanupFailed {
            errorPods++
        }
    }
    logSpew := func(errs []error) {
@@ -250,5 +266,7 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecontainer.Pod) error {
    }
    logSpew(orphanVolumeErrors)
    logSpew(orphanRemovalErrors)
    metrics.OrphanPodCleanedVolumes.Set(float64(totalPods))
    metrics.OrphanPodCleanedVolumesErrors.Set(float64(errorPods))
    return utilerrors.NewAggregate(orphanRemovalErrors)
}
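Note the accounting above: totalPods counts every orphaned pod visited in a sweep, and errorPods counts those where any cleanup step failed, at most once per pod thanks to the cleanupFailed flag and the early continues. Because both values are written with Set rather than incremented, each sweep overwrites the previous snapshot. A toy sketch of that pattern (plain Go, not kubelet code; the gauge type here is a stand-in for the real metrics type):

// Toy illustration of the per-sweep snapshot semantics.
package main

import "fmt"

type gauge float64

func (g *gauge) Set(v float64) { *g = gauge(v) }

func sweep(cleanupOK []bool, total, errored *gauge) {
    var totalPods, errorPods int
    for _, ok := range cleanupOK {
        totalPods++
        if !ok {
            errorPods++ // counted once per pod, like cleanupFailed above
        }
    }
    total.Set(float64(totalPods)) // overwrite, don't accumulate
    errored.Set(float64(errorPods))
}

func main() {
    var total, errored gauge
    // First sweep: three orphaned pods found, one fails cleanup.
    sweep([]bool{true, false, true}, &total, &errored)
    fmt.Println(total, errored) // 3 1
    // Next sweep: no orphans left; both gauges reset to the new snapshot.
    sweep(nil, &total, &errored)
    fmt.Println(total, errored) // 0 0
}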

pkg/kubelet/metrics/metrics.go

@@ -102,6 +102,10 @@ const (
    TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total"
    TopologyManagerAdmissionDurationKey    = "topology_manager_admission_duration_ms"
    // Metrics to track orphan pod cleanup
    orphanPodCleanedVolumesKey       = "orphan_pod_cleaned_volumes"
    orphanPodCleanedVolumesErrorsKey = "orphan_pod_cleaned_volumes_errors"
    // Values used in metric labels
    Container     = "container"
    InitContainer = "init_container"
@@ -649,6 +653,25 @@ var (
            StabilityLevel: metrics.ALPHA,
        },
    )
    // OrphanPodCleanedVolumes is the number of orphaned Pods whose volumes the
    // kubelet attempted to clean up via removeOrphanedPodVolumeDirs during the
    // last sweep.
    OrphanPodCleanedVolumes = metrics.NewGauge(
        &metrics.GaugeOpts{
            Subsystem:      KubeletSubsystem,
            Name:           orphanPodCleanedVolumesKey,
            Help:           "The total number of orphaned Pods whose volumes were cleaned in the last periodic sweep.",
            StabilityLevel: metrics.ALPHA,
        },
    )
    // OrphanPodCleanedVolumesErrors is the number of orphaned Pods whose volume
    // cleanup failed during the last sweep.
    OrphanPodCleanedVolumesErrors = metrics.NewGauge(
        &metrics.GaugeOpts{
            Subsystem:      KubeletSubsystem,
            Name:           orphanPodCleanedVolumesErrorsKey,
            Help:           "The number of orphaned Pods whose volumes failed to be cleaned in the last periodic sweep.",
            StabilityLevel: metrics.ALPHA,
        },
    )
)
var registerMetrics sync.Once
@@ -709,6 +732,8 @@ func Register(collectors ...metrics.StableCollector) {
    legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
    legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal)
    legacyregistry.MustRegister(TopologyManagerAdmissionDuration)
    legacyregistry.MustRegister(OrphanPodCleanedVolumes)
    legacyregistry.MustRegister(OrphanPodCleanedVolumesErrors)
    for _, collector := range collectors {
        legacyregistry.CustomMustRegister(collector)
    }
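For context, registration goes through k8s.io/component-base/metrics and its legacyregistry, the same framework as every other kubelet metric. A self-contained sketch of that pattern outside the kubelet (the "demo" subsystem and metric name are invented for the example and are not kubelet metrics):

// Standalone sketch of the component-base gauge pattern used above.
package main

import (
    "log"
    "net/http"

    "k8s.io/component-base/metrics"
    "k8s.io/component-base/metrics/legacyregistry"
)

// exampleGauge is a made-up metric, defined the same way as the ones above.
var exampleGauge = metrics.NewGauge(
    &metrics.GaugeOpts{
        Subsystem:      "demo",
        Name:           "orphan_sweep_example",
        Help:           "Example gauge defined and registered like the kubelet metrics above.",
        StabilityLevel: metrics.ALPHA,
    },
)

func main() {
    legacyregistry.MustRegister(exampleGauge)
    exampleGauge.Set(42)
    http.Handle("/metrics", legacyregistry.Handler())
    log.Fatal(http.ListenAndServe(":8080", nil))
}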