Merge pull request #115972 from jsafrane/add-orphan-pod-metrics

Add metric for failed orphan pod cleanup
This commit is contained in:
Kubernetes Prow Robot 2023-03-09 22:43:26 -08:00 committed by GitHub
commit a408be817f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 43 additions and 0 deletions

View File

@ -28,6 +28,7 @@ import (
"k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/sets"
"k8s.io/klog/v2" "k8s.io/klog/v2"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubernetes/pkg/util/removeall" "k8s.io/kubernetes/pkg/util/removeall"
"k8s.io/kubernetes/pkg/volume" "k8s.io/kubernetes/pkg/volume"
volumetypes "k8s.io/kubernetes/pkg/volume/util/types" volumetypes "k8s.io/kubernetes/pkg/volume/util/types"
@ -181,16 +182,21 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
orphanRemovalErrors := []error{} orphanRemovalErrors := []error{}
orphanVolumeErrors := []error{} orphanVolumeErrors := []error{}
var totalPods, errorPods int
for _, uid := range found { for _, uid := range found {
if allPods.Has(string(uid)) { if allPods.Has(string(uid)) {
continue continue
} }
totalPods++
// If volumes have not been unmounted/detached, do not delete directory. // If volumes have not been unmounted/detached, do not delete directory.
// Doing so may result in corruption of data. // Doing so may result in corruption of data.
// TODO: getMountedVolumePathListFromDisk() call may be redundant with // TODO: getMountedVolumePathListFromDisk() call may be redundant with
// kl.getPodVolumePathListFromDisk(). Can this be cleaned up? // kl.getPodVolumePathListFromDisk(). Can this be cleaned up?
if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist { if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist {
errorPods++
klog.V(3).InfoS("Orphaned pod found, but volumes are not cleaned up", "podUID", uid) klog.V(3).InfoS("Orphaned pod found, but volumes are not cleaned up", "podUID", uid)
continue continue
} }
@ -198,6 +204,7 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
// Attempt to remove the pod volumes directory and its subdirs // Attempt to remove the pod volumes directory and its subdirs
podVolumeErrors := kl.removeOrphanedPodVolumeDirs(uid) podVolumeErrors := kl.removeOrphanedPodVolumeDirs(uid)
if len(podVolumeErrors) > 0 { if len(podVolumeErrors) > 0 {
errorPods++
orphanVolumeErrors = append(orphanVolumeErrors, podVolumeErrors...) orphanVolumeErrors = append(orphanVolumeErrors, podVolumeErrors...)
// Not all volumes were removed, so don't clean up the pod directory yet. It is likely // Not all volumes were removed, so don't clean up the pod directory yet. It is likely
// that there are still mountpoints or files left which could cause removal of the pod // that there are still mountpoints or files left which could cause removal of the pod
@ -211,10 +218,13 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
podDir := kl.getPodDir(uid) podDir := kl.getPodDir(uid)
podSubdirs, err := os.ReadDir(podDir) podSubdirs, err := os.ReadDir(podDir)
if err != nil { if err != nil {
errorPods++
klog.ErrorS(err, "Could not read directory", "path", podDir) klog.ErrorS(err, "Could not read directory", "path", podDir)
orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred during reading the pod dir from disk: %v", uid, err)) orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred during reading the pod dir from disk: %v", uid, err))
continue continue
} }
var cleanupFailed bool
for _, podSubdir := range podSubdirs { for _, podSubdir := range podSubdirs {
podSubdirName := podSubdir.Name() podSubdirName := podSubdir.Name()
podSubdirPath := filepath.Join(podDir, podSubdirName) podSubdirPath := filepath.Join(podDir, podSubdirName)
@ -222,11 +232,13 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
// as this could lead to data loss in some situations. The volumes // as this could lead to data loss in some situations. The volumes
// directory should have been removed by removeOrphanedPodVolumeDirs. // directory should have been removed by removeOrphanedPodVolumeDirs.
if podSubdirName == "volumes" { if podSubdirName == "volumes" {
cleanupFailed = true
err := fmt.Errorf("volumes subdir was found after it was removed") err := fmt.Errorf("volumes subdir was found after it was removed")
klog.ErrorS(err, "Orphaned pod found, but failed to remove volumes subdir", "podUID", uid, "path", podSubdirPath) klog.ErrorS(err, "Orphaned pod found, but failed to remove volumes subdir", "podUID", uid, "path", podSubdirPath)
continue continue
} }
if err := removeall.RemoveAllOneFilesystem(kl.mounter, podSubdirPath); err != nil { if err := removeall.RemoveAllOneFilesystem(kl.mounter, podSubdirPath); err != nil {
cleanupFailed = true
klog.ErrorS(err, "Failed to remove orphaned pod subdir", "podUID", uid, "path", podSubdirPath) klog.ErrorS(err, "Failed to remove orphaned pod subdir", "podUID", uid, "path", podSubdirPath)
orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred when trying to remove subdir %q: %v", uid, podSubdirPath, err)) orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred when trying to remove subdir %q: %v", uid, podSubdirPath, err))
} }
@ -235,9 +247,13 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
// Rmdir the pod dir, which should be empty if everything above was successful // Rmdir the pod dir, which should be empty if everything above was successful
klog.V(3).InfoS("Orphaned pod found, removing", "podUID", uid) klog.V(3).InfoS("Orphaned pod found, removing", "podUID", uid)
if err := syscall.Rmdir(podDir); err != nil { if err := syscall.Rmdir(podDir); err != nil {
cleanupFailed = true
klog.ErrorS(err, "Failed to remove orphaned pod dir", "podUID", uid) klog.ErrorS(err, "Failed to remove orphaned pod dir", "podUID", uid)
orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred when trying to remove the pod directory: %v", uid, err)) orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred when trying to remove the pod directory: %v", uid, err))
} }
if cleanupFailed {
errorPods++
}
} }
logSpew := func(errs []error) { logSpew := func(errs []error) {
@ -250,5 +266,7 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
} }
logSpew(orphanVolumeErrors) logSpew(orphanVolumeErrors)
logSpew(orphanRemovalErrors) logSpew(orphanRemovalErrors)
metrics.OrphanPodCleanedVolumes.Set(float64(totalPods))
metrics.OrphanPodCleanedVolumesErrors.Set(float64(errorPods))
return utilerrors.NewAggregate(orphanRemovalErrors) return utilerrors.NewAggregate(orphanRemovalErrors)
} }

View File

@ -102,6 +102,10 @@ const (
TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total" TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total"
TopologyManagerAdmissionDurationKey = "topology_manager_admission_duration_ms" TopologyManagerAdmissionDurationKey = "topology_manager_admission_duration_ms"
// Metrics to track orphan pod cleanup
orphanPodCleanedVolumesKey = "orphan_pod_cleaned_volumes"
orphanPodCleanedVolumesErrorsKey = "orphan_pod_cleaned_volumes_errors"
// Values used in metric labels // Values used in metric labels
Container = "container" Container = "container"
InitContainer = "init_container" InitContainer = "init_container"
@ -649,6 +653,25 @@ var (
StabilityLevel: metrics.ALPHA, StabilityLevel: metrics.ALPHA,
}, },
) )
// OrphanPodCleanedVolumes is the number of orphaned Pods whose volumes were cleaned by removeOrphanedPodVolumeDirs during the last periodic sweep.
OrphanPodCleanedVolumes = metrics.NewGauge(
&metrics.GaugeOpts{
Subsystem: KubeletSubsystem,
Name: orphanPodCleanedVolumesKey,
Help: "The total number of orphaned Pods whose volumes were cleaned in the last periodic sweep.",
StabilityLevel: metrics.ALPHA,
},
)
// OrphanPodCleanedVolumesErrors is the number of orphaned Pods whose volumes failed to be cleaned by removeOrphanedPodVolumeDirs during the last periodic sweep.
OrphanPodCleanedVolumesErrors = metrics.NewGauge(
&metrics.GaugeOpts{
Subsystem: KubeletSubsystem,
Name: orphanPodCleanedVolumesErrorsKey,
Help: "The number of orphaned Pods whose volumes failed to be cleaned in the last periodic sweep.",
StabilityLevel: metrics.ALPHA,
},
)
) )
var registerMetrics sync.Once var registerMetrics sync.Once
@ -709,6 +732,8 @@ func Register(collectors ...metrics.StableCollector) {
legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal) legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal) legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal)
legacyregistry.MustRegister(TopologyManagerAdmissionDuration) legacyregistry.MustRegister(TopologyManagerAdmissionDuration)
legacyregistry.MustRegister(OrphanPodCleanedVolumes)
legacyregistry.MustRegister(OrphanPodCleanedVolumesErrors)
for _, collector := range collectors { for _, collector := range collectors {
legacyregistry.CustomMustRegister(collector) legacyregistry.CustomMustRegister(collector)