mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-22 19:31:44 +00:00
Merge pull request #115972 from jsafrane/add-orphan-pod-metrics
Add metric for failed orphan pod cleanup
This commit is contained in:
commit
a408be817f
@ -28,6 +28,7 @@ import (
|
|||||||
"k8s.io/apimachinery/pkg/util/sets"
|
"k8s.io/apimachinery/pkg/util/sets"
|
||||||
"k8s.io/klog/v2"
|
"k8s.io/klog/v2"
|
||||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||||
|
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||||
"k8s.io/kubernetes/pkg/util/removeall"
|
"k8s.io/kubernetes/pkg/util/removeall"
|
||||||
"k8s.io/kubernetes/pkg/volume"
|
"k8s.io/kubernetes/pkg/volume"
|
||||||
volumetypes "k8s.io/kubernetes/pkg/volume/util/types"
|
volumetypes "k8s.io/kubernetes/pkg/volume/util/types"
|
||||||
@ -181,16 +182,21 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
|
|||||||
|
|
||||||
orphanRemovalErrors := []error{}
|
orphanRemovalErrors := []error{}
|
||||||
orphanVolumeErrors := []error{}
|
orphanVolumeErrors := []error{}
|
||||||
|
var totalPods, errorPods int
|
||||||
|
|
||||||
for _, uid := range found {
|
for _, uid := range found {
|
||||||
if allPods.Has(string(uid)) {
|
if allPods.Has(string(uid)) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
totalPods++
|
||||||
|
|
||||||
// If volumes have not been unmounted/detached, do not delete directory.
|
// If volumes have not been unmounted/detached, do not delete directory.
|
||||||
// Doing so may result in corruption of data.
|
// Doing so may result in corruption of data.
|
||||||
// TODO: getMountedVolumePathListFromDisk() call may be redundant with
|
// TODO: getMountedVolumePathListFromDisk() call may be redundant with
|
||||||
// kl.getPodVolumePathListFromDisk(). Can this be cleaned up?
|
// kl.getPodVolumePathListFromDisk(). Can this be cleaned up?
|
||||||
if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist {
|
if podVolumesExist := kl.podVolumesExist(uid); podVolumesExist {
|
||||||
|
errorPods++
|
||||||
klog.V(3).InfoS("Orphaned pod found, but volumes are not cleaned up", "podUID", uid)
|
klog.V(3).InfoS("Orphaned pod found, but volumes are not cleaned up", "podUID", uid)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@ -198,6 +204,7 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
|
|||||||
// Attempt to remove the pod volumes directory and its subdirs
|
// Attempt to remove the pod volumes directory and its subdirs
|
||||||
podVolumeErrors := kl.removeOrphanedPodVolumeDirs(uid)
|
podVolumeErrors := kl.removeOrphanedPodVolumeDirs(uid)
|
||||||
if len(podVolumeErrors) > 0 {
|
if len(podVolumeErrors) > 0 {
|
||||||
|
errorPods++
|
||||||
orphanVolumeErrors = append(orphanVolumeErrors, podVolumeErrors...)
|
orphanVolumeErrors = append(orphanVolumeErrors, podVolumeErrors...)
|
||||||
// Not all volumes were removed, so don't clean up the pod directory yet. It is likely
|
// Not all volumes were removed, so don't clean up the pod directory yet. It is likely
|
||||||
// that there are still mountpoints or files left which could cause removal of the pod
|
// that there are still mountpoints or files left which could cause removal of the pod
|
||||||
@ -211,10 +218,13 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
|
|||||||
podDir := kl.getPodDir(uid)
|
podDir := kl.getPodDir(uid)
|
||||||
podSubdirs, err := os.ReadDir(podDir)
|
podSubdirs, err := os.ReadDir(podDir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
errorPods++
|
||||||
klog.ErrorS(err, "Could not read directory", "path", podDir)
|
klog.ErrorS(err, "Could not read directory", "path", podDir)
|
||||||
orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred during reading the pod dir from disk: %v", uid, err))
|
orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred during reading the pod dir from disk: %v", uid, err))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var cleanupFailed bool
|
||||||
for _, podSubdir := range podSubdirs {
|
for _, podSubdir := range podSubdirs {
|
||||||
podSubdirName := podSubdir.Name()
|
podSubdirName := podSubdir.Name()
|
||||||
podSubdirPath := filepath.Join(podDir, podSubdirName)
|
podSubdirPath := filepath.Join(podDir, podSubdirName)
|
||||||
@ -222,11 +232,13 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
|
|||||||
// as this could lead to data loss in some situations. The volumes
|
// as this could lead to data loss in some situations. The volumes
|
||||||
// directory should have been removed by removeOrphanedPodVolumeDirs.
|
// directory should have been removed by removeOrphanedPodVolumeDirs.
|
||||||
if podSubdirName == "volumes" {
|
if podSubdirName == "volumes" {
|
||||||
|
cleanupFailed = true
|
||||||
err := fmt.Errorf("volumes subdir was found after it was removed")
|
err := fmt.Errorf("volumes subdir was found after it was removed")
|
||||||
klog.ErrorS(err, "Orphaned pod found, but failed to remove volumes subdir", "podUID", uid, "path", podSubdirPath)
|
klog.ErrorS(err, "Orphaned pod found, but failed to remove volumes subdir", "podUID", uid, "path", podSubdirPath)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if err := removeall.RemoveAllOneFilesystem(kl.mounter, podSubdirPath); err != nil {
|
if err := removeall.RemoveAllOneFilesystem(kl.mounter, podSubdirPath); err != nil {
|
||||||
|
cleanupFailed = true
|
||||||
klog.ErrorS(err, "Failed to remove orphaned pod subdir", "podUID", uid, "path", podSubdirPath)
|
klog.ErrorS(err, "Failed to remove orphaned pod subdir", "podUID", uid, "path", podSubdirPath)
|
||||||
orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred when trying to remove subdir %q: %v", uid, podSubdirPath, err))
|
orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred when trying to remove subdir %q: %v", uid, podSubdirPath, err))
|
||||||
}
|
}
|
||||||
@ -235,9 +247,13 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
|
|||||||
// Rmdir the pod dir, which should be empty if everything above was successful
|
// Rmdir the pod dir, which should be empty if everything above was successful
|
||||||
klog.V(3).InfoS("Orphaned pod found, removing", "podUID", uid)
|
klog.V(3).InfoS("Orphaned pod found, removing", "podUID", uid)
|
||||||
if err := syscall.Rmdir(podDir); err != nil {
|
if err := syscall.Rmdir(podDir); err != nil {
|
||||||
|
cleanupFailed = true
|
||||||
klog.ErrorS(err, "Failed to remove orphaned pod dir", "podUID", uid)
|
klog.ErrorS(err, "Failed to remove orphaned pod dir", "podUID", uid)
|
||||||
orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred when trying to remove the pod directory: %v", uid, err))
|
orphanRemovalErrors = append(orphanRemovalErrors, fmt.Errorf("orphaned pod %q found, but error occurred when trying to remove the pod directory: %v", uid, err))
|
||||||
}
|
}
|
||||||
|
if cleanupFailed {
|
||||||
|
errorPods++
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
logSpew := func(errs []error) {
|
logSpew := func(errs []error) {
|
||||||
@ -250,5 +266,7 @@ func (kl *Kubelet) cleanupOrphanedPodDirs(pods []*v1.Pod, runningPods []*kubecon
|
|||||||
}
|
}
|
||||||
logSpew(orphanVolumeErrors)
|
logSpew(orphanVolumeErrors)
|
||||||
logSpew(orphanRemovalErrors)
|
logSpew(orphanRemovalErrors)
|
||||||
|
metrics.OrphanPodCleanedVolumes.Set(float64(totalPods))
|
||||||
|
metrics.OrphanPodCleanedVolumesErrors.Set(float64(errorPods))
|
||||||
return utilerrors.NewAggregate(orphanRemovalErrors)
|
return utilerrors.NewAggregate(orphanRemovalErrors)
|
||||||
}
|
}
|
||||||
|
@ -102,6 +102,10 @@ const (
|
|||||||
TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total"
|
TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total"
|
||||||
TopologyManagerAdmissionDurationKey = "topology_manager_admission_duration_ms"
|
TopologyManagerAdmissionDurationKey = "topology_manager_admission_duration_ms"
|
||||||
|
|
||||||
|
// Metrics to track orphan pod cleanup
|
||||||
|
orphanPodCleanedVolumesKey = "orphan_pod_cleaned_volumes"
|
||||||
|
orphanPodCleanedVolumesErrorsKey = "orphan_pod_cleaned_volumes_errors"
|
||||||
|
|
||||||
// Values used in metric labels
|
// Values used in metric labels
|
||||||
Container = "container"
|
Container = "container"
|
||||||
InitContainer = "init_container"
|
InitContainer = "init_container"
|
||||||
@ -649,6 +653,25 @@ var (
|
|||||||
StabilityLevel: metrics.ALPHA,
|
StabilityLevel: metrics.ALPHA,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// OrphanPodCleanedVolumes is the number of orphaned Pods that cleanupOrphanedPodDirs attempted to clean up during the last sweep.
|
||||||
|
OrphanPodCleanedVolumes = metrics.NewGauge(
|
||||||
|
&metrics.GaugeOpts{
|
||||||
|
Subsystem: KubeletSubsystem,
|
||||||
|
Name: orphanPodCleanedVolumesKey,
|
||||||
|
Help: "The total number of orphaned Pods whose volumes were cleaned in the last periodic sweep.",
|
||||||
|
StabilityLevel: metrics.ALPHA,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
// OrphanPodCleanedVolumesErrors is the number of orphaned Pods whose cleanup failed during the last sweep.
|
||||||
|
OrphanPodCleanedVolumesErrors = metrics.NewGauge(
|
||||||
|
&metrics.GaugeOpts{
|
||||||
|
Subsystem: KubeletSubsystem,
|
||||||
|
Name: orphanPodCleanedVolumesErrorsKey,
|
||||||
|
Help: "The number of orphaned Pods whose volumes failed to be cleaned in the last periodic sweep.",
|
||||||
|
StabilityLevel: metrics.ALPHA,
|
||||||
|
},
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
var registerMetrics sync.Once
|
var registerMetrics sync.Once
|
||||||
@ -709,6 +732,8 @@ func Register(collectors ...metrics.StableCollector) {
|
|||||||
legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
|
legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
|
||||||
legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal)
|
legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal)
|
||||||
legacyregistry.MustRegister(TopologyManagerAdmissionDuration)
|
legacyregistry.MustRegister(TopologyManagerAdmissionDuration)
|
||||||
|
legacyregistry.MustRegister(OrphanPodCleanedVolumes)
|
||||||
|
legacyregistry.MustRegister(OrphanPodCleanedVolumesErrors)
|
||||||
|
|
||||||
for _, collector := range collectors {
|
for _, collector := range collectors {
|
||||||
legacyregistry.CustomMustRegister(collector)
|
legacyregistry.CustomMustRegister(collector)
|
||||||
|
Loading…
Reference in New Issue
Block a user