Add metrics for force cleaned mounts after failed reconstruction

Count the number of force-cleaned mounts, and the number of those cleanups
that failed, after a volume fails reconstruction.
This commit is contained in:
Jan Safranek 2023-02-22 13:00:35 +01:00
parent bd73aee9db
commit 9ca548fcf0
2 changed files with 32 additions and 11 deletions

View File

@ -31,8 +31,10 @@ const (
// Metric keys for Volume Manager. // Metric keys for Volume Manager.
volumeManagerTotalVolumes = "volume_manager_total_volumes" volumeManagerTotalVolumes = "volume_manager_total_volumes"
reconstructedVolumesTotal = "reconstructed_volumes_total" reconstructVolumeOperationsTotal = "reconstruct_volume_operations_total"
reconstructedVolumesErrorsTotal = "reconstructed_volumes_errors_total" reconstructVolumeOperationsErrorsTotal = "reconstruct_volume_operations_errors_total"
forceCleanedFailedVolumeOperationsTotal = "force_cleaned_failed_volume_operations_total"
forceCleanedFailedVolumeOperationsErrorsTotal = "force_cleaned_failed_volume_operation_errors_total"
) )
var ( var (
@ -46,20 +48,35 @@ var (
metrics.ALPHA, "", metrics.ALPHA, "",
) )
ReconstructedVolumesTotal = metrics.NewCounter( ReconstructVolumeOperationsTotal = metrics.NewCounter(
&metrics.CounterOpts{ &metrics.CounterOpts{
Name: reconstructedVolumesTotal, Name: reconstructVolumeOperationsTotal,
Help: "The number of volumes that were attempted to be reconstructed from the operating system during kubelet startup. This includes both successful and failed reconstruction.", Help: "The number of volumes that were attempted to be reconstructed from the operating system during kubelet startup. This includes both successful and failed reconstruction.",
StabilityLevel: metrics.ALPHA, StabilityLevel: metrics.ALPHA,
}, },
) )
ReconstructedVolumesErrorsTotal = metrics.NewCounter( ReconstructVolumeOperationsErrorsTotal = metrics.NewCounter(
&metrics.CounterOpts{ &metrics.CounterOpts{
Name: reconstructedVolumesErrorsTotal, Name: reconstructVolumeOperationsErrorsTotal,
Help: "The number of volumes that failed reconstruction from the operating system during kubelet startup.", Help: "The number of volumes that failed reconstruction from the operating system during kubelet startup.",
StabilityLevel: metrics.ALPHA, StabilityLevel: metrics.ALPHA,
}, },
) )
ForceCleanedFailedVolumeOperationsTotal = metrics.NewCounter(
&metrics.CounterOpts{
Name: forceCleanedFailedVolumeOperationsTotal,
Help: "The number of volumes that were force cleaned after their reconstruction failed during kubelet startup. This includes both successful and failed cleanups.",
StabilityLevel: metrics.ALPHA,
},
)
ForceCleanedFailedVolumeOperationsErrorsTotal = metrics.NewCounter(
&metrics.CounterOpts{
Name: forceCleanedFailedVolumeOperationsErrorsTotal,
Help: "The number of volumes that failed force cleanup after their reconstruction failed during kubelet startup.",
StabilityLevel: metrics.ALPHA,
},
)
) )
// volumeCount is a map of maps used as a counter. // volumeCount is a map of maps used as a counter.
@ -78,8 +95,10 @@ func (v volumeCount) add(state, plugin string) {
func Register(asw cache.ActualStateOfWorld, dsw cache.DesiredStateOfWorld, pluginMgr *volume.VolumePluginMgr) { func Register(asw cache.ActualStateOfWorld, dsw cache.DesiredStateOfWorld, pluginMgr *volume.VolumePluginMgr) {
registerMetrics.Do(func() { registerMetrics.Do(func() {
legacyregistry.CustomMustRegister(&totalVolumesCollector{asw: asw, dsw: dsw, pluginMgr: pluginMgr}) legacyregistry.CustomMustRegister(&totalVolumesCollector{asw: asw, dsw: dsw, pluginMgr: pluginMgr})
legacyregistry.MustRegister(ReconstructedVolumesTotal) legacyregistry.MustRegister(ReconstructVolumeOperationsTotal)
legacyregistry.MustRegister(ReconstructedVolumesErrorsTotal) legacyregistry.MustRegister(ReconstructVolumeOperationsErrorsTotal)
legacyregistry.MustRegister(ForceCleanedFailedVolumeOperationsTotal)
legacyregistry.MustRegister(ForceCleanedFailedVolumeOperationsErrorsTotal)
}) })
} }

View File

@ -95,10 +95,12 @@ func (rc *reconciler) cleanupMounts(volume podVolume) {
PluginName: volume.pluginName, PluginName: volume.pluginName,
PodUID: types.UID(volume.podName), PodUID: types.UID(volume.podName),
} }
metrics.ForceCleanedFailedVolumeOperationsTotal.Inc()
// TODO: Currently cleanupMounts only includes UnmountVolume operation. In the next PR, we will add // TODO: Currently cleanupMounts only includes UnmountVolume operation. In the next PR, we will add
// to unmount both volume and device in the same routine. // to unmount both volume and device in the same routine.
err := rc.operationExecutor.UnmountVolume(mountedVolume, rc.actualStateOfWorld, rc.kubeletPodsDir) err := rc.operationExecutor.UnmountVolume(mountedVolume, rc.actualStateOfWorld, rc.kubeletPodsDir)
if err != nil { if err != nil {
metrics.ForceCleanedFailedVolumeOperationsErrorsTotal.Inc()
klog.ErrorS(err, mountedVolume.GenerateErrorDetailed("volumeHandler.UnmountVolumeHandler for UnmountVolume failed", err).Error()) klog.ErrorS(err, mountedVolume.GenerateErrorDetailed("volumeHandler.UnmountVolumeHandler for UnmountVolume failed", err).Error())
return return
} }
@ -179,10 +181,10 @@ func getVolumesFromPodDir(podDir string) ([]podVolume, error) {
// Reconstruct volume data structure by reading the pod's volume directories // Reconstruct volume data structure by reading the pod's volume directories
func (rc *reconciler) reconstructVolume(volume podVolume) (rvolume *reconstructedVolume, rerr error) { func (rc *reconciler) reconstructVolume(volume podVolume) (rvolume *reconstructedVolume, rerr error) {
metrics.ReconstructedVolumesTotal.Inc() metrics.ReconstructVolumeOperationsTotal.Inc()
defer func() { defer func() {
if rerr != nil { if rerr != nil {
metrics.ReconstructedVolumesErrorsTotal.Inc() metrics.ReconstructVolumeOperationsErrorsTotal.Inc()
} }
}() }()