diff --git a/pkg/kubelet/volumemanager/metrics/metrics.go b/pkg/kubelet/volumemanager/metrics/metrics.go index e9e4bd04fb6..dd849c71245 100644 --- a/pkg/kubelet/volumemanager/metrics/metrics.go +++ b/pkg/kubelet/volumemanager/metrics/metrics.go @@ -30,7 +30,11 @@ const ( pluginNameNotAvailable = "N/A" // Metric keys for Volume Manager. - volumeManagerTotalVolumes = "volume_manager_total_volumes" + volumeManagerTotalVolumes = "volume_manager_total_volumes" + reconstructVolumeOperationsTotal = "reconstruct_volume_operations_total" + reconstructVolumeOperationsErrorsTotal = "reconstruct_volume_operations_errors_total" + forceCleanedFailedVolumeOperationsTotal = "force_cleaned_failed_volume_operations_total" + forceCleanedFailedVolumeOperationsErrorsTotal = "force_cleaned_failed_volume_operation_errors_total" ) var ( @@ -43,6 +47,36 @@ var ( nil, metrics.ALPHA, "", ) + + ReconstructVolumeOperationsTotal = metrics.NewCounter( + &metrics.CounterOpts{ + Name: reconstructVolumeOperationsTotal, + Help: "The number of volumes that were attempted to be reconstructed from the operating system during kubelet startup. This includes both successful and failed reconstruction.", + StabilityLevel: metrics.ALPHA, + }, + ) + ReconstructVolumeOperationsErrorsTotal = metrics.NewCounter( + &metrics.CounterOpts{ + Name: reconstructVolumeOperationsErrorsTotal, + Help: "The number of volumes that failed reconstruction from the operating system during kubelet startup.", + StabilityLevel: metrics.ALPHA, + }, + ) + + ForceCleanedFailedVolumeOperationsTotal = metrics.NewCounter( + &metrics.CounterOpts{ + Name: forceCleanedFailedVolumeOperationsTotal, + Help: "The number of volumes that were force cleaned after their reconstruction failed during kubelet startup. This includes both successful and failed cleanups.", + StabilityLevel: metrics.ALPHA, + }, + ) + ForceCleanedFailedVolumeOperationsErrorsTotal = metrics.NewCounter( + &metrics.CounterOpts{ + Name: forceCleanedFailedVolumeOperationsErrorsTotal, + Help: "The number of volumes that failed force cleanup after their reconstruction failed during kubelet startup.", + StabilityLevel: metrics.ALPHA, + }, + ) ) // volumeCount is a map of maps used as a counter. @@ -61,6 +95,10 @@ func (v volumeCount) add(state, plugin string) { func Register(asw cache.ActualStateOfWorld, dsw cache.DesiredStateOfWorld, pluginMgr *volume.VolumePluginMgr) { registerMetrics.Do(func() { legacyregistry.CustomMustRegister(&totalVolumesCollector{asw: asw, dsw: dsw, pluginMgr: pluginMgr}) + legacyregistry.MustRegister(ReconstructVolumeOperationsTotal) + legacyregistry.MustRegister(ReconstructVolumeOperationsErrorsTotal) + legacyregistry.MustRegister(ForceCleanedFailedVolumeOperationsTotal) + legacyregistry.MustRegister(ForceCleanedFailedVolumeOperationsErrorsTotal) }) } diff --git a/pkg/kubelet/volumemanager/reconciler/reconstruct_common.go b/pkg/kubelet/volumemanager/reconciler/reconstruct_common.go index a4f5a444ddd..255b89410cb 100644 --- a/pkg/kubelet/volumemanager/reconciler/reconstruct_common.go +++ b/pkg/kubelet/volumemanager/reconciler/reconstruct_common.go @@ -28,6 +28,7 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/kubelet/config" + "k8s.io/kubernetes/pkg/kubelet/volumemanager/metrics" volumepkg "k8s.io/kubernetes/pkg/volume" "k8s.io/kubernetes/pkg/volume/util" "k8s.io/kubernetes/pkg/volume/util/operationexecutor" @@ -94,10 +95,12 @@ func (rc *reconciler) cleanupMounts(volume podVolume) { PluginName: volume.pluginName, PodUID: types.UID(volume.podName), } + metrics.ForceCleanedFailedVolumeOperationsTotal.Inc() // TODO: Currently cleanupMounts only includes UnmountVolume operation. In the next PR, we will add // to unmount both volume and device in the same routine. err := rc.operationExecutor.UnmountVolume(mountedVolume, rc.actualStateOfWorld, rc.kubeletPodsDir) if err != nil { + metrics.ForceCleanedFailedVolumeOperationsErrorsTotal.Inc() klog.ErrorS(err, mountedVolume.GenerateErrorDetailed("volumeHandler.UnmountVolumeHandler for UnmountVolume failed", err).Error()) return } @@ -177,7 +180,14 @@ func getVolumesFromPodDir(podDir string) ([]podVolume, error) { } // Reconstruct volume data structure by reading the pod's volume directories -func (rc *reconciler) reconstructVolume(volume podVolume) (*reconstructedVolume, error) { +func (rc *reconciler) reconstructVolume(volume podVolume) (rvolume *reconstructedVolume, rerr error) { + metrics.ReconstructVolumeOperationsTotal.Inc() + defer func() { + if rerr != nil { + metrics.ReconstructVolumeOperationsErrorsTotal.Inc() + } + }() + // plugin initializations plugin, err := rc.volumePluginMgr.FindPluginByName(volume.pluginName) if err != nil {