From bd73aee9dbaa146cc6d1961e1d83efac9280d0c6 Mon Sep 17 00:00:00 2001 From: Jan Safranek Date: Wed, 22 Feb 2023 12:59:47 +0100 Subject: [PATCH 1/2] Add volume reconstruction metrics Count nr. of volumes that kubelet tried to reconstruct + reconstruction errors. --- pkg/kubelet/volumemanager/metrics/metrics.go | 21 ++++++++++++++++++- .../reconciler/reconstruct_common.go | 10 ++++++++- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/pkg/kubelet/volumemanager/metrics/metrics.go b/pkg/kubelet/volumemanager/metrics/metrics.go index e9e4bd04fb6..282bd8c3ec4 100644 --- a/pkg/kubelet/volumemanager/metrics/metrics.go +++ b/pkg/kubelet/volumemanager/metrics/metrics.go @@ -30,7 +30,9 @@ const ( pluginNameNotAvailable = "N/A" // Metric keys for Volume Manager. - volumeManagerTotalVolumes = "volume_manager_total_volumes" + volumeManagerTotalVolumes = "volume_manager_total_volumes" + reconstructedVolumesTotal = "reconstructed_volumes_total" + reconstructedVolumesErrorsTotal = "reconstructed_volumes_errors_total" ) var ( @@ -43,6 +45,21 @@ var ( nil, metrics.ALPHA, "", ) + + ReconstructedVolumesTotal = metrics.NewCounter( + &metrics.CounterOpts{ + Name: reconstructedVolumesTotal, + Help: "The number of volumes that were attempted to be reconstructed from the operating system during kubelet startup. This includes both successful and failed reconstruction.", + StabilityLevel: metrics.ALPHA, + }, + ) + ReconstructedVolumesErrorsTotal = metrics.NewCounter( + &metrics.CounterOpts{ + Name: reconstructedVolumesErrorsTotal, + Help: "The number of volumes that failed reconstruction from the operating system during kubelet startup.", + StabilityLevel: metrics.ALPHA, + }, + ) ) // volumeCount is a map of maps used as a counter. @@ -61,6 +78,8 @@ func (v volumeCount) add(state, plugin string) { func Register(asw cache.ActualStateOfWorld, dsw cache.DesiredStateOfWorld, pluginMgr *volume.VolumePluginMgr) { registerMetrics.Do(func() { legacyregistry.CustomMustRegister(&totalVolumesCollector{asw: asw, dsw: dsw, pluginMgr: pluginMgr}) + legacyregistry.MustRegister(ReconstructedVolumesTotal) + legacyregistry.MustRegister(ReconstructedVolumesErrorsTotal) }) } diff --git a/pkg/kubelet/volumemanager/reconciler/reconstruct_common.go b/pkg/kubelet/volumemanager/reconciler/reconstruct_common.go index a4f5a444ddd..e69d0c4f189 100644 --- a/pkg/kubelet/volumemanager/reconciler/reconstruct_common.go +++ b/pkg/kubelet/volumemanager/reconciler/reconstruct_common.go @@ -28,6 +28,7 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/kubelet/config" + "k8s.io/kubernetes/pkg/kubelet/volumemanager/metrics" volumepkg "k8s.io/kubernetes/pkg/volume" "k8s.io/kubernetes/pkg/volume/util" "k8s.io/kubernetes/pkg/volume/util/operationexecutor" @@ -177,7 +178,14 @@ func getVolumesFromPodDir(podDir string) ([]podVolume, error) { } // Reconstruct volume data structure by reading the pod's volume directories -func (rc *reconciler) reconstructVolume(volume podVolume) (*reconstructedVolume, error) { +func (rc *reconciler) reconstructVolume(volume podVolume) (rvolume *reconstructedVolume, rerr error) { + metrics.ReconstructedVolumesTotal.Inc() + defer func() { + if rerr != nil { + metrics.ReconstructedVolumesErrorsTotal.Inc() + } + }() + // plugin initializations plugin, err := rc.volumePluginMgr.FindPluginByName(volume.pluginName) if err != nil { From 9ca548fcf0885a9e30ff1fff4f034c6b8563b76f Mon Sep 17 00:00:00 2001 From: Jan Safranek Date: Wed, 22 Feb 2023 13:00:35 +0100 Subject: [PATCH 2/2] Add metrics for force cleaned mounts after failed reconstruction Count nr. of force cleaned mounts + their failures after a volume fails reconstruction. --- pkg/kubelet/volumemanager/metrics/metrics.go | 37 ++++++++++++++----- .../reconciler/reconstruct_common.go | 6 ++- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/pkg/kubelet/volumemanager/metrics/metrics.go b/pkg/kubelet/volumemanager/metrics/metrics.go index 282bd8c3ec4..dd849c71245 100644 --- a/pkg/kubelet/volumemanager/metrics/metrics.go +++ b/pkg/kubelet/volumemanager/metrics/metrics.go @@ -30,9 +30,11 @@ const ( pluginNameNotAvailable = "N/A" // Metric keys for Volume Manager. - volumeManagerTotalVolumes = "volume_manager_total_volumes" - reconstructedVolumesTotal = "reconstructed_volumes_total" - reconstructedVolumesErrorsTotal = "reconstructed_volumes_errors_total" + volumeManagerTotalVolumes = "volume_manager_total_volumes" + reconstructVolumeOperationsTotal = "reconstruct_volume_operations_total" + reconstructVolumeOperationsErrorsTotal = "reconstruct_volume_operations_errors_total" + forceCleanedFailedVolumeOperationsTotal = "force_cleaned_failed_volume_operations_total" + forceCleanedFailedVolumeOperationsErrorsTotal = "force_cleaned_failed_volume_operation_errors_total" ) var ( @@ -46,20 +48,35 @@ var ( metrics.ALPHA, "", ) - ReconstructedVolumesTotal = metrics.NewCounter( + ReconstructVolumeOperationsTotal = metrics.NewCounter( &metrics.CounterOpts{ - Name: reconstructedVolumesTotal, + Name: reconstructVolumeOperationsTotal, Help: "The number of volumes that were attempted to be reconstructed from the operating system during kubelet startup. This includes both successful and failed reconstruction.", StabilityLevel: metrics.ALPHA, }, ) - ReconstructedVolumesErrorsTotal = metrics.NewCounter( + ReconstructVolumeOperationsErrorsTotal = metrics.NewCounter( &metrics.CounterOpts{ - Name: reconstructedVolumesErrorsTotal, + Name: reconstructVolumeOperationsErrorsTotal, Help: "The number of volumes that failed reconstruction from the operating system during kubelet startup.", StabilityLevel: metrics.ALPHA, }, ) + + ForceCleanedFailedVolumeOperationsTotal = metrics.NewCounter( + &metrics.CounterOpts{ + Name: forceCleanedFailedVolumeOperationsTotal, + Help: "The number of volumes that were force cleaned after their reconstruction failed during kubelet startup. This includes both successful and failed cleanups.", + StabilityLevel: metrics.ALPHA, + }, + ) + ForceCleanedFailedVolumeOperationsErrorsTotal = metrics.NewCounter( + &metrics.CounterOpts{ + Name: forceCleanedFailedVolumeOperationsErrorsTotal, + Help: "The number of volumes that failed force cleanup after their reconstruction failed during kubelet startup.", + StabilityLevel: metrics.ALPHA, + }, + ) ) // volumeCount is a map of maps used as a counter. @@ -78,8 +95,10 @@ func (v volumeCount) add(state, plugin string) { func Register(asw cache.ActualStateOfWorld, dsw cache.DesiredStateOfWorld, pluginMgr *volume.VolumePluginMgr) { registerMetrics.Do(func() { legacyregistry.CustomMustRegister(&totalVolumesCollector{asw: asw, dsw: dsw, pluginMgr: pluginMgr}) - legacyregistry.MustRegister(ReconstructedVolumesTotal) - legacyregistry.MustRegister(ReconstructedVolumesErrorsTotal) + legacyregistry.MustRegister(ReconstructVolumeOperationsTotal) + legacyregistry.MustRegister(ReconstructVolumeOperationsErrorsTotal) + legacyregistry.MustRegister(ForceCleanedFailedVolumeOperationsTotal) + legacyregistry.MustRegister(ForceCleanedFailedVolumeOperationsErrorsTotal) }) } diff --git a/pkg/kubelet/volumemanager/reconciler/reconstruct_common.go b/pkg/kubelet/volumemanager/reconciler/reconstruct_common.go index e69d0c4f189..255b89410cb 100644 --- a/pkg/kubelet/volumemanager/reconciler/reconstruct_common.go +++ b/pkg/kubelet/volumemanager/reconciler/reconstruct_common.go @@ -95,10 +95,12 @@ func (rc *reconciler) cleanupMounts(volume podVolume) { PluginName: volume.pluginName, PodUID: types.UID(volume.podName), } + metrics.ForceCleanedFailedVolumeOperationsTotal.Inc() // TODO: Currently cleanupMounts only includes UnmountVolume operation. In the next PR, we will add // to unmount both volume and device in the same routine. err := rc.operationExecutor.UnmountVolume(mountedVolume, rc.actualStateOfWorld, rc.kubeletPodsDir) if err != nil { + metrics.ForceCleanedFailedVolumeOperationsErrorsTotal.Inc() klog.ErrorS(err, mountedVolume.GenerateErrorDetailed("volumeHandler.UnmountVolumeHandler for UnmountVolume failed", err).Error()) return } @@ -179,10 +181,10 @@ func getVolumesFromPodDir(podDir string) ([]podVolume, error) { // Reconstruct volume data structure by reading the pod's volume directories func (rc *reconciler) reconstructVolume(volume podVolume) (rvolume *reconstructedVolume, rerr error) { - metrics.ReconstructedVolumesTotal.Inc() + metrics.ReconstructVolumeOperationsTotal.Inc() defer func() { if rerr != nil { - metrics.ReconstructedVolumesErrorsTotal.Inc() + metrics.ReconstructVolumeOperationsErrorsTotal.Inc() } }()