diff --git a/pkg/controller/volume/attachdetach/metrics/metrics.go b/pkg/controller/volume/attachdetach/metrics/metrics.go index bd420e31764..f0c487a7a2b 100644 --- a/pkg/controller/volume/attachdetach/metrics/metrics.go +++ b/pkg/controller/volume/attachdetach/metrics/metrics.go @@ -35,6 +35,14 @@ import ( const pluginNameNotAvailable = "N/A" +const ( + // Force detach reason is timeout + ForceDetachReasonTimeout = "timeout" + // Force detach reason is the node has an out-of-service taint + ForceDetachReasonOutOfService = "out-of-service" + attachDetachController = "attach_detach_controller" +) + var ( inUseVolumeMetricDesc = metrics.NewDesc( metrics.BuildFQName("", "storage_count", "attachable_volumes_in_use"), @@ -48,12 +56,15 @@ var ( []string{"plugin_name", "state"}, nil, metrics.ALPHA, "") - forcedDetachMetricCounter = metrics.NewCounter( + ForceDetachMetricCounter = metrics.NewCounterVec( &metrics.CounterOpts{ + Subsystem: attachDetachController, Name: "attachdetach_controller_forced_detaches", Help: "Number of times the A/D Controller performed a forced detach", StabilityLevel: metrics.ALPHA, - }) + }, + []string{"reason"}, + ) ) var registerMetrics sync.Once @@ -75,7 +86,7 @@ func Register(pvcLister corelisters.PersistentVolumeClaimLister, pluginMgr, csiMigratedPluginManager, intreeToCSITranslator)) - legacyregistry.MustRegister(forcedDetachMetricCounter) + legacyregistry.MustRegister(ForceDetachMetricCounter) }) } @@ -209,6 +220,6 @@ func (collector *attachDetachStateCollector) getTotalVolumesCount() volumeCount } // RecordForcedDetachMetric register a forced detach metric. -func RecordForcedDetachMetric() { - forcedDetachMetricCounter.Inc() +func RecordForcedDetachMetric(forceDetachReason string) { + ForceDetachMetricCounter.WithLabelValues(forceDetachReason).Inc() } diff --git a/pkg/controller/volume/attachdetach/reconciler/reconciler.go b/pkg/controller/volume/attachdetach/reconciler/reconciler.go index 89e4e847205..cc3de29db85 100644 --- a/pkg/controller/volume/attachdetach/reconciler/reconciler.go +++ b/pkg/controller/volume/attachdetach/reconciler/reconciler.go @@ -269,14 +269,21 @@ func (rc *reconciler) reconcile(ctx context.Context) { verifySafeToDetach := !(timeout || hasOutOfServiceTaint) err = rc.attacherDetacher.DetachVolume(logger, attachedVolume.AttachedVolume, verifySafeToDetach, rc.actualStateOfWorld) if err == nil { - if !timeout { + if verifySafeToDetach { // normal detach logger.Info("attacherDetacher.DetachVolume started", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName) - } else { - metrics.RecordForcedDetachMetric() - logger.Info("attacherDetacher.DetachVolume started: this volume is not safe to detach, but maxWaitForUnmountDuration expired, force detaching", - "duration", rc.maxWaitForUnmountDuration, - "node", klog.KRef("", string(attachedVolume.NodeName)), - "volumeName", attachedVolume.VolumeName) + } else { // force detach + if timeout { + metrics.RecordForcedDetachMetric(metrics.ForceDetachReasonTimeout) + logger.Info("attacherDetacher.DetachVolume started: this volume is not safe to detach, but maxWaitForUnmountDuration expired, force detaching", + "duration", rc.maxWaitForUnmountDuration, + "node", klog.KRef("", string(attachedVolume.NodeName)), + "volumeName", attachedVolume.VolumeName) + } else { + metrics.RecordForcedDetachMetric(metrics.ForceDetachReasonOutOfService) + logger.Info("attacherDetacher.DetachVolume started: node has out-of-service taint, force detaching", + "node", klog.KRef("", string(attachedVolume.NodeName)), + "volumeName", attachedVolume.VolumeName) + } } } if err != nil { diff --git a/pkg/controller/volume/attachdetach/reconciler/reconciler_test.go b/pkg/controller/volume/attachdetach/reconciler/reconciler_test.go index 2110f66fa5e..9ef77a84d81 100644 --- a/pkg/controller/volume/attachdetach/reconciler/reconciler_test.go +++ b/pkg/controller/volume/attachdetach/reconciler/reconciler_test.go @@ -19,6 +19,7 @@ package reconciler import ( "context" "fmt" + "sync" "testing" "time" @@ -30,10 +31,13 @@ import ( "k8s.io/client-go/informers" "k8s.io/client-go/tools/record" featuregatetesting "k8s.io/component-base/featuregate/testing" + "k8s.io/component-base/metrics/legacyregistry" + metricstestutil "k8s.io/component-base/metrics/testutil" "k8s.io/klog/v2" "k8s.io/klog/v2/ktesting" "k8s.io/kubernetes/pkg/controller" "k8s.io/kubernetes/pkg/controller/volume/attachdetach/cache" + "k8s.io/kubernetes/pkg/controller/volume/attachdetach/metrics" "k8s.io/kubernetes/pkg/controller/volume/attachdetach/statusupdater" controllervolumetesting "k8s.io/kubernetes/pkg/controller/volume/attachdetach/testing" "k8s.io/kubernetes/pkg/features" @@ -51,6 +55,8 @@ const ( volumeAttachedCheckTimeout = 5 * time.Second ) +var registerMetrics sync.Once + // Calls Run() // Verifies there are no calls to attach or detach. func Test_Run_Positive_DoNothing(t *testing.T) { @@ -221,6 +227,9 @@ func Test_Run_Positive_OneDesiredVolumeAttachThenDetachWithUnmountedVolume(t *te // Deletes the node/volume/pod tuple from desiredStateOfWorld cache without first marking the node/volume as unmounted. // Verifies there is one detach call and no (new) attach calls. func Test_Run_Positive_OneDesiredVolumeAttachThenDetachWithMountedVolume(t *testing.T) { + registerMetrics.Do(func() { + legacyregistry.MustRegister(metrics.ForceDetachMetricCounter) + }) // Arrange volumePluginMgr, fakePlugin := volumetesting.GetTestVolumePluginMgr(t) dsw := cache.NewDesiredStateOfWorld(volumePluginMgr) @@ -287,6 +296,9 @@ func Test_Run_Positive_OneDesiredVolumeAttachThenDetachWithMountedVolume(t *test waitForAttachCallCount(t, 1 /* expectedAttachCallCount */, fakePlugin) verifyNewDetacherCallCount(t, false /* expectZeroNewDetacherCallCount */, fakePlugin) waitForDetachCallCount(t, 1 /* expectedDetachCallCount */, fakePlugin) + + // Force detach metric due to timeout + testForceDetachMetric(t, 1, metrics.ForceDetachReasonTimeout) } // Populates desiredStateOfWorld cache with one node/volume/pod tuple. @@ -852,6 +864,9 @@ func Test_Run_OneVolumeAttachAndDetachTimeoutNodesWithReadWriteOnce(t *testing.T // Verifies there is one detach call and no (new) attach calls. func Test_Run_OneVolumeDetachOnOutOfServiceTaintedNode(t *testing.T) { defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeOutOfServiceVolumeDetach, true)() + registerMetrics.Do(func() { + legacyregistry.MustRegister(metrics.ForceDetachMetricCounter) + }) // Arrange volumePluginMgr, fakePlugin := volumetesting.GetTestVolumePluginMgr(t) dsw := cache.NewDesiredStateOfWorld(volumePluginMgr) @@ -920,6 +935,9 @@ func Test_Run_OneVolumeDetachOnOutOfServiceTaintedNode(t *testing.T) { waitForAttachCallCount(t, 1 /* expectedAttachCallCount */, fakePlugin) verifyNewDetacherCallCount(t, false /* expectZeroNewDetacherCallCount */, fakePlugin) waitForDetachCallCount(t, 1 /* expectedDetachCallCount */, fakePlugin) + + // Force detach metric due to out-of-service taint + testForceDetachMetric(t, 1, metrics.ForceDetachReasonOutOfService) } // Populates desiredStateOfWorld cache with one node/volume/pod tuple. @@ -1666,3 +1684,16 @@ func retryWithExponentialBackOff(initialDuration time.Duration, fn wait.Conditio } return wait.ExponentialBackoff(backoff, fn) } + +// verifies the force detach metric with reason +func testForceDetachMetric(t *testing.T, inputForceDetachMetricCounter int, reason string) { + t.Helper() + + actualForceDetachMericCounter, err := metricstestutil.GetCounterMetricValue(metrics.ForceDetachMetricCounter.WithLabelValues(reason)) + if err != nil { + t.Errorf("Error getting actualForceDetachMericCounter") + } + if actualForceDetachMericCounter != float64(inputForceDetachMetricCounter) { + t.Errorf("Expected desiredForceDetachMericCounter to be %d, got %v", inputForceDetachMetricCounter, actualForceDetachMericCounter) + } +}