Mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-08-01 07:47:56 +00:00
Merge pull request #54377 from gnufied/fix-detach-metric-flake
Automatic merge from submit-queue (batch tested with PRs 54107, 54184, 54377, 54094, 54111). If you want to cherry-pick this change to another branch, please follow the instructions here: https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md

Fix detach metric flake by not using exact equals; also poll for an increase in the detach value.

Fixes https://github.com/kubernetes/kubernetes/issues/52871

I have run these tests in a tight loop for more than 3 hours and did not see them flake. The changes drop the exact-equality check and poll until the detach metric count increases.

```release-note
None
```
Commit: 47b4f0ed8a
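Before the diff, a minimal standalone sketch of the pattern this change adopts may help: poll the controller-manager's volume_detach counter until it rises above its previous value, then assert "greater than" instead of an exact +1. This is illustrative only; `getDetachCount` and `waitForDetachIncrease` are hypothetical stand-ins, not names from this PR (the real test uses `GrabFromControllerManager` and `getControllerStorageMetrics`).

```go
// Sketch of the polling pattern, assuming Gomega and k8s.io/apimachinery's wait package.
package metricsflake

import (
	"time"

	. "github.com/onsi/gomega"
	"k8s.io/apimachinery/pkg/util/wait"
)

// getDetachCount is a hypothetical stand-in; the real test grabs
// controller-manager metrics and reads the "volume_detach" counter.
func getDetachCount() (int64, bool) {
	return 0, false
}

// waitForDetachIncrease polls until the detach counter exceeds oldCount.
func waitForDetachIncrease(oldCount int64) int64 {
	var newCount int64
	backoff := wait.Backoff{Duration: 10 * time.Second, Factor: 1.2, Steps: 21}

	waitErr := wait.ExponentialBackoff(backoff, func() (bool, error) {
		count, ok := getDetachCount()
		if !ok {
			return false, nil // metric not emitted yet; keep retrying
		}
		newCount = count
		// A busy cluster may record extra detaches, so only require an increase.
		return newCount > oldCount, nil
	})
	Expect(waitErr).NotTo(HaveOccurred(), "timed out waiting for volume_detach to increase")
	Expect(newCount).To(BeNumerically(">", oldCount))
	return newCount
}
```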
```diff
@@ -97,33 +97,9 @@ var _ = SIGDescribe("[Serial] Volume metrics", func() {
 		framework.Logf("Deleting pod %q/%q", pod.Namespace, pod.Name)
 		framework.ExpectNoError(framework.DeletePodWithWait(f, c, pod))
 
-		backoff := wait.Backoff{
-			Duration: 10 * time.Second,
-			Factor:   1.2,
-			Steps:    21,
-		}
-
-		updatedStorageMetrics := make(map[string]int64)
-
-		waitErr := wait.ExponentialBackoff(backoff, func() (bool, error) {
-			updatedMetrics, err := metricsGrabber.GrabFromControllerManager()
-
-			if err != nil {
-				framework.Logf("Error fetching controller-manager metrics")
-				return false, err
-			}
-			updatedStorageMetrics = getControllerStorageMetrics(updatedMetrics)
-			metricCount := len(updatedStorageMetrics)
-			// Usually a pod deletion does not mean immediate volume detach
-			// we will have to retry to verify volume_detach metrics
-			_, detachMetricFound := updatedStorageMetrics["volume_detach"]
-			if metricCount < 3 || !detachMetricFound {
-				framework.Logf("Volume metrics not collected yet, going to retry")
-				return false, nil
-			}
-			return true, nil
-		})
-		Expect(waitErr).NotTo(HaveOccurred(), "Error fetching storage c-m metrics : %v", waitErr)
+		updatedStorageMetrics := waitForDetachAndGrabMetrics(storageOpMetrics, metricsGrabber)
+
+		Expect(len(updatedStorageMetrics)).ToNot(Equal(0), "Error fetching c-m updated storage metrics")
 
 		volumeOperations := []string{"volume_provision", "volume_detach", "volume_attach"}
```
```diff
@@ -190,6 +166,48 @@ var _ = SIGDescribe("[Serial] Volume metrics", func() {
 	})
 })
 
+func waitForDetachAndGrabMetrics(oldMetrics map[string]int64, metricsGrabber *metrics.MetricsGrabber) map[string]int64 {
+	backoff := wait.Backoff{
+		Duration: 10 * time.Second,
+		Factor:   1.2,
+		Steps:    21,
+	}
+
+	updatedStorageMetrics := make(map[string]int64)
+	oldDetachCount, ok := oldMetrics["volume_detach"]
+	if !ok {
+		oldDetachCount = 0
+	}
+
+	verifyMetricFunc := func() (bool, error) {
+		updatedMetrics, err := metricsGrabber.GrabFromControllerManager()
+
+		if err != nil {
+			framework.Logf("Error fetching controller-manager metrics")
+			return false, err
+		}
+
+		updatedStorageMetrics = getControllerStorageMetrics(updatedMetrics)
+		newDetachCount, ok := updatedStorageMetrics["volume_detach"]
+
+		// if detach metrics are not yet there, we need to retry
+		if !ok {
+			return false, nil
+		}
+
+		// if old Detach count is more or equal to new detach count, that means detach
+		// event has not been observed yet.
+		if oldDetachCount >= newDetachCount {
+			return false, nil
+		}
+		return true, nil
+	}
+
+	waitErr := wait.ExponentialBackoff(backoff, verifyMetricFunc)
+	Expect(waitErr).NotTo(HaveOccurred(), "Timeout error fetching storage c-m metrics : %v", waitErr)
+	return updatedStorageMetrics
+}
+
 func verifyMetricCount(oldMetrics map[string]int64, newMetrics map[string]int64, metricName string) {
 	oldCount, ok := oldMetrics[metricName]
 	// if metric does not exist in oldMap, it probably hasn't been emitted yet.
```
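For context on the `wait.Backoff` values used above (Duration 10s, Factor 1.2, Steps 21): assuming `ExponentialBackoff` sleeps between attempts and grows the delay by Factor each time, the worst-case polling budget comes to roughly half an hour. A small sketch of that arithmetic (not part of the PR):

```go
// Rough worst-case wait for wait.Backoff{Duration: 10s, Factor: 1.2, Steps: 21},
// assuming one growing sleep between each pair of attempts.
package main

import (
	"fmt"
	"time"
)

func main() {
	delay := 10 * time.Second
	var total time.Duration
	for i := 0; i < 20; i++ { // 21 attempts => up to 20 sleeps in between
		total += delay
		delay = time.Duration(float64(delay) * 1.2)
	}
	fmt.Println(total.Round(time.Second)) // roughly 31 minutes
}
```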
```diff
@@ -199,8 +217,10 @@ func verifyMetricCount(oldMetrics map[string]int64, newMetrics map[string]int64,
 
 	newCount, ok := newMetrics[metricName]
 	Expect(ok).To(BeTrue(), "Error getting updated metrics for %s", metricName)
-	Expect(oldCount + 1).To(Equal(newCount))
+	// It appears that in a busy cluster some spurious detaches are unavoidable
+	// even if the test is run serially. We really just verify if new count
+	// is greater than old count
+	Expect(newCount).To(BeNumerically(">", oldCount), "New count %d should be more than old count %d for action %s", newCount, oldCount, metricName)
 }
 
 func getControllerStorageMetrics(ms metrics.ControllerManagerMetrics) map[string]int64 {
```