Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-07-30 15:05:27 +00:00)
Merge pull request #54377 from gnufied/fix-detach-metric-flake
Automatic merge from submit-queue (batch tested with PRs 54107, 54184, 54377, 54094, 54111). If you want to cherry-pick this change to another branch, please follow the instructions here: https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md

Fix detach metric flake by not using exact equals

Also poll for the detach value to increase. Fixes https://github.com/kubernetes/kubernetes/issues/52871

I have run these tests for more than 3 hours in a tight loop and did not see them flake. The changes here drop the exact-equality check and poll for an increase in the detach metric count.

```release-note
None
```
This commit is contained in: commit 47b4f0ed8a
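The idea behind the fix, distilled outside the e2e framework: remember the previously observed `volume_detach` count and poll with exponential backoff until the counter has strictly increased, instead of asserting an exact "+1" value. The sketch below is illustrative only; it uses the same `k8s.io/apimachinery/pkg/util/wait` package and backoff shape as the test, but `waitForCounterIncrease` and `fetch` are hypothetical names, not helpers from this PR.

```go
// Minimal sketch (assumptions noted above): poll a counter until it moves
// past its old value, rather than asserting an exact increment.
package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// waitForCounterIncrease retries fetch with exponential backoff until the
// returned value is strictly greater than oldCount. It uses the same backoff
// parameters as the e2e test (10s base, factor 1.2, 21 steps) and returns
// wait.ErrWaitTimeout if the counter never increases.
func waitForCounterIncrease(oldCount int64, fetch func() (int64, error)) error {
	backoff := wait.Backoff{
		Duration: 10 * time.Second,
		Factor:   1.2,
		Steps:    21,
	}
	return wait.ExponentialBackoff(backoff, func() (bool, error) {
		newCount, err := fetch()
		if err != nil {
			// A scrape error aborts the poll; the caller decides how to report it.
			return false, err
		}
		// Not done yet if the counter has not moved past its old value.
		return newCount > oldCount, nil
	})
}

func main() {
	// Fake metric source that increments on every scrape, so the poll
	// succeeds on the first attempt.
	var value int64
	fetch := func() (int64, error) {
		value++
		return value, nil
	}
	fmt.Println("poll result:", waitForCounterIncrease(0, fetch))
}
```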
```diff
@@ -97,33 +97,9 @@ var _ = SIGDescribe("[Serial] Volume metrics", func() {
 		framework.Logf("Deleting pod %q/%q", pod.Namespace, pod.Name)
 		framework.ExpectNoError(framework.DeletePodWithWait(f, c, pod))
 
-		backoff := wait.Backoff{
-			Duration: 10 * time.Second,
-			Factor:   1.2,
-			Steps:    21,
-		}
+		updatedStorageMetrics := waitForDetachAndGrabMetrics(storageOpMetrics, metricsGrabber)
 
-		updatedStorageMetrics := make(map[string]int64)
-
-		waitErr := wait.ExponentialBackoff(backoff, func() (bool, error) {
-			updatedMetrics, err := metricsGrabber.GrabFromControllerManager()
-
-			if err != nil {
-				framework.Logf("Error fetching controller-manager metrics")
-				return false, err
-			}
-			updatedStorageMetrics = getControllerStorageMetrics(updatedMetrics)
-			metricCount := len(updatedStorageMetrics)
-			// Usually a pod deletion does not mean immediate volume detach
-			// we will have to retry to verify volume_detach metrics
-			_, detachMetricFound := updatedStorageMetrics["volume_detach"]
-			if metricCount < 3 || !detachMetricFound {
-				framework.Logf("Volume metrics not collected yet, going to retry")
-				return false, nil
-			}
-			return true, nil
-		})
-		Expect(waitErr).NotTo(HaveOccurred(), "Error fetching storage c-m metrics : %v", waitErr)
 		Expect(len(updatedStorageMetrics)).ToNot(Equal(0), "Error fetching c-m updated storage metrics")
 
 		volumeOperations := []string{"volume_provision", "volume_detach", "volume_attach"}
```
```diff
@@ -190,6 +166,48 @@ var _ = SIGDescribe("[Serial] Volume metrics", func() {
 	})
 })
 
+func waitForDetachAndGrabMetrics(oldMetrics map[string]int64, metricsGrabber *metrics.MetricsGrabber) map[string]int64 {
+	backoff := wait.Backoff{
+		Duration: 10 * time.Second,
+		Factor:   1.2,
+		Steps:    21,
+	}
+
+	updatedStorageMetrics := make(map[string]int64)
+	oldDetachCount, ok := oldMetrics["volume_detach"]
+	if !ok {
+		oldDetachCount = 0
+	}
+
+	verifyMetricFunc := func() (bool, error) {
+		updatedMetrics, err := metricsGrabber.GrabFromControllerManager()
+
+		if err != nil {
+			framework.Logf("Error fetching controller-manager metrics")
+			return false, err
+		}
+
+		updatedStorageMetrics = getControllerStorageMetrics(updatedMetrics)
+		newDetachCount, ok := updatedStorageMetrics["volume_detach"]
+
+		// if detach metrics are not yet there, we need to retry
+		if !ok {
+			return false, nil
+		}
+
+		// if old Detach count is more or equal to new detach count, that means detach
+		// event has not been observed yet.
+		if oldDetachCount >= newDetachCount {
+			return false, nil
+		}
+		return true, nil
+	}
+
+	waitErr := wait.ExponentialBackoff(backoff, verifyMetricFunc)
+	Expect(waitErr).NotTo(HaveOccurred(), "Timeout error fetching storage c-m metrics : %v", waitErr)
+	return updatedStorageMetrics
+}
+
 func verifyMetricCount(oldMetrics map[string]int64, newMetrics map[string]int64, metricName string) {
 	oldCount, ok := oldMetrics[metricName]
 	// if metric does not exist in oldMap, it probably hasn't been emitted yet.
```
```diff
@@ -199,8 +217,10 @@ func verifyMetricCount(oldMetrics map[string]int64, newMetrics map[string]int64, metricName string) {
 
 	newCount, ok := newMetrics[metricName]
 	Expect(ok).To(BeTrue(), "Error getting updated metrics for %s", metricName)
-
-	Expect(oldCount + 1).To(Equal(newCount))
+	// It appears that in a busy cluster some spurious detaches are unavoidable
+	// even if the test is run serially. We really just verify if new count
+	// is greater than old count
+	Expect(newCount).To(BeNumerically(">", oldCount), "New count %d should be more than old count %d for action %s", newCount, oldCount, metricName)
 }
 
 func getControllerStorageMetrics(ms metrics.ControllerManagerMetrics) map[string]int64 {
```