Merge pull request #127193 from DP19/ignore-unready-pods-hpa-containermetrics

allow ContainerResource calculations to continue with missing metrics like Resource calculations
2025-09-23 19:08:44 +00:00 · 2024-12-13 15:18:25 +01:00
parent e8312395f0 f97abdbee0
commit 77749c21f6
4 changed files with 41 additions and 22 deletions
--- a/pkg/controller/podautoscaler/metrics/client.go
+++ b/pkg/controller/podautoscaler/metrics/client.go
@@ -75,10 +75,7 @@ func (c *resourceMetricsClient) GetResourceMetric(ctx context.Context, resource
 	}
 	var res PodMetricsInfo
 	if container != "" {
-		res, err = getContainerMetrics(metrics.Items, resource, container)
-		if err != nil {
-			return nil, time.Time{}, fmt.Errorf("failed to get container metrics: %v", err)
-		}
+		res = getContainerMetrics(ctx, metrics.Items, resource, container)
 	} else {
 		res = getPodMetrics(ctx, metrics.Items, resource)
 	}
@@ -86,7 +83,7 @@ func (c *resourceMetricsClient) GetResourceMetric(ctx context.Context, resource
 	return res, timestamp, nil
 }

-func getContainerMetrics(rawMetrics []metricsapi.PodMetrics, resource v1.ResourceName, container string) (PodMetricsInfo, error) {
+func getContainerMetrics(ctx context.Context, rawMetrics []metricsapi.PodMetrics, resource v1.ResourceName, container string) PodMetricsInfo {
 	res := make(PodMetricsInfo, len(rawMetrics))
 	for _, m := range rawMetrics {
 		containerFound := false
@@ -104,10 +101,10 @@ func getContainerMetrics(rawMetrics []metricsapi.PodMetrics, resource v1.Resourc
 			}
 		}
 		if !containerFound {
-			return nil, fmt.Errorf("container %s not present in metrics for pod %s/%s", container, m.Namespace, m.Name)
+			klog.FromContext(ctx).V(2).Info("Missing container metric", "container", container, "pod", klog.KRef(m.Namespace, m.Name))
 		}
 	}
-	return res, nil
+	return res
 }

 func getPodMetrics(ctx context.Context, rawMetrics []metricsapi.PodMetrics, resource v1.ResourceName) PodMetricsInfo {
--- a/pkg/controller/podautoscaler/metrics/client_test.go
+++ b/pkg/controller/podautoscaler/metrics/client_test.go
@@ -429,7 +429,22 @@ func TestRESTClientContainerCPUEmptyMetricsForOnePod(t *testing.T) {
 		container:          "test-1",
 		targetTimestamp:    targetTimestamp,
 		window:             window,
-		desiredError:       fmt.Errorf("failed to get container metrics"),
+		reportedPodMetrics: []map[string]int64{{"test-1": 100}, {"test-1": 300, "test-2": 400}, {}},
+	}
+	tc.runTest(t)
+}
+
+func TestRESTClientContainerCPUEmptyMetricsForOnePodReturnsOnlyFoundContainer(t *testing.T) {
+	targetTimestamp := 1
+	window := 30 * time.Second
+	tc := restClientTestCase{
+		resourceName: v1.ResourceCPU,
+		desiredMetricValues: PodMetricsInfo{
+			"test-pod-1": {Value: 400, Timestamp: offsetTimestampBy(targetTimestamp), Window: window},
+		},
+		container:          "test-2",
+		targetTimestamp:    targetTimestamp,
+		window:             window,
 		reportedPodMetrics: []map[string]int64{{"test-1": 100}, {"test-1": 300, "test-2": 400}, {}},
 	}
 	tc.runTest(t)
--- a/pkg/controller/podautoscaler/metrics/interfaces.go
+++ b/pkg/controller/podautoscaler/metrics/interfaces.go
@@ -41,6 +41,7 @@ type MetricsClient interface {
 	// GetResourceMetric gets the given resource metric (and an associated oldest timestamp)
 	// for the specified named container in all pods matching the specified selector in the given namespace and when
 	// the container is an empty string it returns the sum of all the container metrics.
+	// Missing metrics will not error and callers should rely on Pod status to filter metrics info returned from this interface.
 	GetResourceMetric(ctx context.Context, resource v1.ResourceName, namespace string, selector labels.Selector, container string) (PodMetricsInfo, time.Time, error)

 	// GetRawMetric gets the given metric (and an associated oldest timestamp)
--- a/pkg/controller/podautoscaler/replica_calculator_test.go
+++ b/pkg/controller/podautoscaler/replica_calculator_test.go
@@ -446,20 +446,6 @@ func TestReplicaCalcDisjointResourcesMetrics(t *testing.T) {
 	tc.runTest(t)
 }

-func TestReplicaCalcMissingContainerMetricError(t *testing.T) {
-	tc := replicaCalcTestCase{
-		currentReplicas: 1,
-		expectedError:   fmt.Errorf("container container2 not present in metrics for pod test-namespace/test-pod-0"),
-		resource: &resourceInfo{
-			name:     v1.ResourceCPU,
-			requests: []resource.Quantity{resource.MustParse("1.0")},
-			levels:   [][]int64{{0}},
-		},
-		container: "container2",
-	}
-	tc.runTest(t)
-}
-
 func TestReplicaCalcScaleUp(t *testing.T) {
 	tc := replicaCalcTestCase{
 		currentReplicas:  3,
@@ -606,6 +592,26 @@ func TestReplicaCalcScaleUpIgnoresFailedPods(t *testing.T) {
 	tc.runTest(t)
 }

+func TestReplicaCalcMissingContainerMetricScaleUpIgnoresFailedPods(t *testing.T) {
+	tc := replicaCalcTestCase{
+		currentReplicas:  2,
+		expectedReplicas: 4,
+		podReadiness:     []v1.ConditionStatus{v1.ConditionTrue, v1.ConditionTrue, v1.ConditionFalse, v1.ConditionFalse},
+		podPhase:         []v1.PodPhase{v1.PodRunning, v1.PodRunning, v1.PodFailed, v1.PodFailed},
+		resource: &resourceInfo{
+			name:     v1.ResourceCPU,
+			requests: []resource.Quantity{resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0")},
+			levels:   [][]int64{{1000, 500}, {9000, 700}, {1000}, {9000}},
+
+			targetUtilization:   30,
+			expectedUtilization: 60,
+			expectedValue:       600,
+		},
+		container: "container2",
+	}
+	tc.runTest(t)
+}
+
 func TestReplicaCalcScaleUpContainerIgnoresFailedPods(t *testing.T) {
 	tc := replicaCalcTestCase{
 		currentReplicas:  2,