From 80e279d353c9fa7c77ac6f624f826ed827d76c60 Mon Sep 17 00:00:00 2001 From: Joseph Burnett Date: Tue, 2 Jul 2019 20:15:58 +0200 Subject: [PATCH] Ignore pending pods. This change adds pending pods to the ignored set first before selecting pods missing metrics. Pending pods are always ignored when calculating scale. When the HPA decides which pods and metric values to take into account when scaling, it divides the pods into three disjoint subsets: 1) ready 2) missing metrics and 3) ignored. First the HPA selects pods which are missing metrics. Then it selects pods that should be ignored because they are not ready yet, or are still consuming CPU during initialization. All the remaining pods go into the ready set. After the HPA has decided what direction it wants to scale based on the ready pods, it considers what might have happened if it had the missing metrics. It makes a conservative guess about what the missing metrics might have been, 0% if it wants to scale up--100% if it wants to scale down. This is a good thing when scaling up, because newly added pods will likely help reduce the usage ratio, even though their metrics are missing at the moment. The HPA should wait to see the results of its previous scale decision before it makes another one. However when scaling down, it means that many missing metrics can pin the HPA at high scale, even when load is completely removed. In particular, when there are many unschedulable pods due to insufficient cluster capacity, the many missing metrics (assumed to be 100%) can cause the HPA to avoid scaling down indefinitely. 
--- .../podautoscaler/replica_calculator.go | 9 ++- .../podautoscaler/replica_calculator_test.go | 61 +++++++++++++++---- 2 files changed, 58 insertions(+), 12 deletions(-) diff --git a/pkg/controller/podautoscaler/replica_calculator.go b/pkg/controller/podautoscaler/replica_calculator.go index d479a4edd84..991688de6dd 100644 --- a/pkg/controller/podautoscaler/replica_calculator.go +++ b/pkg/controller/podautoscaler/replica_calculator.go @@ -22,7 +22,7 @@ import ( "time" autoscaling "k8s.io/api/autoscaling/v2beta2" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/sets" @@ -365,11 +365,18 @@ func groupPods(pods []*v1.Pod, metrics metricsclient.PodMetricsInfo, resource v1 if pod.DeletionTimestamp != nil || pod.Status.Phase == v1.PodFailed { continue } + // Pending pods are ignored. + if pod.Status.Phase == v1.PodPending { + ignoredPods.Insert(pod.Name) + continue + } + // Pods missing metrics. metric, found := metrics[pod.Name] if !found { missingPods.Insert(pod.Name) continue } + // Unready pods are ignored. 
if resource == v1.ResourceCPU { var ignorePod bool _, condition := podutil.GetPodCondition(&pod.Status, v1.PodReady) diff --git a/pkg/controller/podautoscaler/replica_calculator_test.go b/pkg/controller/podautoscaler/replica_calculator_test.go index c07d7b2bb5c..22272766038 100644 --- a/pkg/controller/podautoscaler/replica_calculator_test.go +++ b/pkg/controller/podautoscaler/replica_calculator_test.go @@ -23,7 +23,7 @@ import ( "time" autoscalingv2 "k8s.io/api/autoscaling/v2beta2" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/meta/testrestmapper" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -862,6 +862,25 @@ func TestReplicaCalcScaleDownIncludeUnreadyPods(t *testing.T) { tc.runTest(t) } +func TestReplicaCalcScaleDownExcludeUnscheduledPods(t *testing.T) { + tc := replicaCalcTestCase{ + currentReplicas: 5, + expectedReplicas: 1, + podReadiness: []v1.ConditionStatus{v1.ConditionTrue, v1.ConditionFalse, v1.ConditionFalse, v1.ConditionFalse, v1.ConditionFalse}, + podPhase: []v1.PodPhase{v1.PodRunning, v1.PodPending, v1.PodPending, v1.PodPending, v1.PodPending}, + resource: &resourceInfo{ + name: v1.ResourceCPU, + requests: []resource.Quantity{resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0")}, + levels: []int64{100}, + + targetUtilization: 50, + expectedUtilization: 10, + expectedValue: numContainersPerPod * 100, + }, + } + tc.runTest(t) +} + func TestReplicaCalcScaleDownIgnoreHotCpuPods(t *testing.T) { tc := replicaCalcTestCase{ currentReplicas: 5, @@ -1616,18 +1635,38 @@ func TestGroupPods(t *testing.T) { sets.NewString("lucretius"), sets.NewString("epicurus"), }, + { + name: "pending pods are ignored", + pods: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "unscheduled", + }, + Status: v1.PodStatus{ + Phase: v1.PodPending, + }, + }, + }, + metrics: metricsclient.PodMetricsInfo{}, + resource: 
v1.ResourceCPU, + expectReadyPodCount: 0, + expectIgnoredPods: sets.NewString("unscheduled"), + expectMissingPods: sets.NewString(), + }, } for _, tc := range tests { - readyPodCount, ignoredPods, missingPods := groupPods(tc.pods, tc.metrics, tc.resource, defaultTestingCpuInitializationPeriod, defaultTestingDelayOfInitialReadinessStatus) - if readyPodCount != tc.expectReadyPodCount { - t.Errorf("%s got readyPodCount %d, expected %d", tc.name, readyPodCount, tc.expectReadyPodCount) - } - if !ignoredPods.Equal(tc.expectIgnoredPods) { - t.Errorf("%s got unreadyPods %v, expected %v", tc.name, ignoredPods, tc.expectIgnoredPods) - } - if !missingPods.Equal(tc.expectMissingPods) { - t.Errorf("%s got missingPods %v, expected %v", tc.name, missingPods, tc.expectMissingPods) - } + t.Run(tc.name, func(t *testing.T) { + readyPodCount, ignoredPods, missingPods := groupPods(tc.pods, tc.metrics, tc.resource, defaultTestingCpuInitializationPeriod, defaultTestingDelayOfInitialReadinessStatus) + if readyPodCount != tc.expectReadyPodCount { + t.Errorf("%s got readyPodCount %d, expected %d", tc.name, readyPodCount, tc.expectReadyPodCount) + } + if !ignoredPods.Equal(tc.expectIgnoredPods) { + t.Errorf("%s got unreadyPods %v, expected %v", tc.name, ignoredPods, tc.expectIgnoredPods) + } + if !missingPods.Equal(tc.expectMissingPods) { + t.Errorf("%s got missingPods %v, expected %v", tc.name, missingPods, tc.expectMissingPods) + } + }) } }