Merge pull request #60886 from mattjmcnaughton/mattjmcnaughton/59975-object-metrics-ignore-unready-pods

Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

Ignore unready pods when calculating desired replicas

**What this PR does / why we need it**:

This PR causes `GetExternalMetricReplicas` and `GetObjectMetricReplicas` to ignore unready pods when computing the number of desired replicas. If we don't ignore unready pods, there is a risk of overscaling. See the commit messages for examples and implementation info.

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*:
Fixes #59975 

**Special notes for your reviewer**:
@MaciekPytel and I consciously chose to save `GetExternalPerPodMetricReplicas` for a separate PR, as we aren't definite on what is the preferred behavior.

**Release note**:

```release-note
Unready pods will no longer impact the number of desired replicas when using horizontal auto-scaling with external metrics or object metrics.
```
This commit is contained in:
Kubernetes Submit Queue 2018-03-21 01:34:23 -07:00 committed by GitHub
commit a7d788d91f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 115 additions and 24 deletions

View File

@ -223,7 +223,7 @@ func (a *HorizontalController) computeReplicasForMetrics(hpa *autoscalingv2.Hori
switch metricSpec.Type { switch metricSpec.Type {
case autoscalingv2.ObjectMetricSourceType: case autoscalingv2.ObjectMetricSourceType:
replicaCountProposal, utilizationProposal, timestampProposal, err = a.replicaCalc.GetObjectMetricReplicas(currentReplicas, metricSpec.Object.TargetValue.MilliValue(), metricSpec.Object.MetricName, hpa.Namespace, &metricSpec.Object.Target) replicaCountProposal, utilizationProposal, timestampProposal, err = a.replicaCalc.GetObjectMetricReplicas(currentReplicas, metricSpec.Object.TargetValue.MilliValue(), metricSpec.Object.MetricName, hpa.Namespace, &metricSpec.Object.Target, selector)
if err != nil { if err != nil {
a.eventRecorder.Event(hpa, v1.EventTypeWarning, "FailedGetObjectMetric", err.Error()) a.eventRecorder.Event(hpa, v1.EventTypeWarning, "FailedGetObjectMetric", err.Error())
setCondition(hpa, autoscalingv2.ScalingActive, v1.ConditionFalse, "FailedGetObjectMetric", "the HPA was unable to compute the replica count: %v", err) setCondition(hpa, autoscalingv2.ScalingActive, v1.ConditionFalse, "FailedGetObjectMetric", "the HPA was unable to compute the replica count: %v", err)
@ -317,7 +317,7 @@ func (a *HorizontalController) computeReplicasForMetrics(hpa *autoscalingv2.Hori
}, },
} }
} else if metricSpec.External.TargetValue != nil { } else if metricSpec.External.TargetValue != nil {
replicaCountProposal, utilizationProposal, timestampProposal, err = a.replicaCalc.GetExternalMetricReplicas(currentReplicas, metricSpec.External.TargetValue.MilliValue(), metricSpec.External.MetricName, hpa.Namespace, metricSpec.External.MetricSelector) replicaCountProposal, utilizationProposal, timestampProposal, err = a.replicaCalc.GetExternalMetricReplicas(currentReplicas, metricSpec.External.TargetValue.MilliValue(), metricSpec.External.MetricName, hpa.Namespace, metricSpec.External.MetricSelector, selector)
if err != nil { if err != nil {
a.eventRecorder.Event(hpa, v1.EventTypeWarning, "FailedGetExternalMetric", err.Error()) a.eventRecorder.Event(hpa, v1.EventTypeWarning, "FailedGetExternalMetric", err.Error())
setCondition(hpa, autoscalingv2.ScalingActive, v1.ConditionFalse, "FailedGetExternalMetric", "the HPA was unable to compute the replica count: %v", err) setCondition(hpa, autoscalingv2.ScalingActive, v1.ConditionFalse, "FailedGetExternalMetric", "the HPA was unable to compute the replica count: %v", err)

View File

@ -246,16 +246,32 @@ func (tc *testCase) prepareTestClient(t *testing.T) (*fake.Clientset, *metricsfa
defer tc.Unlock() defer tc.Unlock()
obj := &v1.PodList{} obj := &v1.PodList{}
for i := 0; i < len(tc.reportedCPURequests); i++ {
specifiedCPURequests := tc.reportedCPURequests != nil
numPodsToCreate := int(tc.initialReplicas)
if specifiedCPURequests {
numPodsToCreate = len(tc.reportedCPURequests)
}
for i := 0; i < numPodsToCreate; i++ {
podReadiness := v1.ConditionTrue podReadiness := v1.ConditionTrue
if tc.reportedPodReadiness != nil { if tc.reportedPodReadiness != nil {
podReadiness = tc.reportedPodReadiness[i] podReadiness = tc.reportedPodReadiness[i]
} }
podPhase := v1.PodRunning podPhase := v1.PodRunning
if tc.reportedPodPhase != nil { if tc.reportedPodPhase != nil {
podPhase = tc.reportedPodPhase[i] podPhase = tc.reportedPodPhase[i]
} }
podName := fmt.Sprintf("%s-%d", podNamePrefix, i) podName := fmt.Sprintf("%s-%d", podNamePrefix, i)
reportedCPURequest := resource.MustParse("1.0")
if specifiedCPURequests {
reportedCPURequest = tc.reportedCPURequests[i]
}
pod := v1.Pod{ pod := v1.Pod{
Status: v1.PodStatus{ Status: v1.PodStatus{
Phase: podPhase, Phase: podPhase,
@ -273,12 +289,13 @@ func (tc *testCase) prepareTestClient(t *testing.T) (*fake.Clientset, *metricsfa
"name": podNamePrefix, "name": podNamePrefix,
}, },
}, },
Spec: v1.PodSpec{ Spec: v1.PodSpec{
Containers: []v1.Container{ Containers: []v1.Container{
{ {
Resources: v1.ResourceRequirements{ Resources: v1.ResourceRequirements{
Requests: v1.ResourceList{ Requests: v1.ResourceList{
v1.ResourceCPU: tc.reportedCPURequests[i], v1.ResourceCPU: reportedCPURequest,
}, },
}, },
}, },
@ -1348,13 +1365,14 @@ func TestEmptyMetrics(t *testing.T) {
func TestEmptyCPURequest(t *testing.T) { func TestEmptyCPURequest(t *testing.T) {
tc := testCase{ tc := testCase{
minReplicas: 1, minReplicas: 1,
maxReplicas: 5, maxReplicas: 5,
initialReplicas: 1, initialReplicas: 1,
desiredReplicas: 1, desiredReplicas: 1,
CPUTarget: 100, CPUTarget: 100,
reportedLevels: []uint64{200}, reportedLevels: []uint64{200},
useMetricsAPI: true, reportedCPURequests: []resource.Quantity{},
useMetricsAPI: true,
expectedConditions: []autoscalingv1.HorizontalPodAutoscalerCondition{ expectedConditions: []autoscalingv1.HorizontalPodAutoscalerCondition{
{Type: autoscalingv1.AbleToScale, Status: v1.ConditionTrue, Reason: "SucceededGetScale"}, {Type: autoscalingv1.AbleToScale, Status: v1.ConditionTrue, Reason: "SucceededGetScale"},
{Type: autoscalingv1.ScalingActive, Status: v1.ConditionFalse, Reason: "FailedGetResourceMetric"}, {Type: autoscalingv1.ScalingActive, Status: v1.ConditionFalse, Reason: "FailedGetResourceMetric"},

View File

@ -276,7 +276,7 @@ func (c *ReplicaCalculator) calcPlainMetricReplicas(metrics metricsclient.PodMet
// GetObjectMetricReplicas calculates the desired replica count based on a target metric utilization (as a milli-value) // GetObjectMetricReplicas calculates the desired replica count based on a target metric utilization (as a milli-value)
// for the given object in the given namespace, and the current replica count. // for the given object in the given namespace, and the current replica count.
func (c *ReplicaCalculator) GetObjectMetricReplicas(currentReplicas int32, targetUtilization int64, metricName string, namespace string, objectRef *autoscaling.CrossVersionObjectReference) (replicaCount int32, utilization int64, timestamp time.Time, err error) { func (c *ReplicaCalculator) GetObjectMetricReplicas(currentReplicas int32, targetUtilization int64, metricName string, namespace string, objectRef *autoscaling.CrossVersionObjectReference, selector labels.Selector) (replicaCount int32, utilization int64, timestamp time.Time, err error) {
utilization, timestamp, err = c.metricsClient.GetObjectMetric(metricName, namespace, objectRef) utilization, timestamp, err = c.metricsClient.GetObjectMetric(metricName, namespace, objectRef)
if err != nil { if err != nil {
return 0, 0, time.Time{}, fmt.Errorf("unable to get metric %s: %v on %s %s/%s", metricName, objectRef.Kind, namespace, objectRef.Name, err) return 0, 0, time.Time{}, fmt.Errorf("unable to get metric %s: %v on %s %s/%s", metricName, objectRef.Kind, namespace, objectRef.Name, err)
@ -287,48 +287,85 @@ func (c *ReplicaCalculator) GetObjectMetricReplicas(currentReplicas int32, targe
// return the current replicas if the change would be too small // return the current replicas if the change would be too small
return currentReplicas, utilization, timestamp, nil return currentReplicas, utilization, timestamp, nil
} }
replicaCount = int32(math.Ceil(usageRatio * float64(currentReplicas)))
readyPodCount, err := c.getReadyPodsCount(namespace, selector)
if err != nil {
return 0, 0, time.Time{}, fmt.Errorf("unable to calculate ready pods: %s", err)
}
replicaCount = int32(math.Ceil(usageRatio * float64(readyPodCount)))
return replicaCount, utilization, timestamp, nil return replicaCount, utilization, timestamp, nil
} }
// @TODO(mattjmcnaughton) Many different functions in this module use variations
// of this function. Make this function generic, so we don't repeat the same
// logic in multiple places.
func (c *ReplicaCalculator) getReadyPodsCount(namespace string, selector labels.Selector) (int64, error) {
podList, err := c.podsGetter.Pods(namespace).List(metav1.ListOptions{LabelSelector: selector.String()})
if err != nil {
return 0, fmt.Errorf("unable to get pods while calculating replica count: %v", err)
}
if len(podList.Items) == 0 {
return 0, fmt.Errorf("no pods returned by selector while calculating replica count")
}
readyPodCount := 0
for _, pod := range podList.Items {
if pod.Status.Phase == v1.PodRunning && podutil.IsPodReady(&pod) {
readyPodCount++
}
}
return int64(readyPodCount), nil
}
// GetExternalMetricReplicas calculates the desired replica count based on a // GetExternalMetricReplicas calculates the desired replica count based on a
// target metric value (as a milli-value) for the external metric in the given // target metric value (as a milli-value) for the external metric in the given
// namespace, and the current replica count. // namespace, and the current replica count.
func (c *ReplicaCalculator) GetExternalMetricReplicas(currentReplicas int32, targetUtilization int64, metricName, namespace string, selector *metav1.LabelSelector) (replicaCount int32, utilization int64, timestamp time.Time, err error) { func (c *ReplicaCalculator) GetExternalMetricReplicas(currentReplicas int32, targetUtilization int64, metricName, namespace string, metricSelector *metav1.LabelSelector, podSelector labels.Selector) (replicaCount int32, utilization int64, timestamp time.Time, err error) {
labelSelector, err := metav1.LabelSelectorAsSelector(selector) metricLabelSelector, err := metav1.LabelSelectorAsSelector(metricSelector)
if err != nil { if err != nil {
return 0, 0, time.Time{}, err return 0, 0, time.Time{}, err
} }
metrics, timestamp, err := c.metricsClient.GetExternalMetric(metricName, namespace, labelSelector) metrics, timestamp, err := c.metricsClient.GetExternalMetric(metricName, namespace, metricLabelSelector)
if err != nil { if err != nil {
return 0, 0, time.Time{}, fmt.Errorf("unable to get external metric %s/%s/%+v: %s", namespace, metricName, selector, err) return 0, 0, time.Time{}, fmt.Errorf("unable to get external metric %s/%s/%+v: %s", namespace, metricName, metricSelector, err)
} }
utilization = 0 utilization = 0
for _, val := range metrics { for _, val := range metrics {
utilization = utilization + val utilization = utilization + val
} }
readyPodCount, err := c.getReadyPodsCount(namespace, podSelector)
if err != nil {
return 0, 0, time.Time{}, fmt.Errorf("unable to calculate ready pods: %s", err)
}
usageRatio := float64(utilization) / float64(targetUtilization) usageRatio := float64(utilization) / float64(targetUtilization)
if math.Abs(1.0-usageRatio) <= c.tolerance { if math.Abs(1.0-usageRatio) <= c.tolerance {
// return the current replicas if the change would be too small // return the current replicas if the change would be too small
return currentReplicas, utilization, timestamp, nil return currentReplicas, utilization, timestamp, nil
} }
return int32(math.Ceil(usageRatio * float64(currentReplicas))), utilization, timestamp, nil return int32(math.Ceil(usageRatio * float64(readyPodCount))), utilization, timestamp, nil
} }
// GetExternalPerPodMetricReplicas calculates the desired replica count based on a // GetExternalPerPodMetricReplicas calculates the desired replica count based on a
// target metric value per pod (as a milli-value) for the external metric in the // target metric value per pod (as a milli-value) for the external metric in the
// given namespace, and the current replica count. // given namespace, and the current replica count.
func (c *ReplicaCalculator) GetExternalPerPodMetricReplicas(currentReplicas int32, targetUtilizationPerPod int64, metricName, namespace string, selector *metav1.LabelSelector) (replicaCount int32, utilization int64, timestamp time.Time, err error) { func (c *ReplicaCalculator) GetExternalPerPodMetricReplicas(currentReplicas int32, targetUtilizationPerPod int64, metricName, namespace string, metricSelector *metav1.LabelSelector) (replicaCount int32, utilization int64, timestamp time.Time, err error) {
labelSelector, err := metav1.LabelSelectorAsSelector(selector) metricLabelSelector, err := metav1.LabelSelectorAsSelector(metricSelector)
if err != nil { if err != nil {
return 0, 0, time.Time{}, err return 0, 0, time.Time{}, err
} }
metrics, timestamp, err := c.metricsClient.GetExternalMetric(metricName, namespace, labelSelector) metrics, timestamp, err := c.metricsClient.GetExternalMetric(metricName, namespace, metricLabelSelector)
if err != nil { if err != nil {
return 0, 0, time.Time{}, fmt.Errorf("unable to get external metric %s/%s/%+v: %s", namespace, metricName, selector, err) return 0, 0, time.Time{}, fmt.Errorf("unable to get external metric %s/%s/%+v: %s", namespace, metricName, metricSelector, err)
} }
utilization = 0 utilization = 0
for _, val := range metrics { for _, val := range metrics {

View File

@ -323,10 +323,10 @@ func (tc *replicaCalcTestCase) runTest(t *testing.T) {
var outTimestamp time.Time var outTimestamp time.Time
var err error var err error
if tc.metric.singleObject != nil { if tc.metric.singleObject != nil {
outReplicas, outUtilization, outTimestamp, err = replicaCalc.GetObjectMetricReplicas(tc.currentReplicas, tc.metric.targetUtilization, tc.metric.name, testNamespace, tc.metric.singleObject) outReplicas, outUtilization, outTimestamp, err = replicaCalc.GetObjectMetricReplicas(tc.currentReplicas, tc.metric.targetUtilization, tc.metric.name, testNamespace, tc.metric.singleObject, selector)
} else if tc.metric.selector != nil { } else if tc.metric.selector != nil {
if tc.metric.targetUtilization > 0 { if tc.metric.targetUtilization > 0 {
outReplicas, outUtilization, outTimestamp, err = replicaCalc.GetExternalMetricReplicas(tc.currentReplicas, tc.metric.targetUtilization, tc.metric.name, testNamespace, tc.metric.selector) outReplicas, outUtilization, outTimestamp, err = replicaCalc.GetExternalMetricReplicas(tc.currentReplicas, tc.metric.targetUtilization, tc.metric.name, testNamespace, tc.metric.selector, selector)
} else if tc.metric.perPodTargetUtilization > 0 { } else if tc.metric.perPodTargetUtilization > 0 {
outReplicas, outUtilization, outTimestamp, err = replicaCalc.GetExternalPerPodMetricReplicas(tc.currentReplicas, tc.metric.perPodTargetUtilization, tc.metric.name, testNamespace, tc.metric.selector) outReplicas, outUtilization, outTimestamp, err = replicaCalc.GetExternalPerPodMetricReplicas(tc.currentReplicas, tc.metric.perPodTargetUtilization, tc.metric.name, testNamespace, tc.metric.selector)
} }
@ -497,6 +497,26 @@ func TestReplicaCalcScaleUpCMObject(t *testing.T) {
tc.runTest(t) tc.runTest(t)
} }
func TestReplicaCalcScaleUpCMObjectIgnoresUnreadyPods(t *testing.T) {
tc := replicaCalcTestCase{
currentReplicas: 3,
expectedReplicas: 5, // If we did not ignore unready pods, we'd expect 15 replicas.
podReadiness: []v1.ConditionStatus{v1.ConditionFalse, v1.ConditionTrue, v1.ConditionFalse},
metric: &metricInfo{
name: "qps",
levels: []int64{50000},
targetUtilization: 10000,
expectedUtilization: 50000,
singleObject: &autoscalingv2.CrossVersionObjectReference{
Kind: "Deployment",
APIVersion: "extensions/v1beta1",
Name: "some-deployment",
},
},
}
tc.runTest(t)
}
func TestReplicaCalcScaleUpCMExternal(t *testing.T) { func TestReplicaCalcScaleUpCMExternal(t *testing.T) {
tc := replicaCalcTestCase{ tc := replicaCalcTestCase{
currentReplicas: 1, currentReplicas: 1,
@ -512,6 +532,22 @@ func TestReplicaCalcScaleUpCMExternal(t *testing.T) {
tc.runTest(t) tc.runTest(t)
} }
func TestReplicaCalcScaleUpCMExternalIgnoresUnreadyPods(t *testing.T) {
tc := replicaCalcTestCase{
currentReplicas: 3,
expectedReplicas: 2, // Would expect 6 if we didn't ignore unready pods
podReadiness: []v1.ConditionStatus{v1.ConditionFalse, v1.ConditionTrue, v1.ConditionFalse},
metric: &metricInfo{
name: "qps",
levels: []int64{8600},
targetUtilization: 4400,
expectedUtilization: 8600,
selector: &metav1.LabelSelector{MatchLabels: map[string]string{"label": "value"}},
},
}
tc.runTest(t)
}
func TestReplicaCalcScaleUpCMExternalNoLabels(t *testing.T) { func TestReplicaCalcScaleUpCMExternalNoLabels(t *testing.T) {
tc := replicaCalcTestCase{ tc := replicaCalcTestCase{
currentReplicas: 1, currentReplicas: 1,