diff --git a/pkg/controller/controller_utils.go b/pkg/controller/controller_utils.go
index 8ca76a2b6f2..9c623e5d047 100644
--- a/pkg/controller/controller_utils.go
+++ b/pkg/controller/controller_utils.go
@@ -22,6 +22,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"hash/fnv"
+	"math"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -807,6 +808,10 @@ func (s ActivePods) Less(i, j int) bool {
 // 8. If the pods' creation times differ, the pod that was created more recently
 // comes before the older pod.
 //
+// In 6 and 8, times are compared in a logarithmic scale. This allows a level
+// of randomness among equivalent Pods when sorting. If two pods have the same
+// logarithmic rank, they are sorted by UID to provide a pseudorandom order.
+//
 // If none of these rules matches, the second pod comes before the first pod.
 //
 // The intention of this ordering is to put pods that should be preferred for
@@ -819,6 +824,10 @@ type ActivePodsWithRanks struct {
 	// comparing two pods that are both scheduled, in the same phase, and
 	// having the same ready status.
 	Rank []int
+
+	// Now is a reference timestamp for doing logarithmic timestamp comparisons.
+	// If zero, comparison happens without scaling.
+	Now metav1.Time
 }
 
 func (s ActivePodsWithRanks) Len() int {
@@ -872,7 +881,18 @@ func (s ActivePodsWithRanks) Less(i, j int) bool {
 		readyTime1 := podReadyTime(s.Pods[i])
 		readyTime2 := podReadyTime(s.Pods[j])
 		if !readyTime1.Equal(readyTime2) {
-			return afterOrZero(readyTime1, readyTime2)
+			if !utilfeature.DefaultFeatureGate.Enabled(features.LogarithmicScaleDown) {
+				return afterOrZero(readyTime1, readyTime2)
+			} else {
+				if s.Now.IsZero() || readyTime1.IsZero() || readyTime2.IsZero() {
+					return afterOrZero(readyTime1, readyTime2)
+				}
+				rankDiff := logarithmicRankDiff(*readyTime1, *readyTime2, s.Now)
+				if rankDiff == 0 {
+					return s.Pods[i].UID < s.Pods[j].UID
+				}
+				return rankDiff < 0
+			}
 		}
 	}
 	// 7. Pods with containers with higher restart counts < lower restart counts
@@ -881,7 +901,18 @@ func (s ActivePodsWithRanks) Less(i, j int) bool {
 	}
 	// 8. Empty creation time pods < newer pods < older pods
 	if !s.Pods[i].CreationTimestamp.Equal(&s.Pods[j].CreationTimestamp) {
-		return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp)
+		if !utilfeature.DefaultFeatureGate.Enabled(features.LogarithmicScaleDown) {
+			return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp)
+		} else {
+			if s.Now.IsZero() || s.Pods[i].CreationTimestamp.IsZero() || s.Pods[j].CreationTimestamp.IsZero() {
+				return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp)
+			}
+			rankDiff := logarithmicRankDiff(s.Pods[i].CreationTimestamp, s.Pods[j].CreationTimestamp, s.Now)
+			if rankDiff == 0 {
+				return s.Pods[i].UID < s.Pods[j].UID
+			}
+			return rankDiff < 0
+		}
 	}
 	return false
 }
@@ -895,6 +926,22 @@ func afterOrZero(t1, t2 *metav1.Time) bool {
 	return t1.After(t2.Time)
 }
 
+// logarithmicRankDiff calculates the base-2 logarithmic ranks of 2 timestamps
+// relative to the current timestamp.
+func logarithmicRankDiff(t1, t2, now metav1.Time) int64 {
+	d1 := now.Sub(t1.Time)
+	d2 := now.Sub(t2.Time)
+	r1 := int64(-1)
+	r2 := int64(-1)
+	if d1 > 0 {
+		r1 = int64(math.Log2(float64(d1)))
+	}
+	if d2 > 0 {
+		r2 = int64(math.Log2(float64(d2)))
+	}
+	return r1 - r2
+}
+
 func podReadyTime(pod *v1.Pod) *metav1.Time {
 	if podutil.IsPodReady(pod) {
 		for _, c := range pod.Status.Conditions {
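To make the new ordering concrete: time.Duration is an int64 nanosecond count, so logarithmicRankDiff effectively buckets each pod by floor(log2(age in nanoseconds)) and compares buckets rather than raw timestamps. The following standalone sketch (not part of the patch; it reimplements the bucketing purely for illustration) shows why, in the tests below, a 2-hour-old pod sorts before a 5-hour-old one, while 5-hour-old and 8-hour-old pods land in the same bucket and fall through to the UID tie-break:

```go
package main

import (
	"fmt"
	"math"
	"time"
)

// rank mirrors the patch's per-pod bucketing: floor(log2(age in nanoseconds)),
// or -1 for non-positive ages.
func rank(age time.Duration) int64 {
	if age <= 0 {
		return -1
	}
	return int64(math.Log2(float64(age)))
}

func main() {
	for _, age := range []time.Duration{2 * time.Hour, 5 * time.Hour, 8 * time.Hour} {
		fmt.Printf("age %v -> rank %d\n", age, rank(age))
	}
	// Output:
	// age 2h0m0s -> rank 42
	// age 5h0m0s -> rank 44
	// age 8h0m0s -> rank 44
}
```

Because buckets double in width as pods age, recently created pods are still distinguished sharply (newest deleted first), while long-running pods of similar age become interchangeable and are picked pseudorandomly via UID.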
diff --git a/pkg/controller/controller_utils_test.go b/pkg/controller/controller_utils_test.go
index 4f0d563644e..e5139c60d3f 100644
--- a/pkg/controller/controller_utils_test.go
+++ b/pkg/controller/controller_utils_test.go
@@ -437,7 +437,10 @@ func TestSortingActivePods(t *testing.T) {
 
 func TestSortingActivePodsWithRanks(t *testing.T) {
 	now := metav1.Now()
-	then := metav1.Time{Time: now.AddDate(0, -1, 0)}
+	then1Month := metav1.Time{Time: now.AddDate(0, -1, 0)}
+	then2Hours := metav1.Time{Time: now.Add(-2 * time.Hour)}
+	then5Hours := metav1.Time{Time: now.Add(-5 * time.Hour)}
+	then8Hours := metav1.Time{Time: now.Add(-8 * time.Hour)}
 	zeroTime := metav1.Time{}
 	pod := func(podName, nodeName string, phase v1.PodPhase, ready bool, restarts int32, readySince metav1.Time, created metav1.Time, annotations map[string]string) *v1.Pod {
 		var conditions []v1.PodCondition
@@ -467,30 +470,46 @@ func TestSortingActivePodsWithRanks(t *testing.T) {
 		runningNotReadyPod                  = pod("not-ready", "node", v1.PodRunning, false, 0, zeroTime, zeroTime, nil)
 		runningReadyNoLastTransitionTimePod = pod("ready-no-last-transition-time", "node", v1.PodRunning, true, 0, zeroTime, zeroTime, nil)
 		runningReadyNow                     = pod("ready-now", "node", v1.PodRunning, true, 0, now, now, nil)
-		runningReadyThen                    = pod("ready-then", "node", v1.PodRunning, true, 0, then, then, nil)
+		runningReadyThen                    = pod("ready-then", "node", v1.PodRunning, true, 0, then1Month, then1Month, nil)
 		runningReadyNowHighRestarts         = pod("ready-high-restarts", "node", v1.PodRunning, true, 9001, now, now, nil)
-		runningReadyNowCreatedThen          = pod("ready-now-created-then", "node", v1.PodRunning, true, 0, now, then, nil)
-		lowPodDeletionCost                  = pod("low-deletion-cost", "node", v1.PodRunning, true, 0, now, then, map[string]string{core.PodDeletionCost: "10"})
-		highPodDeletionCost                 = pod("high-deletion-cost", "node", v1.PodRunning, true, 0, now, then, map[string]string{core.PodDeletionCost: "100"})
+		runningReadyNowCreatedThen          = pod("ready-now-created-then", "node", v1.PodRunning, true, 0, now, then1Month, nil)
+		lowPodDeletionCost                  = pod("low-deletion-cost", "node", v1.PodRunning, true, 0, now, then1Month, map[string]string{core.PodDeletionCost: "10"})
+		highPodDeletionCost                 = pod("high-deletion-cost", "node", v1.PodRunning, true, 0, now, then1Month, map[string]string{core.PodDeletionCost: "100"})
+		unscheduled5Hours                   = pod("unscheduled-5-hours", "", v1.PodPending, false, 0, then5Hours, then5Hours, nil)
+		unscheduled8Hours                   = pod("unscheduled-8-hours", "", v1.PodPending, false, 0, then8Hours, then8Hours, nil)
+		ready2Hours                         = pod("ready-2-hours", "", v1.PodRunning, true, 0, then2Hours, then1Month, nil)
+		ready5Hours                         = pod("ready-5-hours", "", v1.PodRunning, true, 0, then5Hours, then1Month, nil)
+		ready8Hours                         = pod("ready-8-hours", "", v1.PodRunning, true, 0, then8Hours, then1Month, nil)
 	)
-	equalityTests := []*v1.Pod{
-		unscheduledPod,
-		scheduledPendingPod,
-		unknownPhasePod,
-		runningNotReadyPod,
-		runningReadyNowCreatedThen,
-		runningReadyNow,
-		runningReadyThen,
-		runningReadyNowHighRestarts,
-		runningReadyNowCreatedThen,
+	equalityTests := []struct {
+		p1                          *v1.Pod
+		p2                          *v1.Pod
+		disableLogarithmicScaleDown bool
+	}{
+		{p1: unscheduledPod},
+		{p1: scheduledPendingPod},
+		{p1: unknownPhasePod},
+		{p1: runningNotReadyPod},
+		{p1: runningReadyNowCreatedThen},
+		{p1: runningReadyNow},
+		{p1: runningReadyThen},
+		{p1: runningReadyNowHighRestarts},
+		{p1: runningReadyNowCreatedThen},
+		{p1: unscheduled5Hours, p2: unscheduled8Hours},
+		{p1: ready5Hours, p2: ready8Hours},
 	}
-	for _, pod := range equalityTests {
+	for _, tc := range equalityTests {
+		defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.LogarithmicScaleDown, !tc.disableLogarithmicScaleDown)()
+		if tc.p2 == nil {
+			tc.p2 = tc.p1
+		}
 		podsWithRanks := ActivePodsWithRanks{
-			Pods: []*v1.Pod{pod, pod},
+			Pods: []*v1.Pod{tc.p1, tc.p2},
 			Rank: []int{1, 1},
+			Now:  now,
 		}
 		if podsWithRanks.Less(0, 1) || podsWithRanks.Less(1, 0) {
-			t.Errorf("expected pod %q not to be less than than itself", pod.Name)
+			t.Errorf("expected pod %q to be equivalent to %q", tc.p1.Name, tc.p2.Name)
 		}
 	}
 	type podWithRank struct {
@@ -498,8 +517,9 @@ func TestSortingActivePodsWithRanks(t *testing.T) {
 		rank int
 	}
 	inequalityTests := []struct {
-		lesser, greater        podWithRank
-		disablePodDeletioncost bool
+		lesser, greater             podWithRank
+		disablePodDeletioncost      bool
+		disableLogarithmicScaleDown bool
 	}{
 		{lesser: podWithRank{unscheduledPod, 1}, greater: podWithRank{scheduledPendingPod, 2}},
 		{lesser: podWithRank{unscheduledPod, 2}, greater: podWithRank{scheduledPendingPod, 1}},
@@ -516,14 +536,17 @@ func TestSortingActivePodsWithRanks(t *testing.T) {
 		{lesser: podWithRank{runningReadyNowCreatedThen, 2}, greater: podWithRank{runningReadyNow, 1}},
 		{lesser: podWithRank{lowPodDeletionCost, 2}, greater: podWithRank{highPodDeletionCost, 1}},
 		{lesser: podWithRank{highPodDeletionCost, 2}, greater: podWithRank{lowPodDeletionCost, 1}, disablePodDeletioncost: true},
+		{lesser: podWithRank{ready2Hours, 1}, greater: podWithRank{ready5Hours, 1}},
 	}
 
 	for i, test := range inequalityTests {
 		t.Run(fmt.Sprintf("test%d", i), func(t *testing.T) {
 			defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.PodDeletionCost, !test.disablePodDeletioncost)()
+			defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.LogarithmicScaleDown, !test.disableLogarithmicScaleDown)()
			podsWithRanks := ActivePodsWithRanks{
 				Pods: []*v1.Pod{test.lesser.pod, test.greater.pod},
 				Rank: []int{test.lesser.rank, test.greater.rank},
+				Now:  now,
 			}
 			if !podsWithRanks.Less(0, 1) {
 				t.Errorf("expected pod %q with rank %v to be less than %q with rank %v", podsWithRanks.Pods[0].Name, podsWithRanks.Rank[0], podsWithRanks.Pods[1].Name, podsWithRanks.Rank[1])
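A note on why the two new equality cases pass even though the pods are distinct: per the bucketing sketch above, the 5-hour-old and 8-hour-old pods share logarithmic rank 44, so Less falls through to the UID comparison; the pod helper leaves UID unset on both, so neither sorts before the other and the pair counts as equivalent. Also worth observing (a review-level nit, not a correctness bug for this test) that the gate-restore func from SetFeatureGateDuringTest is deferred inside the loop, so restores accumulate until the whole test returns.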
diff --git a/pkg/controller/replicaset/replica_set.go b/pkg/controller/replicaset/replica_set.go
index f52eab6ca92..cffa2df303e 100644
--- a/pkg/controller/replicaset/replica_set.go
+++ b/pkg/controller/replicaset/replica_set.go
@@ -822,7 +822,7 @@ func getPodsRankedByRelatedPodsOnSameNode(podsToRank, relatedPods []*v1.Pod) con
 	for i, pod := range podsToRank {
 		ranks[i] = podsOnNode[pod.Spec.NodeName]
 	}
-	return controller.ActivePodsWithRanks{Pods: podsToRank, Rank: ranks}
+	return controller.ActivePodsWithRanks{Pods: podsToRank, Rank: ranks, Now: metav1.Now()}
 }
 
 func getPodKeys(pods []*v1.Pod) []string {
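For context, this constructor feeds the controller's scale-down path, which sorts the ranked pods and deletes from the front. A rough sketch of that consumption (pickPodsToDelete is a hypothetical name; upstream this logic lives in the controller's getPodsToDelete), highlighting that Now is captured once per scale-down so every pairwise comparison within one sort shares the same reference timestamp:

```go
package replicaset

import (
	"sort"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/pkg/controller"
)

// pickPodsToDelete (hypothetical) sorts the ranked pods in place and takes
// the first n as deletion candidates.
func pickPodsToDelete(pods []*v1.Pod, ranks []int, n int) []*v1.Pod {
	ranked := controller.ActivePodsWithRanks{
		Pods: pods,
		Rank: ranks,
		// Captured once, so all comparisons in this sort use the same "now".
		Now: metav1.Now(),
	}
	sort.Sort(ranked)
	return ranked.Pods[:n]
}
```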
diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go
index 3f43794863f..1aa4a3e6ea8 100644
--- a/pkg/features/kube_features.go
+++ b/pkg/features/kube_features.go
@@ -692,6 +692,12 @@ const (
 	//
 	// Enable support multiple Service "type: LoadBalancer" implementations in a cluster by specifying LoadBalancerClass
 	ServiceLoadBalancerClass featuregate.Feature = "ServiceLoadBalancerClass"
+
+	// owner: @damemi
+	// alpha: v1.21
+	//
+	// Enables scaling down replicas via logarithmic comparison of creation/ready timestamps
+	LogarithmicScaleDown featuregate.Feature = "LogarithmicScaleDown"
 )
 
 func init() {
@@ -797,6 +803,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
 	PodDeletionCost:                {Default: false, PreRelease: featuregate.Alpha},
 	PodAffinityNamespaceSelector:   {Default: false, PreRelease: featuregate.Alpha},
 	ServiceLoadBalancerClass:       {Default: false, PreRelease: featuregate.Alpha},
+	LogarithmicScaleDown:           {Default: false, PreRelease: featuregate.Alpha},
 
 	// inherited features from generic apiserver, relisted here to get a conflict if it is changed
 	// unintentionally on either side:
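Since the gate defaults to off in this alpha release, exercising the new sort outside of tests requires enabling it on the component that runs the replica controllers, via the standard feature-gate flag, e.g. kube-controller-manager --feature-gates=LogarithmicScaleDown=true.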
diff --git a/test/integration/replicationcontroller/replicationcontroller_test.go b/test/integration/replicationcontroller/replicationcontroller_test.go
index 90d7a09ec1f..99cda73552c 100644
--- a/test/integration/replicationcontroller/replicationcontroller_test.go
+++ b/test/integration/replicationcontroller/replicationcontroller_test.go
@@ -25,19 +25,23 @@ import (
 	"time"
 
 	"k8s.io/api/core/v1"
+	apiequality "k8s.io/apimachinery/pkg/api/equality"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/apimachinery/pkg/util/wait"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	"k8s.io/client-go/informers"
 	clientset "k8s.io/client-go/kubernetes"
 	typedv1 "k8s.io/client-go/kubernetes/typed/core/v1"
 	restclient "k8s.io/client-go/rest"
 	"k8s.io/client-go/tools/cache"
 	"k8s.io/client-go/util/retry"
+	featuregatetesting "k8s.io/component-base/featuregate/testing"
 	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
 	"k8s.io/kubernetes/pkg/controller/replication"
+	"k8s.io/kubernetes/pkg/features"
 	"k8s.io/kubernetes/test/integration/framework"
 )
 
@@ -491,6 +495,44 @@ func TestSpecReplicasChange(t *testing.T) {
 	}
 }
 
+func TestLogarithmicScaleDown(t *testing.T) {
+	defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.LogarithmicScaleDown, true)()
+	s, closeFn, rm, informers, c := rmSetup(t)
+	defer closeFn()
+	ns := framework.CreateTestingNamespace("test-logarithmic-scale-down", s, t)
+	defer framework.DeleteTestingNamespace(ns, s, t)
+	stopCh := runControllerAndInformers(t, rm, informers, 0)
+	defer close(stopCh)
+
+	rc := newRC("rc", ns.Name, 2)
+	rcs, _ := createRCsPods(t, c, []*v1.ReplicationController{rc}, []*v1.Pod{})
+	rc = rcs[0]
+	waitRCStable(t, c, rc)
+
+	// get the list of pods in the cluster
+	pods, err := c.CoreV1().Pods(ns.Name).List(context.TODO(), metav1.ListOptions{})
+	if err != nil {
+		t.Fatalf("failed to get pods in namespace %s: %+v", ns.Name, err)
+	}
+
+	// Wait 10 seconds and scale up; the new pod should land in a new logarithmic rank from the first 2
+	time.Sleep(10 * time.Second)
+	scaleRC(t, c, rc, 3)
+
+	// Scale back down, and confirm that the pods left in the namespace are the original ones
+	// (meaning the 3rd, newest one was deleted)
+	scaleRC(t, c, rc, 2)
+
+	newPods, err := c.CoreV1().Pods(ns.Name).List(context.TODO(), metav1.ListOptions{})
+	if err != nil {
+		t.Fatalf("failed to get pods in namespace %s: %+v", ns.Name, err)
+	}
+
+	if !apiequality.Semantic.DeepEqual(pods.Items, newPods.Items) {
+		t.Fatalf("expected pods %+v, got %+v", pods.Items, newPods.Items)
+	}
+}
+
 func TestDeletingAndFailedPods(t *testing.T) {
 	s, closeFn, rm, informers, c := rmSetup(t)
 	defer closeFn()
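One timing subtlety in this integration test: the 10-second sleep is what guarantees the third pod lands in a different logarithmic bucket than the first two, so the deterministic "delete the newest" behavior kicks in rather than the UID tie-break. A small sketch with assumed ages (the exact values depend on test timing; this reuses the same bucketing as the earlier example):

```go
package main

import (
	"fmt"
	"math"
	"time"
)

// bucket mirrors the patch's per-pod rank: floor(log2(age in nanoseconds)).
func bucket(age time.Duration) int64 {
	if age <= 0 {
		return -1
	}
	return int64(math.Log2(float64(age)))
}

func main() {
	// Assumed ages at scale-down time: originals ~11s old, new pod ~500ms old.
	fmt.Println(bucket(11*time.Second), bucket(500*time.Millisecond)) // 33 28
}
```

If the ages ever fell into the same bucket, the UID tie-break could instead delete one of the original pods, and the DeepEqual assertion would fail; the sleep keeps the buckets several powers of two apart.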