Merge pull request #99212 from damemi/alculquicondor-log-timestamp

Logarithmic timestamp comparison for downscaling
Kubernetes Prow Robot 2021-03-06 09:47:41 -08:00 committed by GitHub
commit bf448a1eaa
5 changed files with 142 additions and 23 deletions
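In short: when scaling down, the replica controllers previously always preferred the most recently created or most recently ready pods. Behind the new LogarithmicScaleDown alpha feature gate, timestamps are instead compared on a base-2 logarithmic scale: pods whose ages fall into the same power-of-two bucket are treated as equivalent, and ties are broken pseudorandomly by UID, so deletions spread across replicas that were started at roughly the same time. A minimal standalone sketch of the bucketing (the logRank helper below is illustrative only, not code from this patch):

package main

import (
	"fmt"
	"math"
	"time"
)

// logRank buckets a pod's age into a base-2 logarithmic rank; a larger rank
// means an older bucket. Zero or future timestamps get the lowest rank.
func logRank(created, now time.Time) int64 {
	if d := now.Sub(created); d > 0 {
		return int64(math.Log2(float64(d)))
	}
	return -1
}

func main() {
	now := time.Now()
	// A pod started 10 seconds ago sits in a much lower bucket than a
	// month-old pod, so very recent replicas are still deleted first.
	fmt.Println(logRank(now.Add(-10*time.Second), now)) // 33
	fmt.Println(logRank(now.AddDate(0, -1, 0), now))    // 51
}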


@@ -22,6 +22,7 @@ import (
"encoding/json"
"fmt"
"hash/fnv"
"math"
"sync"
"sync/atomic"
"time"
@@ -807,6 +808,10 @@ func (s ActivePods) Less(i, j int) bool {
// 8. If the pods' creation times differ, the pod that was created more recently
// comes before the older pod.
//
// In 6 and 8, times are compared on a logarithmic scale. This allows a level
// of randomness among equivalent Pods when sorting. If two pods have the same
// logarithmic rank, they are sorted by UUID to provide a pseudorandom order.
//
// If none of these rules matches, the second pod comes before the first pod.
//
// The intention of this ordering is to put pods that should be preferred for
@@ -819,6 +824,10 @@ type ActivePodsWithRanks struct {
// comparing two pods that are both scheduled, in the same phase, and
// having the same ready status.
Rank []int
// Now is a reference timestamp for doing logarithmic timestamp comparisons.
// If zero, comparison happens without scaling.
Now metav1.Time
}
func (s ActivePodsWithRanks) Len() int {
@@ -872,7 +881,18 @@ func (s ActivePodsWithRanks) Less(i, j int) bool {
readyTime1 := podReadyTime(s.Pods[i])
readyTime2 := podReadyTime(s.Pods[j])
if !readyTime1.Equal(readyTime2) {
return afterOrZero(readyTime1, readyTime2)
if !utilfeature.DefaultFeatureGate.Enabled(features.LogarithmicScaleDown) {
return afterOrZero(readyTime1, readyTime2)
} else {
if s.Now.IsZero() || readyTime1.IsZero() || readyTime2.IsZero() {
return afterOrZero(readyTime1, readyTime2)
}
rankDiff := logarithmicRankDiff(*readyTime1, *readyTime2, s.Now)
if rankDiff == 0 {
return s.Pods[i].UID < s.Pods[j].UID
}
return rankDiff < 0
}
}
}
// 7. Pods with containers with higher restart counts < lower restart counts
@@ -881,7 +901,18 @@ func (s ActivePodsWithRanks) Less(i, j int) bool {
}
// 8. Empty creation time pods < newer pods < older pods
if !s.Pods[i].CreationTimestamp.Equal(&s.Pods[j].CreationTimestamp) {
return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp)
if !utilfeature.DefaultFeatureGate.Enabled(features.LogarithmicScaleDown) {
return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp)
} else {
if s.Now.IsZero() || s.Pods[i].CreationTimestamp.IsZero() || s.Pods[j].CreationTimestamp.IsZero() {
return afterOrZero(&s.Pods[i].CreationTimestamp, &s.Pods[j].CreationTimestamp)
}
rankDiff := logarithmicRankDiff(s.Pods[i].CreationTimestamp, s.Pods[j].CreationTimestamp, s.Now)
if rankDiff == 0 {
return s.Pods[i].UID < s.Pods[j].UID
}
return rankDiff < 0
}
}
return false
}
@@ -895,6 +926,22 @@ func afterOrZero(t1, t2 *metav1.Time) bool {
return t1.After(t2.Time)
}
// logarithmicRankDiff calculates the difference between the base-2 logarithmic
// ranks of 2 timestamps, each measured relative to the current timestamp
func logarithmicRankDiff(t1, t2, now metav1.Time) int64 {
d1 := now.Sub(t1.Time)
d2 := now.Sub(t2.Time)
r1 := int64(-1)
r2 := int64(-1)
if d1 > 0 {
r1 = int64(math.Log2(float64(d1)))
}
if d2 > 0 {
r2 = int64(math.Log2(float64(d2)))
}
return r1 - r2
}
func podReadyTime(pod *v1.Pod) *metav1.Time {
if podutil.IsPodReady(pod) {
for _, c := range pod.Status.Conditions {

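For orientation before the tests below: logarithmicRankDiff returns the difference between the two pods' base-2 age buckets relative to Now. A negative result means the first pod sits in a younger bucket and sorts earlier (so it is the preferred scale-down victim); zero means the pods share a bucket and Less falls back to the UID tie-break. A small standalone sketch using the same durations as the updated unit test (the rankDiff helper here is illustrative, not the controller code):

package main

import (
	"fmt"
	"math"
	"time"
)

// rankDiff mirrors logarithmicRankDiff for plain time.Time values.
func rankDiff(t1, t2, now time.Time) int64 {
	rank := func(t time.Time) int64 {
		if d := now.Sub(t); d > 0 {
			return int64(math.Log2(float64(d)))
		}
		return -1
	}
	return rank(t1) - rank(t2)
}

func main() {
	now := time.Now()
	// 2h old vs 5h old: buckets 42 vs 44, so the younger pod deterministically
	// sorts first (negative diff).
	fmt.Println(rankDiff(now.Add(-2*time.Hour), now.Add(-5*time.Hour), now)) // -2
	// 5h old vs 8h old: both land in bucket 44, so ordering falls back to the UID tie-break.
	fmt.Println(rankDiff(now.Add(-5*time.Hour), now.Add(-8*time.Hour), now)) // 0
}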

@@ -437,7 +437,10 @@ func TestSortingActivePods(t *testing.T) {
func TestSortingActivePodsWithRanks(t *testing.T) {
now := metav1.Now()
then := metav1.Time{Time: now.AddDate(0, -1, 0)}
then1Month := metav1.Time{Time: now.AddDate(0, -1, 0)}
then2Hours := metav1.Time{Time: now.Add(-2 * time.Hour)}
then5Hours := metav1.Time{Time: now.Add(-5 * time.Hour)}
then8Hours := metav1.Time{Time: now.Add(-8 * time.Hour)}
zeroTime := metav1.Time{}
pod := func(podName, nodeName string, phase v1.PodPhase, ready bool, restarts int32, readySince metav1.Time, created metav1.Time, annotations map[string]string) *v1.Pod {
var conditions []v1.PodCondition
@@ -467,30 +470,46 @@ func TestSortingActivePodsWithRanks(t *testing.T) {
runningNotReadyPod = pod("not-ready", "node", v1.PodRunning, false, 0, zeroTime, zeroTime, nil)
runningReadyNoLastTransitionTimePod = pod("ready-no-last-transition-time", "node", v1.PodRunning, true, 0, zeroTime, zeroTime, nil)
runningReadyNow = pod("ready-now", "node", v1.PodRunning, true, 0, now, now, nil)
runningReadyThen = pod("ready-then", "node", v1.PodRunning, true, 0, then, then, nil)
runningReadyThen = pod("ready-then", "node", v1.PodRunning, true, 0, then1Month, then1Month, nil)
runningReadyNowHighRestarts = pod("ready-high-restarts", "node", v1.PodRunning, true, 9001, now, now, nil)
runningReadyNowCreatedThen = pod("ready-now-created-then", "node", v1.PodRunning, true, 0, now, then, nil)
lowPodDeletionCost = pod("low-deletion-cost", "node", v1.PodRunning, true, 0, now, then, map[string]string{core.PodDeletionCost: "10"})
highPodDeletionCost = pod("high-deletion-cost", "node", v1.PodRunning, true, 0, now, then, map[string]string{core.PodDeletionCost: "100"})
runningReadyNowCreatedThen = pod("ready-now-created-then", "node", v1.PodRunning, true, 0, now, then1Month, nil)
lowPodDeletionCost = pod("low-deletion-cost", "node", v1.PodRunning, true, 0, now, then1Month, map[string]string{core.PodDeletionCost: "10"})
highPodDeletionCost = pod("high-deletion-cost", "node", v1.PodRunning, true, 0, now, then1Month, map[string]string{core.PodDeletionCost: "100"})
unscheduled5Hours = pod("unscheduled-5-hours", "", v1.PodPending, false, 0, then5Hours, then5Hours, nil)
unscheduled8Hours = pod("unscheduled-10-hours", "", v1.PodPending, false, 0, then8Hours, then8Hours, nil)
ready2Hours = pod("ready-2-hours", "", v1.PodRunning, true, 0, then2Hours, then1Month, nil)
ready5Hours = pod("ready-5-hours", "", v1.PodRunning, true, 0, then5Hours, then1Month, nil)
ready10Hours = pod("ready-10-hours", "", v1.PodRunning, true, 0, then8Hours, then1Month, nil)
)
equalityTests := []*v1.Pod{
unscheduledPod,
scheduledPendingPod,
unknownPhasePod,
runningNotReadyPod,
runningReadyNowCreatedThen,
runningReadyNow,
runningReadyThen,
runningReadyNowHighRestarts,
runningReadyNowCreatedThen,
equalityTests := []struct {
p1 *v1.Pod
p2 *v1.Pod
disableLogarithmicScaleDown bool
}{
{p1: unscheduledPod},
{p1: scheduledPendingPod},
{p1: unknownPhasePod},
{p1: runningNotReadyPod},
{p1: runningReadyNowCreatedThen},
{p1: runningReadyNow},
{p1: runningReadyThen},
{p1: runningReadyNowHighRestarts},
{p1: runningReadyNowCreatedThen},
{p1: unscheduled5Hours, p2: unscheduled8Hours},
{p1: ready5Hours, p2: ready8Hours},
}
for _, pod := range equalityTests {
for _, tc := range equalityTests {
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.LogarithmicScaleDown, !tc.disableLogarithmicScaleDown)()
if tc.p2 == nil {
tc.p2 = tc.p1
}
podsWithRanks := ActivePodsWithRanks{
Pods: []*v1.Pod{pod, pod},
Pods: []*v1.Pod{tc.p1, tc.p2},
Rank: []int{1, 1},
Now: now,
}
if podsWithRanks.Less(0, 1) || podsWithRanks.Less(1, 0) {
t.Errorf("expected pod %q not to be less than than itself", pod.Name)
t.Errorf("expected pod %q to be equivalent to %q", tc.p1.Name, tc.p2.Name)
}
}
type podWithRank struct {
@@ -498,8 +517,9 @@ func TestSortingActivePodsWithRanks(t *testing.T) {
rank int
}
inequalityTests := []struct {
lesser, greater podWithRank
disablePodDeletioncost bool
lesser, greater podWithRank
disablePodDeletioncost bool
disableLogarithmicScaleDown bool
}{
{lesser: podWithRank{unscheduledPod, 1}, greater: podWithRank{scheduledPendingPod, 2}},
{lesser: podWithRank{unscheduledPod, 2}, greater: podWithRank{scheduledPendingPod, 1}},
@@ -516,14 +536,17 @@ func TestSortingActivePodsWithRanks(t *testing.T) {
{lesser: podWithRank{runningReadyNowCreatedThen, 2}, greater: podWithRank{runningReadyNow, 1}},
{lesser: podWithRank{lowPodDeletionCost, 2}, greater: podWithRank{highPodDeletionCost, 1}},
{lesser: podWithRank{highPodDeletionCost, 2}, greater: podWithRank{lowPodDeletionCost, 1}, disablePodDeletioncost: true},
{lesser: podWithRank{ready2Hours, 1}, greater: podWithRank{ready5Hours, 1}},
}
for i, test := range inequalityTests {
t.Run(fmt.Sprintf("test%d", i), func(t *testing.T) {
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.PodDeletionCost, !test.disablePodDeletioncost)()
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.LogarithmicScaleDown, !test.disableLogarithmicScaleDown)()
podsWithRanks := ActivePodsWithRanks{
Pods: []*v1.Pod{test.lesser.pod, test.greater.pod},
Rank: []int{test.lesser.rank, test.greater.rank},
Now: now,
}
if !podsWithRanks.Less(0, 1) {
t.Errorf("expected pod %q with rank %v to be less than %q with rank %v", podsWithRanks.Pods[0].Name, podsWithRanks.Rank[0], podsWithRanks.Pods[1].Name, podsWithRanks.Rank[1])


@@ -822,7 +822,7 @@ func getPodsRankedByRelatedPodsOnSameNode(podsToRank, relatedPods []*v1.Pod) controller.ActivePodsWithRanks {
for i, pod := range podsToRank {
ranks[i] = podsOnNode[pod.Spec.NodeName]
}
return controller.ActivePodsWithRanks{Pods: podsToRank, Rank: ranks}
return controller.ActivePodsWithRanks{Pods: podsToRank, Rank: ranks, Now: metav1.Now()}
}
func getPodKeys(pods []*v1.Pod) []string {

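The controller seeds the ranking with Now so the comparator can do the logarithmic math. As a rough illustration of how such a ranked list is consumed, sorting it puts the preferred scale-down victims first; the sketch below is hypothetical (the mkPod helper is invented for the example, and it assumes compilation inside the kubernetes repo so that k8s.io/kubernetes/pkg/controller is importable):

package main

import (
	"fmt"
	"sort"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/pkg/controller"
)

func main() {
	now := metav1.Now()
	mkPod := func(name string, age time.Duration) *v1.Pod {
		return &v1.Pod{
			ObjectMeta: metav1.ObjectMeta{
				Name:              name,
				CreationTimestamp: metav1.NewTime(now.Add(-age)),
			},
			Status: v1.PodStatus{Phase: v1.PodRunning},
		}
	}
	podsWithRanks := controller.ActivePodsWithRanks{
		Pods: []*v1.Pod{mkPod("old", 30 * 24 * time.Hour), mkPod("new", 10 * time.Second)},
		Rank: []int{1, 1},
		Now:  now,
	}
	sort.Sort(podsWithRanks)
	// The newer pod sorts first and is therefore the preferred scale-down victim.
	fmt.Println(podsWithRanks.Pods[0].Name) // "new"
}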

@@ -692,6 +692,12 @@ const (
//
// Enable support multiple Service "type: LoadBalancer" implementations in a cluster by specifying LoadBalancerClass
ServiceLoadBalancerClass featuregate.Feature = "ServiceLoadBalancerClass"
// owner: @damemi
// alpha: v1.21
//
// Enables scaling down replicas via logarithmic comparison of creation/ready timestamps
LogarithmicScaleDown featuregate.Feature = "LogarithmicScaleDown"
)
func init() {
@@ -797,6 +803,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
PodDeletionCost: {Default: false, PreRelease: featuregate.Alpha},
PodAffinityNamespaceSelector: {Default: false, PreRelease: featuregate.Alpha},
ServiceLoadBalancerClass: {Default: false, PreRelease: featuregate.Alpha},
LogarithmicScaleDown: {Default: false, PreRelease: featuregate.Alpha},
// inherited features from generic apiserver, relisted here to get a conflict if it is changed
// unintentionally on either side:


@@ -25,19 +25,23 @@ import (
"time"
"k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/apimachinery/pkg/util/wait"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/informers"
clientset "k8s.io/client-go/kubernetes"
typedv1 "k8s.io/client-go/kubernetes/typed/core/v1"
restclient "k8s.io/client-go/rest"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/util/retry"
featuregatetesting "k8s.io/component-base/featuregate/testing"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
"k8s.io/kubernetes/pkg/controller/replication"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/test/integration/framework"
)
@@ -491,6 +495,44 @@ func TestSpecReplicasChange(t *testing.T) {
}
}
func TestLogarithmicScaleDown(t *testing.T) {
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.LogarithmicScaleDown, true)()
s, closeFn, rm, informers, c := rmSetup(t)
defer closeFn()
ns := framework.CreateTestingNamespace("test-spec-replicas-change", s, t)
defer framework.DeleteTestingNamespace(ns, s, t)
stopCh := runControllerAndInformers(t, rm, informers, 0)
defer close(stopCh)
rc := newRC("rc", ns.Name, 2)
rcs, _ := createRCsPods(t, c, []*v1.ReplicationController{rc}, []*v1.Pod{})
rc = rcs[0]
waitRCStable(t, c, rc)
// get list of pods in the cluster
pods, err := c.CoreV1().Pods(ns.Name).List(context.TODO(), metav1.ListOptions{})
if err != nil {
t.Fatalf("failed to get pods in namespace %s: %+v", ns.Name, err)
}
// Wait 10 seconds and scale up; the new pod should land in a different logarithmic rank than the first 2
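// (Roughly, assuming the timings here: a ~10-second-old pod has rank int(log2(10s in ns)) = 33,
// while a pod created only a second or two before the scale-down ranks around 29-31, so the
// newest pod falls in a lower bucket and is always the victim.)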
time.Sleep(10 * time.Second)
scaleRC(t, c, rc, 3)
// scale back down, and confirm that the pods left in the namespace are the original ones
// (meaning the 3rd one was deleted)
scaleRC(t, c, rc, 2)
newPods, err := c.CoreV1().Pods(ns.Name).List(context.TODO(), metav1.ListOptions{})
if err != nil {
t.Fatalf("failed to get pods in namespace %s: %+v", ns.Name, err)
}
if !apiequality.Semantic.DeepEqual(pods.Items, newPods.Items) {
t.Fatalf("expected pods %+v, got %+v", pods.Items, newPods.Items)
}
}
func TestDeletingAndFailedPods(t *testing.T) {
s, closeFn, rm, informers, c := rmSetup(t)
defer closeFn()