Merge pull request #130416 from macsko/add_backoff_expiration

Store Pod backoff expiration time in QueuedPodInfo
This commit is contained in:
Kubernetes Prow Robot 2025-03-06 05:09:45 -08:00 committed by GitHub
commit 7c78041218
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 15 additions and 7 deletions

View File

@ -20,6 +20,7 @@ import (
"container/list"
"fmt"
"sync"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
@ -252,6 +253,7 @@ func (aq *activeQueue) unlockedPop(logger klog.Logger) (*framework.QueuedPodInfo
return nil, err
}
pInfo.Attempts++
pInfo.BackoffExpiration = time.Time{}
// In flight, no concurrent events yet.
if aq.isSchedulingQueueHintEnabled {
// If the pod is already in the map, we shouldn't overwrite the inFlightPods otherwise it'd lead to a memory leak.

View File

@ -32,8 +32,6 @@ type backoffQueuer interface {
// isPodBackingoff returns true if a pod is still waiting for its backoff timer.
// If this returns true, the pod should not be re-tried.
isPodBackingoff(podInfo *framework.QueuedPodInfo) bool
// getBackoffTime returns the time that podInfo completes backoff
getBackoffTime(podInfo *framework.QueuedPodInfo) time.Time
// popEachBackoffCompleted runs fn for all pods from podBackoffQ and podErrorBackoffQ that completed backoff while popping them.
popEachBackoffCompleted(logger klog.Logger, fn func(pInfo *framework.QueuedPodInfo))
@ -113,11 +111,17 @@ func (bq *backoffQueue) isPodBackingoff(podInfo *framework.QueuedPodInfo) bool {
return boTime.After(bq.clock.Now())
}
// getBackoffTime returns the time that podInfo completes backoff
// getBackoffTime returns the time that podInfo completes backoff.
// It caches the result in podInfo.BackoffExpiration and returns this value in subsequent calls.
// The cache will be cleared when this pod is popped from the scheduling queue again (i.e., at activeQ's pop),
// because the backoff time is calculated based on podInfo.Attempts,
// which doesn't get changed until the pod's scheduling is retried.
func (bq *backoffQueue) getBackoffTime(podInfo *framework.QueuedPodInfo) time.Time {
// NOTE(review): this diff rendering appears to show the removed body (the next three lines)
// followed by the added body; confirm against the commit before treating this as one function body.
duration := bq.calculateBackoffDuration(podInfo)
backoffTime := podInfo.Timestamp.Add(duration)
return backoffTime
// A zero BackoffExpiration means nothing is cached yet: compute the expiration once
// from podInfo.Timestamp plus the calculated backoff duration and store it on podInfo.
if podInfo.BackoffExpiration.IsZero() {
duration := bq.calculateBackoffDuration(podInfo)
podInfo.BackoffExpiration = podInfo.Timestamp.Add(duration)
}
return podInfo.BackoffExpiration
}
// calculateBackoffDuration is a helper function for calculating the backoffDuration

View File

@ -3640,7 +3640,7 @@ func TestBackOffFlow(t *testing.T) {
}
// Check backoff duration.
deadline := q.backoffQ.getBackoffTime(podInfo)
deadline := podInfo.BackoffExpiration
backoff := deadline.Sub(timestamp)
if backoff != step.wantBackoff {
t.Errorf("got backoff %s, want %s", backoff, step.wantBackoff)

View File

@ -366,6 +366,8 @@ type QueuedPodInfo struct {
// Number of schedule attempts before successfully scheduled.
// It's used to record the # attempts metric and calculate the backoff time this Pod is obliged to get before retrying.
Attempts int
// BackoffExpiration is the time when the Pod will complete its backoff.
BackoffExpiration time.Time
// The time when the pod is added to the queue for the first time. The pod may be added
// back to the queue multiple times before it's successfully scheduled.
// It shouldn't be updated once initialized. It's used to record the e2e scheduling