Job controller implementation of backoff limit per index (#118009)
This commit is contained in:
parent f55f2785e2
commit a15c27661e
@@ -23,6 +23,7 @@ import (
 
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/client-go/tools/cache"
+	"k8s.io/klog/v2"
 	apipod "k8s.io/kubernetes/pkg/api/v1/pod"
 	"k8s.io/utils/clock"
 	"k8s.io/utils/pointer"
@@ -213,12 +214,31 @@ func getFinishTimeFromDeletionTimestamp(p *v1.Pod) *time.Time {
 }
 
 func (backoff backoffRecord) getRemainingTime(clock clock.WithTicker, defaultBackoff time.Duration, maxBackoff time.Duration) time.Duration {
-	if backoff.failuresAfterLastSuccess == 0 {
+	return getRemainingTimeForFailuresCount(clock, defaultBackoff, maxBackoff, backoff.failuresAfterLastSuccess, backoff.lastFailureTime)
+}
+
+// getRemainingTimePerIndex returns the remaining time left for a given index to
+// create the replacement pods. The number of consecutive pod failures for the
+// index is retrieved from the `job-index-failure-count` annotation of the
+// last failed pod within the index (represented by `lastFailedPod`).
+// The last failed pod is also used to determine the time of the last failure.
+func getRemainingTimePerIndex(logger klog.Logger, clock clock.WithTicker, defaultBackoff time.Duration, maxBackoff time.Duration, lastFailedPod *v1.Pod) time.Duration {
+	if lastFailedPod == nil {
+		// There is no previous failed pod for this index
+		return time.Duration(0)
+	}
+	failureCount := getIndexAbsoluteFailureCount(logger, lastFailedPod) + 1
+	lastFailureTime := getFinishedTime(lastFailedPod)
+	return getRemainingTimeForFailuresCount(clock, defaultBackoff, maxBackoff, failureCount, &lastFailureTime)
+}
+
+func getRemainingTimeForFailuresCount(clock clock.WithTicker, defaultBackoff time.Duration, maxBackoff time.Duration, failuresCount int32, lastFailureTime *time.Time) time.Duration {
+	if failuresCount == 0 {
 		return 0
 	}
 
 	backoffDuration := defaultBackoff
-	for i := 1; i < int(backoff.failuresAfterLastSuccess); i++ {
+	for i := 1; i < int(failuresCount); i++ {
 		backoffDuration = backoffDuration * 2
 		if backoffDuration >= maxBackoff {
 			backoffDuration = maxBackoff
@@ -226,7 +246,7 @@ func (backoff backoffRecord) getRemainingTime(clock clock.WithTicker, defaultBac
 		}
 	}
 
-	timeElapsedSinceLastFailure := clock.Since(*backoff.lastFailureTime)
+	timeElapsedSinceLastFailure := clock.Since(*lastFailureTime)
 
 	if backoffDuration < timeElapsedSinceLastFailure {
 		return 0
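For readers skimming the hunks above, here is a standalone sketch of the back-off rule they introduce: the delay doubles per consecutive failure, is capped at maxBackoff, and the time already elapsed since the last failure is subtracted. This is a hypothetical, simplified helper (the function and parameter names are not part of the commit), not the controller's implementation:

```go
package main

import (
	"fmt"
	"time"
)

// remainingBackoff mirrors the idea of getRemainingTimeForFailuresCount:
// double defaultBackoff per consecutive failure, cap it at maxBackoff, and
// subtract the time that already elapsed since the last failure.
func remainingBackoff(failures int32, sinceLastFailure, defaultBackoff, maxBackoff time.Duration) time.Duration {
	if failures == 0 {
		return 0
	}
	backoff := defaultBackoff
	for i := int32(1); i < failures; i++ {
		backoff *= 2
		if backoff >= maxBackoff {
			backoff = maxBackoff
			break
		}
	}
	if backoff < sinceLastFailure {
		return 0
	}
	return backoff - sinceLastFailure
}

func main() {
	// Three consecutive failures with a 10s base: 10s -> 20s -> 40s; 15s already elapsed.
	fmt.Println(remainingBackoff(3, 15*time.Second, 10*time.Second, 6*time.Minute)) // 25s
}
```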
@@ -23,6 +23,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/klog/v2/ktesting"
 	clocktesting "k8s.io/utils/clock/testing"
 	"k8s.io/utils/pointer"
 )
@@ -466,3 +467,46 @@ func TestGetRemainingBackoffTime(t *testing.T) {
 		})
 	}
 }
+
+func TestGetRemainingBackoffTimePerIndex(t *testing.T) {
+	defaultTestTime := metav1.NewTime(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC))
+	testCases := map[string]struct {
+		currentTime time.Time
+		maxBackoff time.Duration
+		defaultBackoff time.Duration
+		lastFailedPod *v1.Pod
+		wantDuration time.Duration
+	}{
+		"no failures": {
+			lastFailedPod: nil,
+			defaultBackoff: 5 * time.Second,
+			maxBackoff: 700 * time.Second,
+			wantDuration: 0 * time.Second,
+		},
+		"two prev failures; current time and failure time are same": {
+			lastFailedPod: buildPod().phase(v1.PodFailed).indexFailureCount("2").customDeletionTimestamp(defaultTestTime.Time).Pod,
+			currentTime: defaultTestTime.Time,
+			defaultBackoff: 5 * time.Second,
+			maxBackoff: 700 * time.Second,
+			wantDuration: 20 * time.Second,
+		},
+		"one prev failure counted and one ignored; current time and failure time are same": {
+			lastFailedPod: buildPod().phase(v1.PodFailed).indexFailureCount("1").indexIgnoredFailureCount("1").customDeletionTimestamp(defaultTestTime.Time).Pod,
+			currentTime: defaultTestTime.Time,
+			defaultBackoff: 5 * time.Second,
+			maxBackoff: 700 * time.Second,
+			wantDuration: 20 * time.Second,
+		},
+	}
+
+	for name, tc := range testCases {
+		t.Run(name, func(t *testing.T) {
+			logger, _ := ktesting.NewTestContext(t)
+			fakeClock := clocktesting.NewFakeClock(tc.currentTime.Truncate(time.Second))
+			d := getRemainingTimePerIndex(logger, fakeClock, tc.defaultBackoff, tc.maxBackoff, tc.lastFailedPod)
+			if d.Seconds() != tc.wantDuration.Seconds() {
+				t.Errorf("Expected value of duration %v; got %v", tc.wantDuration, d)
+			}
+		})
+	}
+}
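The new test above drives the per-index back-off through a fake clock and the controller's pod builder. As a lighter-weight illustration of the same table-driven pattern, here is a hypothetical test for the simplified remainingBackoff helper sketched earlier (not part of the commit; it avoids any clock by taking the elapsed time directly):

```go
package main

import (
	"testing"
	"time"
)

// TestRemainingBackoff exercises the simplified helper with the same shape of
// table-driven cases as the controller test: failure count, elapsed time, want.
func TestRemainingBackoff(t *testing.T) {
	cases := map[string]struct {
		failures int32
		elapsed  time.Duration
		want     time.Duration
	}{
		"no failures":                 {failures: 0, elapsed: 0, want: 0},
		"two failures, nothing spent": {failures: 2, elapsed: 0, want: 10 * time.Second},
		"backoff already expired":     {failures: 2, elapsed: 30 * time.Second, want: 0},
	}
	for name, tc := range cases {
		t.Run(name, func(t *testing.T) {
			got := remainingBackoff(tc.failures, tc.elapsed, 5*time.Second, 700*time.Second)
			if got != tc.want {
				t.Errorf("remainingBackoff() = %v, want %v", got, tc.want)
			}
		})
	}
}
```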
@@ -18,6 +18,7 @@ package job
 
 import (
 	"fmt"
+	"math"
 	"sort"
 	"strconv"
 	"strings"
@@ -41,6 +42,10 @@ func isIndexedJob(job *batch.Job) bool {
 	return job.Spec.CompletionMode != nil && *job.Spec.CompletionMode == batch.IndexedCompletion
 }
 
+func hasBackoffLimitPerIndex(job *batch.Job) bool {
+	return feature.DefaultFeatureGate.Enabled(features.JobBackoffLimitPerIndex) && job.Spec.BackoffLimitPerIndex != nil
+}
+
 type interval struct {
 	First int
 	Last int
@@ -54,7 +59,7 @@ type orderedIntervals []interval
 // empty list if this Job is not tracked with finalizers. The new list includes
 // the indexes that succeeded since the last sync.
 func calculateSucceededIndexes(logger klog.Logger, job *batch.Job, pods []*v1.Pod) (orderedIntervals, orderedIntervals) {
-	prevIntervals := succeededIndexesFromString(logger, job.Status.CompletedIndexes, int(*job.Spec.Completions))
+	prevIntervals := parseIndexesFromString(logger, job.Status.CompletedIndexes, int(*job.Spec.Completions))
 	newSucceeded := sets.New[int]()
 	for _, p := range pods {
 		ix := getCompletionIndex(p.Annotations)
@@ -69,9 +74,55 @@ func calculateSucceededIndexes(logger klog.Logger, job *batch.Job, pods []*v1.Po
 	return prevIntervals, result
 }
 
+// calculateFailedIndexes returns the list of failed indexes in compressed
+// format (intervals). The list includes indexes already present in
+// .status.failedIndexes and indexes that failed since the last sync.
+func calculateFailedIndexes(logger klog.Logger, job *batch.Job, pods []*v1.Pod) *orderedIntervals {
+	var prevIntervals orderedIntervals
+	if job.Status.FailedIndexes != nil {
+		prevIntervals = parseIndexesFromString(logger, *job.Status.FailedIndexes, int(*job.Spec.Completions))
+	}
+	newFailed := sets.New[int]()
+	for _, p := range pods {
+		ix := getCompletionIndex(p.Annotations)
+		// Failed Pod with valid index and has a finalizer (meaning that it is not counted yet).
+		if ix != unknownCompletionIndex && ix < int(*job.Spec.Completions) && hasJobTrackingFinalizer(p) && isIndexFailed(logger, job, p) {
+			newFailed.Insert(ix)
+		}
+	}
+	// List returns the items of the set in order.
+	result := prevIntervals.withOrderedIndexes(sets.List(newFailed))
+	return &result
+}
+
+func isIndexFailed(logger klog.Logger, job *batch.Job, pod *v1.Pod) bool {
+	isPodFailedCounted := false
+	if isPodFailed(pod, job) {
+		if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil {
+			_, countFailed, action := matchPodFailurePolicy(job.Spec.PodFailurePolicy, pod)
+			if action != nil && *action == batch.PodFailurePolicyActionFailIndex {
+				return true
+			}
+			isPodFailedCounted = countFailed
+		} else {
+			isPodFailedCounted = true
+		}
+	}
+	return isPodFailedCounted && getIndexFailureCount(logger, pod) >= *job.Spec.BackoffLimitPerIndex
+}
+
 // withOrderedIndexes returns a new list of ordered intervals that contains
 // the newIndexes, provided in increasing order.
 func (oi orderedIntervals) withOrderedIndexes(newIndexes []int) orderedIntervals {
+	newIndexIntervals := make(orderedIntervals, len(newIndexes))
+	for i, newIndex := range newIndexes {
+		newIndexIntervals[i] = interval{newIndex, newIndex}
+	}
+	return oi.merge(newIndexIntervals)
+}
+
+// with returns a new list of ordered intervals that contains the newOrderedIntervals.
+func (oi orderedIntervals) merge(newOi orderedIntervals) orderedIntervals {
 	var result orderedIntervals
 	i := 0
 	j := 0
@@ -84,12 +135,12 @@ func (oi orderedIntervals) withOrderedIndexes(newIndexes []int) orderedIntervals
 			lastInterval.Last = thisInterval.Last
 		}
 	}
-	for i < len(oi) && j < len(newIndexes) {
-		if oi[i].First < newIndexes[j] {
+	for i < len(oi) && j < len(newOi) {
+		if oi[i].First < newOi[j].First {
 			appendOrMergeWithLastInterval(oi[i])
 			i++
 		} else {
-			appendOrMergeWithLastInterval(interval{newIndexes[j], newIndexes[j]})
+			appendOrMergeWithLastInterval(newOi[j])
 			j++
 		}
 	}
@@ -97,8 +148,8 @@ func (oi orderedIntervals) withOrderedIndexes(newIndexes []int) orderedIntervals
 		appendOrMergeWithLastInterval(oi[i])
 		i++
 	}
-	for j < len(newIndexes) {
-		appendOrMergeWithLastInterval(interval{newIndexes[j], newIndexes[j]})
+	for j < len(newOi) {
+		appendOrMergeWithLastInterval(newOi[j])
 		j++
 	}
 	return result
@@ -150,19 +201,19 @@ func (oi orderedIntervals) has(ix int) bool {
 	return oi[hi].First <= ix
 }
 
-func succeededIndexesFromString(logger klog.Logger, completedIndexes string, completions int) orderedIntervals {
-	if completedIndexes == "" {
+func parseIndexesFromString(logger klog.Logger, indexesStr string, completions int) orderedIntervals {
+	if indexesStr == "" {
 		return nil
 	}
 	var result orderedIntervals
 	var lastInterval *interval
-	for _, intervalStr := range strings.Split(completedIndexes, ",") {
+	for _, intervalStr := range strings.Split(indexesStr, ",") {
 		limitsStr := strings.Split(intervalStr, "-")
 		var inter interval
 		var err error
 		inter.First, err = strconv.Atoi(limitsStr[0])
 		if err != nil {
-			logger.Info("Corrupted completed indexes interval, ignoring", "interval", intervalStr, "err", err)
+			logger.Info("Corrupted indexes interval, ignoring", "interval", intervalStr, "err", err)
 			continue
 		}
 		if inter.First >= completions {
@@ -171,7 +222,7 @@ func succeededIndexesFromString(logger klog.Logger, completedIndexes string, com
 		if len(limitsStr) > 1 {
 			inter.Last, err = strconv.Atoi(limitsStr[1])
 			if err != nil {
-				logger.Info("Corrupted completed indexes interval, ignoring", "interval", intervalStr, "err", err)
+				logger.Info("Corrupted indexes interval, ignoring", "interval", intervalStr, "err", err)
 				continue
 			}
 			if inter.Last >= completions {
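The two hunks above generalize succeededIndexesFromString into parseIndexesFromString so the same parser can serve .status.completedIndexes and the new .status.failedIndexes. As a rough illustration of that compressed format ("1-3,5,7-9"), here is a hypothetical, self-contained parser that skips malformed entries and intervals starting at or beyond completions, and clamps interval ends; it is a simplification and may differ from the controller's version in edge cases:

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

type interval struct{ First, Last int }

// parseIntervals parses a compressed index list such as "1-3,5,7-9",
// roughly in the spirit of parseIndexesFromString above.
func parseIntervals(s string, completions int) []interval {
	if s == "" {
		return nil
	}
	var result []interval
	for _, part := range strings.Split(s, ",") {
		limits := strings.Split(part, "-")
		first, err := strconv.Atoi(limits[0])
		if err != nil || first >= completions {
			continue // corrupted or out-of-range entry, ignore
		}
		last := first
		if len(limits) > 1 {
			if last, err = strconv.Atoi(limits[1]); err != nil {
				continue
			}
			if last >= completions {
				last = completions - 1
			}
		}
		result = append(result, interval{first, last})
	}
	return result
}

func main() {
	fmt.Println(parseIntervals("1-3,5,7-9", 8)) // [{1 3} {5 5} {7 7}]
}
```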
@@ -191,20 +242,17 @@ func succeededIndexesFromString(logger klog.Logger, completedIndexes string, com
 }
 
 // firstPendingIndexes returns `count` indexes less than `completions` that are
-// not covered by `activePods` or `succeededIndexes`.
+// not covered by `activePods`, `succeededIndexes` or `failedIndexes`.
 func firstPendingIndexes(jobCtx *syncJobCtx, count, completions int) []int {
 	if count == 0 {
 		return nil
 	}
-	active := sets.New[int]()
-	for _, p := range jobCtx.activePods {
-		ix := getCompletionIndex(p.Annotations)
-		if ix != unknownCompletionIndex {
-			active.Insert(ix)
-		}
-	}
+	active := getIndexes(jobCtx.activePods)
 	result := make([]int, 0, count)
 	nonPending := jobCtx.succeededIndexes.withOrderedIndexes(sets.List(active))
+	if jobCtx.failedIndexes != nil {
+		nonPending = nonPending.merge(*jobCtx.failedIndexes)
+	}
 	// The following algorithm is bounded by len(nonPending) and count.
 	candidate := 0
 	for _, sInterval := range nonPending {
@@ -221,6 +269,18 @@ func firstPendingIndexes(jobCtx *syncJobCtx, count, completions int) []int {
 	return result
 }
 
+// Returns the list of indexes corresponding to the set of pods
+func getIndexes(pods []*v1.Pod) sets.Set[int] {
+	result := sets.New[int]()
+	for _, p := range pods {
+		ix := getCompletionIndex(p.Annotations)
+		if ix != unknownCompletionIndex {
+			result.Insert(ix)
+		}
+	}
+	return result
+}
+
 // appendDuplicatedIndexPodsForRemoval scans active `pods` for duplicated
 // completion indexes. For each index, it selects n-1 pods for removal, where n
 // is the number of repetitions. The pods to be removed are appended to `rm`,
@@ -248,6 +308,69 @@ func appendDuplicatedIndexPodsForRemoval(rm, left, pods []*v1.Pod, completions i
 	return appendPodsWithSameIndexForRemovalAndRemaining(rm, left, pods[firstRepeatPos:countLooped], lastIndex)
 }
 
+// getPodsWithDelayedDeletionPerIndex returns the pod which removal is delayed
+// in order to await for recreation. This map is used when BackoffLimitPerIndex
+// is enabled to delay pod finalizer removal, and thus pod deletion, until the
+// replacement pod is created. The pod deletion is delayed so that the
+// replacement pod can have the batch.kubernetes.io/job-index-failure-count
+// annotation set properly keeping track of the number of failed pods within
+// the index.
+func getPodsWithDelayedDeletionPerIndex(logger klog.Logger, jobCtx *syncJobCtx) map[int]*v1.Pod {
+	// the failed pods corresponding to currently active indexes can be safely
+	// deleted as the failure count annotation is present in the currently
+	// active pods.
+	activeIndexes := getIndexes(jobCtx.activePods)
+
+	podsWithDelayedDeletionPerIndex := make(map[int]*v1.Pod)
+	getValidPodsWithFilter(jobCtx, nil, func(p *v1.Pod) bool {
+		if isPodFailed(p, jobCtx.job) {
+			if ix := getCompletionIndex(p.Annotations); ix != unknownCompletionIndex && ix < int(*jobCtx.job.Spec.Completions) {
+				if jobCtx.succeededIndexes.has(ix) || jobCtx.failedIndexes.has(ix) || activeIndexes.Has(ix) {
+					return false
+				}
+				if lastPodWithDelayedDeletion, ok := podsWithDelayedDeletionPerIndex[ix]; ok {
+					if getIndexAbsoluteFailureCount(logger, lastPodWithDelayedDeletion) <= getIndexAbsoluteFailureCount(logger, p) && !getFinishedTime(p).Before(getFinishedTime(lastPodWithDelayedDeletion)) {
+						podsWithDelayedDeletionPerIndex[ix] = p
+					}
+				} else {
+					podsWithDelayedDeletionPerIndex[ix] = p
+				}
+			}
+		}
+		return false
+	})
+	return podsWithDelayedDeletionPerIndex
+}
+
+func addIndexFailureCountAnnotation(logger klog.Logger, template *v1.PodTemplateSpec, job *batch.Job, podBeingReplaced *v1.Pod) {
+	indexFailureCount, indexIgnoredFailureCount := getNewIndexFailureCounts(logger, job, podBeingReplaced)
+	template.Annotations[batch.JobIndexFailureCountAnnotation] = strconv.Itoa(int(indexFailureCount))
+	if indexIgnoredFailureCount > 0 {
+		template.Annotations[batch.JobIndexIgnoredFailureCountAnnotation] = strconv.Itoa(int(indexIgnoredFailureCount))
+	}
+}
+
+// getNewIndexFailureCount returns the value of the index-failure-count
+// annotation for the new pod being created
+func getNewIndexFailureCounts(logger klog.Logger, job *batch.Job, podBeingReplaced *v1.Pod) (int32, int32) {
+	if podBeingReplaced != nil {
+		indexFailureCount := parseIndexFailureCountAnnotation(logger, podBeingReplaced)
+		indexIgnoredFailureCount := parseIndexFailureIgnoreCountAnnotation(logger, podBeingReplaced)
+		if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && job.Spec.PodFailurePolicy != nil {
+			_, countFailed, _ := matchPodFailurePolicy(job.Spec.PodFailurePolicy, podBeingReplaced)
+			if countFailed {
+				indexFailureCount++
+			} else {
+				indexIgnoredFailureCount++
+			}
+		} else {
+			indexFailureCount++
+		}
+		return indexFailureCount, indexIgnoredFailureCount
+	}
+	return 0, 0
+}
+
 func appendPodsWithSameIndexForRemovalAndRemaining(rm, left, pods []*v1.Pod, ix int) ([]*v1.Pod, []*v1.Pod) {
 	if ix == unknownCompletionIndex {
 		rm = append(rm, pods...)
@@ -281,6 +404,49 @@ func getCompletionIndex(annotations map[string]string) int {
 	return i
 }
 
+// getIndexFailureCount returns the value of the batch.kubernetes.io/job-index-failure-count
+// annotation as int32. It fallbacks to 0 when:
+// - there is no annotation - for example the pod was created when the BackoffLimitPerIndex
+// feature was temporarily disabled, or the annotation was manually removed by the user,
+// - the value of the annotation isn't parsable as int - for example because
+// it was set by a malicious user,
+// - the value of the annotation is negative or greater by int32 - for example
+// because it was set by a malicious user.
+func getIndexFailureCount(logger klog.Logger, pod *v1.Pod) int32 {
+	return parseIndexFailureCountAnnotation(logger, pod)
+}
+
+func getIndexAbsoluteFailureCount(logger klog.Logger, pod *v1.Pod) int32 {
+	return parseIndexFailureCountAnnotation(logger, pod) + parseIndexFailureIgnoreCountAnnotation(logger, pod)
+}
+
+func parseIndexFailureCountAnnotation(logger klog.Logger, pod *v1.Pod) int32 {
+	if value, ok := pod.Annotations[batch.JobIndexFailureCountAnnotation]; ok {
+		return parseInt32(logger, value)
+	}
+	logger.V(3).Info("There is no expected annotation", "annotationKey", batch.JobIndexFailureCountAnnotation, "pod", klog.KObj(pod), "podUID", pod.UID)
+	return 0
+}
+
+func parseIndexFailureIgnoreCountAnnotation(logger klog.Logger, pod *v1.Pod) int32 {
+	if value, ok := pod.Annotations[batch.JobIndexIgnoredFailureCountAnnotation]; ok {
+		return parseInt32(logger, value)
+	}
+	return 0
+}
+
+func parseInt32(logger klog.Logger, vStr string) int32 {
+	if vInt, err := strconv.Atoi(vStr); err != nil {
+		logger.Error(err, "Failed to parse the value", "value", vStr)
+		return 0
+	} else if vInt < 0 || vInt > math.MaxInt32 {
+		logger.Info("The value is invalid", "value", vInt)
+		return 0
+	} else {
+		return int32(vInt)
+	}
+}
+
 func addCompletionIndexEnvVariables(template *v1.PodTemplateSpec) {
 	for i := range template.Spec.InitContainers {
 		addCompletionIndexEnvVariable(&template.Spec.InitContainers[i])
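To make the annotation handoff above concrete: when a failed pod is replaced, the replacement's pod template receives failure-count annotations derived from the pod being replaced, with failures either counted toward the per-index limit or recorded as ignored. The following is a hypothetical, standalone sketch of that bookkeeping (the ignored-count annotation key string is assumed; the commit itself does this via getNewIndexFailureCounts and addIndexFailureCountAnnotation):

```go
package main

import (
	"fmt"
	"strconv"
)

const (
	failureCountKey = "batch.kubernetes.io/job-index-failure-count"
	// Assumed key string; the diff only references the Go constant
	// JobIndexIgnoredFailureCountAnnotation.
	ignoredCountKey = "batch.kubernetes.io/job-index-ignored-failure-count"
)

// nextAnnotations sketches how a replacement pod's annotations could be derived
// from the failed pod it replaces: the failure is either counted against the
// per-index backoff limit or recorded as ignored (e.g. matched an Ignore rule).
func nextAnnotations(old map[string]string, countFailed bool) map[string]string {
	parse := func(key string) int {
		v, err := strconv.Atoi(old[key])
		if err != nil || v < 0 {
			return 0 // missing or invalid values fall back to 0, as in parseInt32
		}
		return v
	}
	failed, ignored := parse(failureCountKey), parse(ignoredCountKey)
	if countFailed {
		failed++
	} else {
		ignored++
	}
	out := map[string]string{failureCountKey: strconv.Itoa(failed)}
	if ignored > 0 {
		out[ignoredCountKey] = strconv.Itoa(ignored)
	}
	return out
}

func main() {
	prev := map[string]string{failureCountKey: "2"}
	fmt.Println(nextAnnotations(prev, true)) // map[batch.kubernetes.io/job-index-failure-count:3]
}
```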
@@ -17,12 +17,20 @@ limitations under the License.
 package job
 
 import (
+	"math"
+	"strconv"
 	"testing"
+	"time"
 
 	"github.com/google/go-cmp/cmp"
 	batch "k8s.io/api/batch/v1"
 	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/apiserver/pkg/util/feature"
+	featuregatetesting "k8s.io/component-base/featuregate/testing"
 	"k8s.io/klog/v2/ktesting"
+	"k8s.io/kubernetes/pkg/controller"
+	"k8s.io/kubernetes/pkg/features"
 	"k8s.io/utils/pointer"
 )
 
@@ -219,6 +227,427 @@ func TestCalculateSucceededIndexes(t *testing.T) {
 	}
 }
 
+func TestIsIndexFailed(t *testing.T) {
+	logger, _ := ktesting.NewTestContext(t)
+	cases := map[string]struct {
+		enableJobPodFailurePolicy bool
+		job batch.Job
+		pod *v1.Pod
+		wantResult bool
+	}{
+		"failed pod exceeding backoffLimitPerIndex, when backoffLimitPerIndex=0": {
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(0),
+				},
+			},
+			pod: buildPod().indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod,
+			wantResult: true,
+		},
+		"failed pod exceeding backoffLimitPerIndex, when backoffLimitPerIndex=1": {
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(1),
+				},
+			},
+			pod: buildPod().indexFailureCount("1").phase(v1.PodFailed).index("1").trackingFinalizer().Pod,
+			wantResult: true,
+		},
+		"matching FailIndex pod failure policy; JobPodFailurePolicy enabled": {
+			enableJobPodFailurePolicy: true,
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(1),
+					PodFailurePolicy: &batch.PodFailurePolicy{
+						Rules: []batch.PodFailurePolicyRule{
+							{
+								Action: batch.PodFailurePolicyActionFailIndex,
+								OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
+									Operator: batch.PodFailurePolicyOnExitCodesOpIn,
+									Values: []int32{3},
+								},
+							},
+						},
+					},
+				},
+			},
+			pod: buildPod().indexFailureCount("0").status(v1.PodStatus{
+				Phase: v1.PodFailed,
+				ContainerStatuses: []v1.ContainerStatus{
+					{
+						State: v1.ContainerState{
+							Terminated: &v1.ContainerStateTerminated{
+								ExitCode: 3,
+							},
+						},
+					},
+				},
+			}).index("0").trackingFinalizer().Pod,
+			wantResult: true,
+		},
+		"matching FailIndex pod failure policy; JobPodFailurePolicy disabled": {
+			enableJobPodFailurePolicy: false,
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(1),
+					PodFailurePolicy: &batch.PodFailurePolicy{
+						Rules: []batch.PodFailurePolicyRule{
+							{
+								Action: batch.PodFailurePolicyActionFailIndex,
+								OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
+									Operator: batch.PodFailurePolicyOnExitCodesOpIn,
+									Values: []int32{3},
+								},
+							},
+						},
+					},
+				},
+			},
+			pod: buildPod().indexFailureCount("0").status(v1.PodStatus{
+				Phase: v1.PodFailed,
+				ContainerStatuses: []v1.ContainerStatus{
+					{
+						State: v1.ContainerState{
+							Terminated: &v1.ContainerStateTerminated{
+								ExitCode: 3,
+							},
+						},
+					},
+				},
+			}).index("0").trackingFinalizer().Pod,
+			wantResult: false,
+		},
+	}
+	for name, tc := range cases {
+		t.Run(name, func(t *testing.T) {
+			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)()
+			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)()
+			gotResult := isIndexFailed(logger, &tc.job, tc.pod)
+			if diff := cmp.Diff(tc.wantResult, gotResult); diff != "" {
+				t.Errorf("Unexpected result (-want,+got):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestCalculateFailedIndexes(t *testing.T) {
+	logger, _ := ktesting.NewTestContext(t)
+	cases := map[string]struct {
+		enableJobPodFailurePolicy bool
+		job batch.Job
+		pods []*v1.Pod
+		wantPrevFailedIndexes orderedIntervals
+		wantFailedIndexes orderedIntervals
+	}{
+		"one new index failed": {
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(1),
+				},
+			},
+			pods: []*v1.Pod{
+				buildPod().indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod,
+				buildPod().indexFailureCount("1").phase(v1.PodFailed).index("1").trackingFinalizer().Pod,
+			},
+			wantFailedIndexes: []interval{{1, 1}},
+		},
+		"pod without finalizer is ignored": {
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(0),
+				},
+			},
+			pods: []*v1.Pod{
+				buildPod().indexFailureCount("0").phase(v1.PodFailed).index("0").Pod,
+			},
+			wantFailedIndexes: nil,
+		},
+		"pod outside completions is ignored": {
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(0),
+				},
+			},
+			pods: []*v1.Pod{
+				buildPod().indexFailureCount("0").phase(v1.PodFailed).index("3").Pod,
+			},
+			wantFailedIndexes: nil,
+		},
+		"extend the failed indexes": {
+			job: batch.Job{
+				Status: batch.JobStatus{
+					FailedIndexes: pointer.String("0"),
+				},
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(0),
+				},
+			},
+			pods: []*v1.Pod{
+				buildPod().indexFailureCount("0").phase(v1.PodFailed).index("1").trackingFinalizer().Pod,
+			},
+			wantFailedIndexes: []interval{{0, 1}},
+		},
+		"prev failed indexes empty": {
+			job: batch.Job{
+				Status: batch.JobStatus{
+					FailedIndexes: pointer.String(""),
+				},
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(0),
+				},
+			},
+			pods: []*v1.Pod{
+				buildPod().indexFailureCount("0").phase(v1.PodFailed).index("1").trackingFinalizer().Pod,
+			},
+			wantFailedIndexes: []interval{{1, 1}},
+		},
+		"prev failed indexes outside the completions": {
+			job: batch.Job{
+				Status: batch.JobStatus{
+					FailedIndexes: pointer.String("9"),
+				},
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(0),
+				},
+			},
+			pods: []*v1.Pod{
+				buildPod().indexFailureCount("0").phase(v1.PodFailed).index("1").trackingFinalizer().Pod,
+			},
+			wantFailedIndexes: []interval{{1, 1}},
+		},
+	}
+	for name, tc := range cases {
+		t.Run(name, func(t *testing.T) {
+			failedIndexes := calculateFailedIndexes(logger, &tc.job, tc.pods)
+			if diff := cmp.Diff(&tc.wantFailedIndexes, failedIndexes); diff != "" {
+				t.Errorf("Unexpected failed indexes (-want,+got):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestGetPodsWithDelayedDeletionPerIndex(t *testing.T) {
+	logger, _ := ktesting.NewTestContext(t)
+	now := time.Now()
+	cases := map[string]struct {
+		enableJobPodFailurePolicy bool
+		job batch.Job
+		pods []*v1.Pod
+		expectedRmFinalizers sets.Set[string]
+		wantPodsWithDelayedDeletionPerIndex []string
+	}{
+		"failed pods are kept corresponding to non-failed indexes are kept": {
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(3),
+					BackoffLimitPerIndex: pointer.Int32(1),
+				},
+			},
+			pods: []*v1.Pod{
+				buildPod().uid("a").indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod,
+				buildPod().uid("b").indexFailureCount("1").phase(v1.PodFailed).index("1").trackingFinalizer().Pod,
+				buildPod().uid("c").indexFailureCount("0").phase(v1.PodFailed).index("2").trackingFinalizer().Pod,
+			},
+			wantPodsWithDelayedDeletionPerIndex: []string{"a", "c"},
+		},
+		"failed pod without finalizer; the pod's deletion is not delayed as it already started": {
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(0),
+				},
+			},
+			pods: []*v1.Pod{
+				buildPod().uid("a").indexFailureCount("0").phase(v1.PodFailed).index("0").Pod,
+			},
+			wantPodsWithDelayedDeletionPerIndex: []string{},
+		},
+		"failed pod with expected finalizer removal; the pod's deletion is not delayed as it already started": {
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(0),
+				},
+			},
+			pods: []*v1.Pod{
+				buildPod().uid("a").indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod,
+			},
+			expectedRmFinalizers: sets.New("a"),
+			wantPodsWithDelayedDeletionPerIndex: []string{},
+		},
+		"failed pod with index outside of completions; the pod's deletion is not delayed": {
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(0),
+				},
+			},
+			pods: []*v1.Pod{
+				buildPod().uid("a").indexFailureCount("0").phase(v1.PodFailed).index("4").trackingFinalizer().Pod,
+			},
+			wantPodsWithDelayedDeletionPerIndex: []string{},
+		},
+		"failed pod for active index; the pod's deletion is not delayed as it is already replaced": {
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(1),
+				},
+			},
+			pods: []*v1.Pod{
+				buildPod().uid("a1").indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod,
+				buildPod().uid("a2").indexFailureCount("1").phase(v1.PodRunning).index("0").trackingFinalizer().Pod,
+			},
+			wantPodsWithDelayedDeletionPerIndex: []string{},
+		},
+		"failed pod for succeeded index; the pod's deletion is not delayed as it is already replaced": {
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(1),
+				},
+			},
+			pods: []*v1.Pod{
+				buildPod().uid("a1").indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod,
+				buildPod().uid("a2").indexFailureCount("1").phase(v1.PodSucceeded).index("0").trackingFinalizer().Pod,
+			},
+			wantPodsWithDelayedDeletionPerIndex: []string{},
+		},
+		"multiple failed pods for index with different failure count; only the pod with highest failure count is kept": {
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(4),
+				},
+			},
+			pods: []*v1.Pod{
+				buildPod().uid("a1").indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod,
+				buildPod().uid("a3").indexFailureCount("2").phase(v1.PodFailed).index("0").trackingFinalizer().Pod,
+				buildPod().uid("a2").indexFailureCount("1").phase(v1.PodFailed).index("0").trackingFinalizer().Pod,
+			},
+			wantPodsWithDelayedDeletionPerIndex: []string{"a3"},
+		},
+		"multiple failed pods for index with different finish times; only the last failed pod is kept": {
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					Completions: pointer.Int32(2),
+					BackoffLimitPerIndex: pointer.Int32(4),
+				},
+			},
+			pods: []*v1.Pod{
+				buildPod().uid("a1").indexFailureCount("1").phase(v1.PodFailed).index("0").customDeletionTimestamp(now.Add(-time.Second)).trackingFinalizer().Pod,
+				buildPod().uid("a3").indexFailureCount("1").phase(v1.PodFailed).index("0").customDeletionTimestamp(now).trackingFinalizer().Pod,
+				buildPod().uid("a2").indexFailureCount("1").phase(v1.PodFailed).index("0").customDeletionTimestamp(now.Add(-2 * time.Second)).trackingFinalizer().Pod,
+			},
+			wantPodsWithDelayedDeletionPerIndex: []string{"a3"},
+		},
+	}
+	for name, tc := range cases {
+		t.Run(name, func(t *testing.T) {
+			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)()
+			activePods := controller.FilterActivePods(logger, tc.pods)
+			failedIndexes := calculateFailedIndexes(logger, &tc.job, tc.pods)
+			_, succeededIndexes := calculateSucceededIndexes(logger, &tc.job, tc.pods)
+			jobCtx := &syncJobCtx{
+				job: &tc.job,
+				pods: tc.pods,
+				activePods: activePods,
+				succeededIndexes: succeededIndexes,
+				failedIndexes: failedIndexes,
+				expectedRmFinalizers: tc.expectedRmFinalizers,
+			}
+			gotPodsWithDelayedDeletionPerIndex := getPodsWithDelayedDeletionPerIndex(logger, jobCtx)
+			gotPodsWithDelayedDeletionPerIndexSet := sets.New[string]()
+			for _, pod := range gotPodsWithDelayedDeletionPerIndex {
+				gotPodsWithDelayedDeletionPerIndexSet.Insert(string(pod.UID))
+			}
+			if diff := cmp.Diff(tc.wantPodsWithDelayedDeletionPerIndex, sets.List(gotPodsWithDelayedDeletionPerIndexSet)); diff != "" {
+				t.Errorf("Unexpected set of pods with delayed deletion (-want,+got):\n%s", diff)
+			}
+		})
+	}
+}
+
+func TestGetNewIndexFailureCountValue(t *testing.T) {
+	logger, _ := ktesting.NewTestContext(t)
+	cases := map[string]struct {
+		enableJobPodFailurePolicy bool
+		job batch.Job
+		pod *v1.Pod
+		wantNewIndexFailureCount int32
+		wantNewIndexIgnoredFailureCount int32
+	}{
+		"first pod created": {
+			job: batch.Job{},
+			wantNewIndexFailureCount: 0,
+		},
+		"failed pod being replaced with 0 index failure count": {
+			job: batch.Job{},
+			pod: buildPod().uid("a").indexFailureCount("0").phase(v1.PodFailed).index("0").trackingFinalizer().Pod,
+			wantNewIndexFailureCount: 1,
+		},
+		"failed pod being replaced with >0 index failure count": {
+			job: batch.Job{},
+			pod: buildPod().uid("a").indexFailureCount("3").phase(v1.PodFailed).index("0").trackingFinalizer().Pod,
+			wantNewIndexFailureCount: 4,
+		},
+		"failed pod being replaced, matching the ignore rule; JobPodFailurePolicy enabled": {
+			enableJobPodFailurePolicy: true,
+			job: batch.Job{
+				Spec: batch.JobSpec{
+					PodFailurePolicy: &batch.PodFailurePolicy{
+						Rules: []batch.PodFailurePolicyRule{
+							{
+								Action: batch.PodFailurePolicyActionIgnore,
+								OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
+									{
+										Type: v1.DisruptionTarget,
+										Status: v1.ConditionTrue,
+									},
+								},
+							},
+						},
+					},
+				},
+			},
+			pod: buildPod().uid("a").indexFailureCount("3").status(v1.PodStatus{
+				Phase: v1.PodFailed,
+				Conditions: []v1.PodCondition{
+					{
+						Type: v1.DisruptionTarget,
+						Status: v1.ConditionTrue,
+					},
+				},
+			}).index("3").trackingFinalizer().Pod,
+			wantNewIndexFailureCount: 3,
+			wantNewIndexIgnoredFailureCount: 1,
+		},
+	}
+	for name, tc := range cases {
+		t.Run(name, func(t *testing.T) {
+			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)()
+			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)()
+			gotNewIndexFailureCount, gotNewIndexIgnoredFailureCount := getNewIndexFailureCounts(logger, &tc.job, tc.pod)
+			if diff := cmp.Diff(tc.wantNewIndexFailureCount, gotNewIndexFailureCount); diff != "" {
+				t.Errorf("Unexpected set of pods with delayed deletion (-want,+got):\n%s", diff)
+			}
+			if diff := cmp.Diff(tc.wantNewIndexIgnoredFailureCount, gotNewIndexIgnoredFailureCount); diff != "" {
+				t.Errorf("Unexpected set of pods with delayed deletion (-want,+got):\n%s", diff)
+			}
+		})
+	}
+}
+
 func TestIntervalsHaveIndex(t *testing.T) {
 	cases := map[string]struct {
 		intervals orderedIntervals
@@ -267,6 +696,7 @@ func TestFirstPendingIndexes(t *testing.T) {
 		completions int
 		activePods []indexPhase
 		succeededIndexes []interval
+		failedIndexes *orderedIntervals
 		want []int
 	}{
 		"cnt greater than completions": {
@@ -310,12 +740,24 @@ func TestFirstPendingIndexes(t *testing.T) {
 			completions: 20,
 			want: []int{0, 1, 6, 7, 10},
 		},
+		"with failed indexes": {
+			activePods: []indexPhase{
+				{"3", v1.PodPending},
+				{"9", v1.PodPending},
+			},
+			succeededIndexes: []interval{{1, 1}, {5, 5}, {9, 9}},
+			failedIndexes: &orderedIntervals{{2, 2}, {6, 7}},
+			cnt: 5,
+			completions: 20,
+			want: []int{0, 4, 8, 10, 11},
+		},
 	}
 	for name, tc := range cases {
 		t.Run(name, func(t *testing.T) {
 			jobCtx := &syncJobCtx{
 				activePods: hollowPodsWithIndexPhase(tc.activePods),
 				succeededIndexes: tc.succeededIndexes,
+				failedIndexes: tc.failedIndexes,
 			}
 			got := firstPendingIndexes(jobCtx, tc.cnt, tc.completions)
 			if diff := cmp.Diff(tc.want, got); diff != "" {
@@ -446,6 +888,47 @@ func TestPodGenerateNameWithIndex(t *testing.T) {
 	}
 }
 
+func TestGetIndexFailureCount(t *testing.T) {
+	logger, _ := ktesting.NewTestContext(t)
+	cases := map[string]struct {
+		pod *v1.Pod
+		wantResult int32
+	}{
+		"no annotation": {
+			pod: &v1.Pod{},
+			wantResult: 0,
+		},
+		"valid value": {
+			pod: buildPod().indexFailureCount("2").Pod,
+			wantResult: 2,
+		},
+		"valid maxint32 value": {
+			pod: buildPod().indexFailureCount(strconv.Itoa(math.MaxInt32)).Pod,
+			wantResult: math.MaxInt32,
+		},
+		"too large value": {
+			pod: buildPod().indexFailureCount(strconv.Itoa(math.MaxInt32 + 1)).Pod,
+			wantResult: 0,
+		},
+		"negative value": {
+			pod: buildPod().indexFailureCount("-1").Pod,
+			wantResult: 0,
+		},
+		"invalid int value": {
+			pod: buildPod().indexFailureCount("xyz").Pod,
+			wantResult: 0,
+		},
+	}
+	for name, tc := range cases {
+		t.Run(name, func(t *testing.T) {
+			gotResult := getIndexFailureCount(logger, tc.pod)
+			if diff := cmp.Equal(tc.wantResult, gotResult); !diff {
+				t.Errorf("Unexpected result. want: %d, got: %d", tc.wantResult, gotResult)
+			}
+		})
+	}
+}
+
 func hollowPodsWithIndexPhase(descs []indexPhase) []*v1.Pod {
 	pods := make([]*v1.Pod, 0, len(descs))
 	for _, desc := range descs {
@ -132,16 +132,18 @@ type Controller struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type syncJobCtx struct {
|
type syncJobCtx struct {
|
||||||
job *batch.Job
|
job *batch.Job
|
||||||
pods []*v1.Pod
|
pods []*v1.Pod
|
||||||
finishedCondition *batch.JobCondition
|
finishedCondition *batch.JobCondition
|
||||||
activePods []*v1.Pod
|
activePods []*v1.Pod
|
||||||
succeeded int32
|
succeeded int32
|
||||||
prevSucceededIndexes orderedIntervals
|
prevSucceededIndexes orderedIntervals
|
||||||
succeededIndexes orderedIntervals
|
succeededIndexes orderedIntervals
|
||||||
newBackoffRecord backoffRecord
|
failedIndexes *orderedIntervals
|
||||||
expectedRmFinalizers sets.Set[string]
|
newBackoffRecord backoffRecord
|
||||||
uncounted *uncountedTerminatedPods
|
expectedRmFinalizers sets.Set[string]
|
||||||
|
uncounted *uncountedTerminatedPods
|
||||||
|
podsWithDelayedDeletionPerIndex map[int]*v1.Pod
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewController creates a new Job controller that keeps the relevant pods
|
// NewController creates a new Job controller that keeps the relevant pods
|
||||||
@ -835,6 +837,17 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
|
|||||||
if isIndexedJob(&job) {
|
if isIndexedJob(&job) {
|
||||||
jobCtx.prevSucceededIndexes, jobCtx.succeededIndexes = calculateSucceededIndexes(logger, &job, pods)
|
jobCtx.prevSucceededIndexes, jobCtx.succeededIndexes = calculateSucceededIndexes(logger, &job, pods)
|
||||||
jobCtx.succeeded = int32(jobCtx.succeededIndexes.total())
|
jobCtx.succeeded = int32(jobCtx.succeededIndexes.total())
|
||||||
|
if hasBackoffLimitPerIndex(&job) {
|
||||||
|
jobCtx.failedIndexes = calculateFailedIndexes(logger, &job, pods)
|
||||||
|
if jobCtx.finishedCondition == nil {
|
||||||
|
if job.Spec.MaxFailedIndexes != nil && jobCtx.failedIndexes.total() > int(*job.Spec.MaxFailedIndexes) {
|
||||||
|
jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "MaxFailedIndexesExceeded", "Job has exceeded the specified maximal number of failed indexes", jm.clock.Now())
|
||||||
|
} else if jobCtx.failedIndexes.total() > 0 && jobCtx.failedIndexes.total()+jobCtx.succeededIndexes.total() >= int(*job.Spec.Completions) {
|
||||||
|
jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "FailedIndexes", "Job has failed indexes", jm.clock.Now())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
jobCtx.podsWithDelayedDeletionPerIndex = getPodsWithDelayedDeletionPerIndex(logger, jobCtx)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
suspendCondChanged := false
|
suspendCondChanged := false
|
||||||
// Remove active pods if Job failed.
|
// Remove active pods if Job failed.
|
||||||
@ -1017,9 +1030,10 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
considerPodFailed := isPodFailed(pod, jobCtx.job)
|
considerPodFailed := isPodFailed(pod, jobCtx.job)
|
||||||
if podutil.IsPodTerminal(pod) || considerPodFailed || jobCtx.finishedCondition != nil || jobCtx.job.DeletionTimestamp != nil {
|
if !canRemoveFinalizer(logger, jobCtx, pod, considerPodFailed) {
|
||||||
podsToRemoveFinalizer = append(podsToRemoveFinalizer, pod)
|
continue
|
||||||
}
|
}
|
||||||
|
podsToRemoveFinalizer = append(podsToRemoveFinalizer, pod)
|
||||||
if pod.Status.Phase == v1.PodSucceeded && !jobCtx.uncounted.failed.Has(string(pod.UID)) {
|
if pod.Status.Phase == v1.PodSucceeded && !jobCtx.uncounted.failed.Has(string(pod.UID)) {
|
||||||
if isIndexed {
|
if isIndexed {
|
||||||
// The completion index is enough to avoid recounting succeeded pods.
|
// The completion index is enough to avoid recounting succeeded pods.
|
||||||
@ -1073,6 +1087,14 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
|
|||||||
}
|
}
|
||||||
jobCtx.job.Status.Succeeded = int32(jobCtx.succeededIndexes.total())
|
jobCtx.job.Status.Succeeded = int32(jobCtx.succeededIndexes.total())
|
||||||
jobCtx.job.Status.CompletedIndexes = succeededIndexesStr
|
jobCtx.job.Status.CompletedIndexes = succeededIndexesStr
|
||||||
|
var failedIndexesStr *string
|
||||||
|
if jobCtx.failedIndexes != nil {
|
||||||
|
failedIndexesStr = pointer.String(jobCtx.failedIndexes.String())
|
||||||
|
}
|
||||||
|
if !pointer.StringEqual(jobCtx.job.Status.FailedIndexes, failedIndexesStr) {
|
||||||
|
jobCtx.job.Status.FailedIndexes = failedIndexesStr
|
||||||
|
needsFlush = true
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
|
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
|
||||||
if jobCtx.finishedCondition != nil && jobCtx.finishedCondition.Type == batch.JobFailureTarget {
|
if jobCtx.finishedCondition != nil && jobCtx.finishedCondition.Type == batch.JobFailureTarget {
|
||||||
@ -1106,6 +1128,32 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// canRemoveFinalizer determines if the pod's finalizer can be safely removed.
|
||||||
|
// The finalizer can be removed when:
|
||||||
|
// - the entire Job is terminating; or
|
||||||
|
// - the pod's index is succeeded; or
|
||||||
|
// - the Pod is considered failed, unless it's removal is delayed for the
|
||||||
|
// purpose of transferring the JobIndexFailureCount annotations to the
|
||||||
|
// replacement pod. the entire Job is terminating the finalizer can be
|
||||||
|
// removed unconditionally.
|
||||||
|
func canRemoveFinalizer(logger klog.Logger, jobCtx *syncJobCtx, pod *v1.Pod, considerPodFailed bool) bool {
|
||||||
|
if jobCtx.job.DeletionTimestamp != nil || jobCtx.finishedCondition != nil || pod.Status.Phase == v1.PodSucceeded {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
if !considerPodFailed {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if hasBackoffLimitPerIndex(jobCtx.job) {
|
||||||
|
if index := getCompletionIndex(pod.Annotations); index != unknownCompletionIndex {
|
||||||
|
if p, ok := jobCtx.podsWithDelayedDeletionPerIndex[index]; ok && p.UID == pod.UID {
|
||||||
|
logger.V(3).Info("Delaying pod finalizer removal to await for pod recreation within the index", "pod", klog.KObj(pod))
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
// flushUncountedAndRemoveFinalizers does:
|
// flushUncountedAndRemoveFinalizers does:
|
||||||
// 1. flush the Job status that might include new uncounted Pod UIDs. Also flush the interim FailureTarget condition
|
// 1. flush the Job status that might include new uncounted Pod UIDs. Also flush the interim FailureTarget condition
|
||||||
// if present.
|
// if present.
|
||||||
@ -1443,7 +1491,11 @@ func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, jobCtx *syn
|
|||||||
}
|
}
|
||||||
|
|
||||||
if active < wantActive {
|
if active < wantActive {
|
||||||
remainingTime := jobCtx.newBackoffRecord.getRemainingTime(jm.clock, DefaultJobPodFailureBackOff, MaxJobPodFailureBackOff)
|
var remainingTime time.Duration
|
||||||
|
if !hasBackoffLimitPerIndex(job) {
|
||||||
|
// we compute the global remaining time for pod creation when backoffLimitPerIndex is not used
|
||||||
|
remainingTime = jobCtx.newBackoffRecord.getRemainingTime(jm.clock, DefaultJobPodFailureBackOff, MaxJobPodFailureBackOff)
|
||||||
|
}
|
||||||
if remainingTime > 0 {
|
if remainingTime > 0 {
|
||||||
jm.enqueueSyncJobWithDelay(logger, job, remainingTime)
|
jm.enqueueSyncJobWithDelay(logger, job, remainingTime)
|
||||||
return 0, metrics.JobSyncActionPodsCreated, nil
|
return 0, metrics.JobSyncActionPodsCreated, nil
|
||||||
@ -1456,6 +1508,13 @@ func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, jobCtx *syn
|
|||||||
var indexesToAdd []int
|
var indexesToAdd []int
|
||||||
if isIndexedJob(job) {
|
if isIndexedJob(job) {
|
||||||
indexesToAdd = firstPendingIndexes(jobCtx, int(diff), int(*job.Spec.Completions))
|
indexesToAdd = firstPendingIndexes(jobCtx, int(diff), int(*job.Spec.Completions))
|
||||||
|
if hasBackoffLimitPerIndex(job) {
|
||||||
|
indexesToAdd, remainingTime = jm.getPodCreationInfoForIndependentIndexes(logger, indexesToAdd, jobCtx.podsWithDelayedDeletionPerIndex)
|
||||||
|
if remainingTime > 0 {
|
||||||
|
jm.enqueueSyncJobWithDelay(logger, job, remainingTime)
|
||||||
|
return 0, metrics.JobSyncActionPodsCreated, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
diff = int32(len(indexesToAdd))
|
diff = int32(len(indexesToAdd))
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1502,6 +1561,9 @@ func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, jobCtx *syn
|
|||||||
}
|
}
|
||||||
template.Spec.Hostname = fmt.Sprintf("%s-%d", job.Name, completionIndex)
|
template.Spec.Hostname = fmt.Sprintf("%s-%d", job.Name, completionIndex)
|
||||||
generateName = podGenerateNameWithIndex(job.Name, completionIndex)
|
generateName = podGenerateNameWithIndex(job.Name, completionIndex)
|
||||||
|
if hasBackoffLimitPerIndex(job) {
|
||||||
|
addIndexFailureCountAnnotation(logger, template, job, jobCtx.podsWithDelayedDeletionPerIndex[completionIndex])
|
||||||
|
}
|
||||||
}
|
}
|
||||||
defer wait.Done()
|
defer wait.Done()
|
||||||
err := jm.podControl.CreatePodsWithGenerateName(ctx, job.Namespace, template, job, metav1.NewControllerRef(job, controllerKind), generateName)
|
err := jm.podControl.CreatePodsWithGenerateName(ctx, job.Namespace, template, job, metav1.NewControllerRef(job, controllerKind), generateName)
|
||||||
@ -1544,6 +1606,26 @@ func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, jobCtx *syn
return active, metrics.JobSyncActionTracking, nil
}

// getPodCreationInfoForIndependentIndexes returns a sub-list of all indexes
// to create that contains those which can already be created. In case no indexes
// are ready to create pods, it returns the lowest remaining time to create pods
// out of all indexes.
func (jm *Controller) getPodCreationInfoForIndependentIndexes(logger klog.Logger, indexesToAdd []int, podsWithDelayedDeletionPerIndex map[int]*v1.Pod) ([]int, time.Duration) {
var indexesToAddNow []int
var minRemainingTimePerIndex *time.Duration
for _, indexToAdd := range indexesToAdd {
if remainingTimePerIndex := getRemainingTimePerIndex(logger, jm.clock, DefaultJobPodFailureBackOff, MaxJobPodFailureBackOff, podsWithDelayedDeletionPerIndex[indexToAdd]); remainingTimePerIndex == 0 {
indexesToAddNow = append(indexesToAddNow, indexToAdd)
} else if minRemainingTimePerIndex == nil || remainingTimePerIndex < *minRemainingTimePerIndex {
minRemainingTimePerIndex = &remainingTimePerIndex
}
}
if len(indexesToAddNow) > 0 {
return indexesToAddNow, 0
}
return indexesToAddNow, pointer.DurationDeref(minRemainingTimePerIndex, 0)
}
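To make the selection logic above easier to follow outside the diff, here is a minimal, self-contained sketch of the same pattern. splitReadyIndexes, its map input, and the example values are hypothetical names used only for illustration, not controller code: the caller gets either the indexes whose per-index backoff already expired, or the shortest remaining wait to requeue with.

package main

import (
	"fmt"
	"time"
)

// splitReadyIndexes partitions indexes into those whose remaining backoff is
// zero (replacement pods can be created now) and, when none are ready, reports
// the shortest remaining wait so the caller can requeue with that delay.
func splitReadyIndexes(remaining map[int]time.Duration, indexes []int) ([]int, time.Duration) {
	var ready []int
	var minWait time.Duration
	for _, ix := range indexes {
		if wait := remaining[ix]; wait == 0 {
			ready = append(ready, ix)
		} else if minWait == 0 || wait < minWait {
			minWait = wait
		}
	}
	if len(ready) > 0 {
		return ready, 0
	}
	return ready, minWait
}

func main() {
	ready, wait := splitReadyIndexes(map[int]time.Duration{0: 10 * time.Second, 1: 0}, []int{0, 1})
	fmt.Println(ready, wait) // [1] 0s
}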
// activePodsForRemoval returns Pods that should be removed because there
// are too many pods running or, if this is an indexed job, there are repeated
// indexes or invalid indexes or some pods don't have indexes.
@ -1735,7 +1817,7 @@ func recordJobPodFinished(logger klog.Logger, job *batch.Job, oldCounters batch.
// now out of range (i.e. index >= spec.Completions).
if isIndexedJob(job) {
if job.Status.CompletedIndexes != oldCounters.CompletedIndexes {
diff = succeededIndexesFromString(logger, job.Status.CompletedIndexes, int(*job.Spec.Completions)).total() - succeededIndexesFromString(logger, oldCounters.CompletedIndexes, int(*job.Spec.Completions)).total()
diff = parseIndexesFromString(logger, job.Status.CompletedIndexes, int(*job.Spec.Completions)).total() - parseIndexesFromString(logger, oldCounters.CompletedIndexes, int(*job.Spec.Completions)).total()
}
} else {
diff = int(job.Status.Succeeded) - int(oldCounters.Succeeded)
@ -20,6 +20,7 @@ import (
"context"
"errors"
"fmt"
"math"
"sort"
"strconv"
"testing"
@ -1128,6 +1129,9 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
wantStatusUpdates []batch.JobStatus
wantSucceededPodsMetric int
wantFailedPodsMetric int

// features
enableJobBackoffLimitPerIndex bool
}{
"no updates": {},
"new active": {
@ -1649,9 +1653,91 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
},
|
},
|
||||||
wantFailedPodsMetric: 2,
|
wantFailedPodsMetric: 2,
|
||||||
},
|
},
|
||||||
|
"indexed job with a failed pod with delayed finalizer removal; the pod is not counted": {
|
||||||
|
enableJobBackoffLimitPerIndex: true,
|
||||||
|
job: batch.Job{
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
CompletionMode: &indexedCompletion,
|
||||||
|
Completions: pointer.Int32(6),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(1),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []*v1.Pod{
|
||||||
|
buildPod().uid("a").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().index("1").Pod,
|
||||||
|
},
|
||||||
|
wantStatusUpdates: []batch.JobStatus{
|
||||||
|
{
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
FailedIndexes: pointer.String(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"indexed job with a failed pod which is recreated by a running pod; the pod is counted": {
|
||||||
|
enableJobBackoffLimitPerIndex: true,
|
||||||
|
job: batch.Job{
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
CompletionMode: &indexedCompletion,
|
||||||
|
Completions: pointer.Int32(6),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(1),
|
||||||
|
},
|
||||||
|
Status: batch.JobStatus{
|
||||||
|
Active: 1,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []*v1.Pod{
|
||||||
|
buildPod().uid("a1").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().index("1").Pod,
|
||||||
|
buildPod().uid("a2").phase(v1.PodRunning).indexFailureCount("1").trackingFinalizer().index("1").Pod,
|
||||||
|
},
|
||||||
|
wantRmFinalizers: 1,
|
||||||
|
wantStatusUpdates: []batch.JobStatus{
|
||||||
|
{
|
||||||
|
Active: 1,
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{
|
||||||
|
Failed: []types.UID{"a1"},
|
||||||
|
},
|
||||||
|
FailedIndexes: pointer.String(""),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Active: 1,
|
||||||
|
Failed: 1,
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
FailedIndexes: pointer.String(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantFailedPodsMetric: 1,
|
||||||
|
},
|
||||||
|
"indexed job with a failed pod for a failed index; the pod is counted": {
|
||||||
|
enableJobBackoffLimitPerIndex: true,
|
||||||
|
job: batch.Job{
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
CompletionMode: &indexedCompletion,
|
||||||
|
Completions: pointer.Int32(6),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(1),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []*v1.Pod{
|
||||||
|
buildPod().uid("a").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().index("1").Pod,
|
||||||
|
},
|
||||||
|
wantRmFinalizers: 1,
|
||||||
|
wantStatusUpdates: []batch.JobStatus{
|
||||||
|
{
|
||||||
|
FailedIndexes: pointer.String("1"),
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{
|
||||||
|
Failed: []types.UID{"a"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Failed: 1,
|
||||||
|
FailedIndexes: pointer.String("1"),
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantFailedPodsMetric: 1,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
for name, tc := range cases {
|
for name, tc := range cases {
|
||||||
t.Run(name, func(t *testing.T) {
|
t.Run(name, func(t *testing.T) {
|
||||||
|
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)()
|
||||||
clientSet := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
|
clientSet := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
|
||||||
manager, _ := newControllerFromClient(ctx, clientSet, controller.NoResyncPeriodFunc)
|
manager, _ := newControllerFromClient(ctx, clientSet, controller.NoResyncPeriodFunc)
|
||||||
fakePodControl := controller.FakePodControl{Err: tc.podControlErr}
|
fakePodControl := controller.FakePodControl{Err: tc.podControlErr}
|
||||||
@ -1666,20 +1752,22 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
if job.Status.UncountedTerminatedPods == nil {
|
if job.Status.UncountedTerminatedPods == nil {
|
||||||
job.Status.UncountedTerminatedPods = &batch.UncountedTerminatedPods{}
|
job.Status.UncountedTerminatedPods = &batch.UncountedTerminatedPods{}
|
||||||
}
|
}
|
||||||
uncounted := newUncountedTerminatedPods(*job.Status.UncountedTerminatedPods)
|
|
||||||
var succeededIndexes orderedIntervals
|
|
||||||
if isIndexedJob(job) {
|
|
||||||
succeededIndexes = succeededIndexesFromString(logger, job.Status.CompletedIndexes, int(*job.Spec.Completions))
|
|
||||||
}
|
|
||||||
jobCtx := &syncJobCtx{
|
jobCtx := &syncJobCtx{
|
||||||
job: job,
|
job: job,
|
||||||
pods: tc.pods,
|
pods: tc.pods,
|
||||||
succeededIndexes: succeededIndexes,
|
uncounted: newUncountedTerminatedPods(*job.Status.UncountedTerminatedPods),
|
||||||
uncounted: uncounted,
|
|
||||||
expectedRmFinalizers: tc.expectedRmFinalizers,
|
expectedRmFinalizers: tc.expectedRmFinalizers,
|
||||||
finishedCondition: tc.finishedCond,
|
finishedCondition: tc.finishedCond,
|
||||||
newBackoffRecord: backoffRecord{},
|
|
||||||
}
|
}
|
||||||
|
if isIndexedJob(job) {
|
||||||
|
jobCtx.succeededIndexes = parseIndexesFromString(logger, job.Status.CompletedIndexes, int(*job.Spec.Completions))
|
||||||
|
if tc.enableJobBackoffLimitPerIndex && job.Spec.BackoffLimitPerIndex != nil {
|
||||||
|
jobCtx.failedIndexes = calculateFailedIndexes(logger, job, tc.pods)
|
||||||
|
jobCtx.activePods = controller.FilterActivePods(logger, tc.pods)
|
||||||
|
jobCtx.podsWithDelayedDeletionPerIndex = getPodsWithDelayedDeletionPerIndex(logger, jobCtx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
err := manager.trackJobStatusAndRemoveFinalizers(ctx, jobCtx, tc.needsFlush)
|
err := manager.trackJobStatusAndRemoveFinalizers(ctx, jobCtx, tc.needsFlush)
|
||||||
if !errors.Is(err, tc.wantErr) {
|
if !errors.Is(err, tc.wantErr) {
|
||||||
t.Errorf("Got error %v, want %v", err, tc.wantErr)
|
t.Errorf("Got error %v, want %v", err, tc.wantErr)
|
||||||
@ -3123,6 +3211,484 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
|
||||||
|
_, ctx := ktesting.NewTestContext(t)
|
||||||
|
now := time.Now()
|
||||||
|
validObjectMeta := metav1.ObjectMeta{
|
||||||
|
Name: "foobar",
|
||||||
|
UID: uuid.NewUUID(),
|
||||||
|
Namespace: metav1.NamespaceDefault,
|
||||||
|
}
|
||||||
|
validSelector := &metav1.LabelSelector{
|
||||||
|
MatchLabels: map[string]string{"foo": "bar"},
|
||||||
|
}
|
||||||
|
validTemplate := v1.PodTemplateSpec{
|
||||||
|
ObjectMeta: metav1.ObjectMeta{
|
||||||
|
Labels: map[string]string{
|
||||||
|
"foo": "bar",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
Spec: v1.PodSpec{
|
||||||
|
Containers: []v1.Container{
|
||||||
|
{Image: "foo/bar"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
testCases := map[string]struct {
|
||||||
|
enableJobBackoffLimitPerIndex bool
|
||||||
|
enableJobPodFailurePolicy bool
|
||||||
|
job batch.Job
|
||||||
|
pods []v1.Pod
|
||||||
|
wantStatus batch.JobStatus
|
||||||
|
}{
|
||||||
|
"successful job after a single failure within index": {
|
||||||
|
enableJobBackoffLimitPerIndex: true,
|
||||||
|
job: batch.Job{
|
||||||
|
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||||
|
ObjectMeta: validObjectMeta,
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
Selector: validSelector,
|
||||||
|
Template: validTemplate,
|
||||||
|
Parallelism: pointer.Int32(2),
|
||||||
|
Completions: pointer.Int32(2),
|
||||||
|
BackoffLimit: pointer.Int32(math.MaxInt32),
|
||||||
|
CompletionMode: completionModePtr(batch.IndexedCompletion),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(1),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []v1.Pod{
|
||||||
|
*buildPod().uid("a1").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod,
|
||||||
|
*buildPod().uid("a2").index("0").phase(v1.PodSucceeded).indexFailureCount("1").trackingFinalizer().Pod,
|
||||||
|
*buildPod().uid("b").index("1").phase(v1.PodSucceeded).indexFailureCount("0").trackingFinalizer().Pod,
|
||||||
|
},
|
||||||
|
wantStatus: batch.JobStatus{
|
||||||
|
Failed: 1,
|
||||||
|
Succeeded: 2,
|
||||||
|
CompletedIndexes: "0,1",
|
||||||
|
FailedIndexes: pointer.String(""),
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
Conditions: []batch.JobCondition{
|
||||||
|
{
|
||||||
|
Type: batch.JobComplete,
|
||||||
|
Status: v1.ConditionTrue,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"single failed pod, not counted as the replacement pod creation is delayed": {
|
||||||
|
enableJobBackoffLimitPerIndex: true,
|
||||||
|
job: batch.Job{
|
||||||
|
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||||
|
ObjectMeta: validObjectMeta,
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
Selector: validSelector,
|
||||||
|
Template: validTemplate,
|
||||||
|
Parallelism: pointer.Int32(2),
|
||||||
|
Completions: pointer.Int32(2),
|
||||||
|
BackoffLimit: pointer.Int32(math.MaxInt32),
|
||||||
|
CompletionMode: completionModePtr(batch.IndexedCompletion),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(1),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []v1.Pod{
|
||||||
|
*buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod,
|
||||||
|
},
|
||||||
|
wantStatus: batch.JobStatus{
|
||||||
|
Active: 2,
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
FailedIndexes: pointer.String(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"single failed pod replaced already": {
|
||||||
|
enableJobBackoffLimitPerIndex: true,
|
||||||
|
job: batch.Job{
|
||||||
|
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||||
|
ObjectMeta: validObjectMeta,
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
Selector: validSelector,
|
||||||
|
Template: validTemplate,
|
||||||
|
Parallelism: pointer.Int32(2),
|
||||||
|
Completions: pointer.Int32(2),
|
||||||
|
BackoffLimit: pointer.Int32(math.MaxInt32),
|
||||||
|
CompletionMode: completionModePtr(batch.IndexedCompletion),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(1),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []v1.Pod{
|
||||||
|
*buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod,
|
||||||
|
*buildPod().uid("b").index("0").phase(v1.PodPending).indexFailureCount("1").trackingFinalizer().Pod,
|
||||||
|
},
|
||||||
|
wantStatus: batch.JobStatus{
|
||||||
|
Active: 2,
|
||||||
|
Failed: 1,
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
FailedIndexes: pointer.String(""),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"single failed index due to exceeding the backoff limit per index, the job continues": {
|
||||||
|
enableJobBackoffLimitPerIndex: true,
|
||||||
|
job: batch.Job{
|
||||||
|
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||||
|
ObjectMeta: validObjectMeta,
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
Selector: validSelector,
|
||||||
|
Template: validTemplate,
|
||||||
|
Parallelism: pointer.Int32(2),
|
||||||
|
Completions: pointer.Int32(2),
|
||||||
|
BackoffLimit: pointer.Int32(math.MaxInt32),
|
||||||
|
CompletionMode: completionModePtr(batch.IndexedCompletion),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(1),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []v1.Pod{
|
||||||
|
*buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod,
|
||||||
|
},
|
||||||
|
wantStatus: batch.JobStatus{
|
||||||
|
Active: 1,
|
||||||
|
Failed: 1,
|
||||||
|
FailedIndexes: pointer.String("0"),
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"single failed index due to FailIndex action, the job continues": {
|
||||||
|
enableJobBackoffLimitPerIndex: true,
|
||||||
|
enableJobPodFailurePolicy: true,
|
||||||
|
job: batch.Job{
|
||||||
|
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||||
|
ObjectMeta: validObjectMeta,
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
Selector: validSelector,
|
||||||
|
Template: validTemplate,
|
||||||
|
Parallelism: pointer.Int32(2),
|
||||||
|
Completions: pointer.Int32(2),
|
||||||
|
BackoffLimit: pointer.Int32(math.MaxInt32),
|
||||||
|
CompletionMode: completionModePtr(batch.IndexedCompletion),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(1),
|
||||||
|
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||||
|
Rules: []batch.PodFailurePolicyRule{
|
||||||
|
{
|
||||||
|
Action: batch.PodFailurePolicyActionFailIndex,
|
||||||
|
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||||
|
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||||
|
Values: []int32{3},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []v1.Pod{
|
||||||
|
*buildPod().uid("a").index("0").status(v1.PodStatus{
|
||||||
|
Phase: v1.PodFailed,
|
||||||
|
ContainerStatuses: []v1.ContainerStatus{
|
||||||
|
{
|
||||||
|
State: v1.ContainerState{
|
||||||
|
Terminated: &v1.ContainerStateTerminated{
|
||||||
|
ExitCode: 3,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}).indexFailureCount("0").trackingFinalizer().Pod,
|
||||||
|
},
|
||||||
|
wantStatus: batch.JobStatus{
|
||||||
|
Active: 1,
|
||||||
|
Failed: 1,
|
||||||
|
FailedIndexes: pointer.String("0"),
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"job failed index due to FailJob action": {
|
||||||
|
enableJobBackoffLimitPerIndex: true,
|
||||||
|
enableJobPodFailurePolicy: true,
|
||||||
|
job: batch.Job{
|
||||||
|
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||||
|
ObjectMeta: validObjectMeta,
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
Selector: validSelector,
|
||||||
|
Template: validTemplate,
|
||||||
|
Parallelism: pointer.Int32(2),
|
||||||
|
Completions: pointer.Int32(2),
|
||||||
|
BackoffLimit: pointer.Int32(6),
|
||||||
|
CompletionMode: completionModePtr(batch.IndexedCompletion),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(1),
|
||||||
|
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||||
|
Rules: []batch.PodFailurePolicyRule{
|
||||||
|
{
|
||||||
|
Action: batch.PodFailurePolicyActionFailJob,
|
||||||
|
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||||
|
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||||
|
Values: []int32{3},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []v1.Pod{
|
||||||
|
*buildPod().uid("a").index("0").status(v1.PodStatus{
|
||||||
|
Phase: v1.PodFailed,
|
||||||
|
ContainerStatuses: []v1.ContainerStatus{
|
||||||
|
{
|
||||||
|
Name: "x",
|
||||||
|
State: v1.ContainerState{
|
||||||
|
Terminated: &v1.ContainerStateTerminated{
|
||||||
|
ExitCode: 3,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}).indexFailureCount("0").trackingFinalizer().Pod,
|
||||||
|
},
|
||||||
|
wantStatus: batch.JobStatus{
|
||||||
|
Active: 0,
|
||||||
|
Failed: 1,
|
||||||
|
FailedIndexes: pointer.String(""),
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
Conditions: []batch.JobCondition{
|
||||||
|
{
|
||||||
|
Type: batch.JobFailureTarget,
|
||||||
|
Status: v1.ConditionTrue,
|
||||||
|
Reason: "PodFailurePolicy",
|
||||||
|
Message: "Container x for pod default/mypod-0 failed with exit code 3 matching FailJob rule at index 0",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Type: batch.JobFailed,
|
||||||
|
Status: v1.ConditionTrue,
|
||||||
|
Reason: "PodFailurePolicy",
|
||||||
|
Message: "Container x for pod default/mypod-0 failed with exit code 3 matching FailJob rule at index 0",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"job pod failure ignored due to matching Ignore action": {
|
||||||
|
enableJobBackoffLimitPerIndex: true,
|
||||||
|
enableJobPodFailurePolicy: true,
|
||||||
|
job: batch.Job{
|
||||||
|
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||||
|
ObjectMeta: validObjectMeta,
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
Selector: validSelector,
|
||||||
|
Template: validTemplate,
|
||||||
|
Parallelism: pointer.Int32(2),
|
||||||
|
Completions: pointer.Int32(2),
|
||||||
|
BackoffLimit: pointer.Int32(6),
|
||||||
|
CompletionMode: completionModePtr(batch.IndexedCompletion),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(1),
|
||||||
|
PodFailurePolicy: &batch.PodFailurePolicy{
|
||||||
|
Rules: []batch.PodFailurePolicyRule{
|
||||||
|
{
|
||||||
|
Action: batch.PodFailurePolicyActionIgnore,
|
||||||
|
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||||
|
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||||
|
Values: []int32{3},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []v1.Pod{
|
||||||
|
*buildPod().uid("a").index("0").status(v1.PodStatus{
|
||||||
|
Phase: v1.PodFailed,
|
||||||
|
ContainerStatuses: []v1.ContainerStatus{
|
||||||
|
{
|
||||||
|
Name: "x",
|
||||||
|
State: v1.ContainerState{
|
||||||
|
Terminated: &v1.ContainerStateTerminated{
|
||||||
|
ExitCode: 3,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}).indexFailureCount("0").trackingFinalizer().Pod,
|
||||||
|
},
|
||||||
|
wantStatus: batch.JobStatus{
|
||||||
|
Active: 2,
|
||||||
|
Failed: 0,
|
||||||
|
FailedIndexes: pointer.String(""),
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"job failed due to exceeding backoffLimit before backoffLimitPerIndex": {
|
||||||
|
enableJobBackoffLimitPerIndex: true,
|
||||||
|
job: batch.Job{
|
||||||
|
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||||
|
ObjectMeta: validObjectMeta,
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
Selector: validSelector,
|
||||||
|
Template: validTemplate,
|
||||||
|
Parallelism: pointer.Int32(2),
|
||||||
|
Completions: pointer.Int32(2),
|
||||||
|
BackoffLimit: pointer.Int32(1),
|
||||||
|
CompletionMode: completionModePtr(batch.IndexedCompletion),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(1),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []v1.Pod{
|
||||||
|
*buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod,
|
||||||
|
*buildPod().uid("b").index("1").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod,
|
||||||
|
},
|
||||||
|
wantStatus: batch.JobStatus{
|
||||||
|
Failed: 2,
|
||||||
|
Succeeded: 0,
|
||||||
|
FailedIndexes: pointer.String(""),
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
Conditions: []batch.JobCondition{
|
||||||
|
{
|
||||||
|
Type: batch.JobFailed,
|
||||||
|
Status: v1.ConditionTrue,
|
||||||
|
Reason: "BackoffLimitExceeded",
|
||||||
|
Message: "Job has reached the specified backoff limit",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"job failed due to failed indexes": {
|
||||||
|
enableJobBackoffLimitPerIndex: true,
|
||||||
|
job: batch.Job{
|
||||||
|
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||||
|
ObjectMeta: validObjectMeta,
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
Selector: validSelector,
|
||||||
|
Template: validTemplate,
|
||||||
|
Parallelism: pointer.Int32(2),
|
||||||
|
Completions: pointer.Int32(2),
|
||||||
|
BackoffLimit: pointer.Int32(math.MaxInt32),
|
||||||
|
CompletionMode: completionModePtr(batch.IndexedCompletion),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(1),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []v1.Pod{
|
||||||
|
*buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod,
|
||||||
|
*buildPod().uid("b").index("1").phase(v1.PodSucceeded).indexFailureCount("0").trackingFinalizer().Pod,
|
||||||
|
},
|
||||||
|
wantStatus: batch.JobStatus{
|
||||||
|
Failed: 1,
|
||||||
|
Succeeded: 1,
|
||||||
|
FailedIndexes: pointer.String("0"),
|
||||||
|
CompletedIndexes: "1",
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
Conditions: []batch.JobCondition{
|
||||||
|
{
|
||||||
|
Type: batch.JobFailed,
|
||||||
|
Status: v1.ConditionTrue,
|
||||||
|
Reason: "FailedIndexes",
|
||||||
|
Message: "Job has failed indexes",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"job failed due to exceeding max failed indexes": {
|
||||||
|
enableJobBackoffLimitPerIndex: true,
|
||||||
|
job: batch.Job{
|
||||||
|
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||||
|
ObjectMeta: validObjectMeta,
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
Selector: validSelector,
|
||||||
|
Template: validTemplate,
|
||||||
|
Parallelism: pointer.Int32(4),
|
||||||
|
Completions: pointer.Int32(4),
|
||||||
|
BackoffLimit: pointer.Int32(math.MaxInt32),
|
||||||
|
CompletionMode: completionModePtr(batch.IndexedCompletion),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(1),
|
||||||
|
MaxFailedIndexes: pointer.Int32(1),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []v1.Pod{
|
||||||
|
*buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod,
|
||||||
|
*buildPod().uid("b").index("1").phase(v1.PodSucceeded).indexFailureCount("0").trackingFinalizer().Pod,
|
||||||
|
*buildPod().uid("c").index("2").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod,
|
||||||
|
*buildPod().uid("d").index("3").phase(v1.PodRunning).indexFailureCount("0").trackingFinalizer().Pod,
|
||||||
|
},
|
||||||
|
wantStatus: batch.JobStatus{
|
||||||
|
Failed: 3,
|
||||||
|
Succeeded: 1,
|
||||||
|
FailedIndexes: pointer.String("0,2"),
|
||||||
|
CompletedIndexes: "1",
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
Conditions: []batch.JobCondition{
|
||||||
|
{
|
||||||
|
Type: batch.JobFailed,
|
||||||
|
Status: v1.ConditionTrue,
|
||||||
|
Reason: "MaxFailedIndexesExceeded",
|
||||||
|
Message: "Job has exceeded the specified maximal number of failed indexes",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"job with finished indexes; failedIndexes are cleaned when JobBackoffLimitPerIndex disabled": {
|
||||||
|
enableJobBackoffLimitPerIndex: false,
|
||||||
|
job: batch.Job{
|
||||||
|
TypeMeta: metav1.TypeMeta{Kind: "Job"},
|
||||||
|
ObjectMeta: validObjectMeta,
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
Selector: validSelector,
|
||||||
|
Template: validTemplate,
|
||||||
|
Parallelism: pointer.Int32(3),
|
||||||
|
Completions: pointer.Int32(3),
|
||||||
|
BackoffLimit: pointer.Int32(math.MaxInt32),
|
||||||
|
CompletionMode: completionModePtr(batch.IndexedCompletion),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(1),
|
||||||
|
},
|
||||||
|
Status: batch.JobStatus{
|
||||||
|
FailedIndexes: pointer.String("0"),
|
||||||
|
CompletedIndexes: "1",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []v1.Pod{
|
||||||
|
*buildPod().uid("c").index("2").phase(v1.PodPending).indexFailureCount("1").trackingFinalizer().Pod,
|
||||||
|
},
|
||||||
|
wantStatus: batch.JobStatus{
|
||||||
|
Active: 2,
|
||||||
|
Succeeded: 1,
|
||||||
|
CompletedIndexes: "1",
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
for name, tc := range testCases {
|
||||||
|
t.Run(name, func(t *testing.T) {
|
||||||
|
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)()
|
||||||
|
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)()
|
||||||
|
clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
|
||||||
|
fakeClock := clocktesting.NewFakeClock(now)
|
||||||
|
manager, sharedInformerFactory := newControllerFromClientWithClock(ctx, clientset, controller.NoResyncPeriodFunc, fakeClock)
|
||||||
|
fakePodControl := controller.FakePodControl{}
|
||||||
|
manager.podControl = &fakePodControl
|
||||||
|
manager.podStoreSynced = alwaysReady
|
||||||
|
manager.jobStoreSynced = alwaysReady
|
||||||
|
job := &tc.job
|
||||||
|
|
||||||
|
actual := job
|
||||||
|
manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) {
|
||||||
|
actual = job
|
||||||
|
return job, nil
|
||||||
|
}
|
||||||
|
sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job)
|
||||||
|
for i, pod := range tc.pods {
|
||||||
|
pod := pod
|
||||||
|
pb := podBuilder{Pod: &pod}.name(fmt.Sprintf("mypod-%d", i)).job(job)
|
||||||
|
if job.Spec.CompletionMode != nil && *job.Spec.CompletionMode == batch.IndexedCompletion {
|
||||||
|
pb.index(fmt.Sprintf("%v", getCompletionIndex(pod.Annotations)))
|
||||||
|
}
|
||||||
|
pb = pb.trackingFinalizer()
|
||||||
|
sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer().Add(pb.Pod)
|
||||||
|
}
|
||||||
|
|
||||||
|
manager.syncJob(context.TODO(), testutil.GetKey(job, t))
|
||||||
|
|
||||||
|
// validate relevant fields of the status
|
||||||
|
if diff := cmp.Diff(tc.wantStatus, actual.Status,
|
||||||
|
cmpopts.IgnoreFields(batch.JobStatus{}, "StartTime", "CompletionTime", "Ready"),
|
||||||
|
cmpopts.IgnoreFields(batch.JobCondition{}, "LastProbeTime", "LastTransitionTime")); diff != "" {
|
||||||
|
t.Errorf("unexpected job status. Diff: %s\n", diff)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestSyncJobUpdateRequeue(t *testing.T) {
|
func TestSyncJobUpdateRequeue(t *testing.T) {
|
||||||
_, ctx := ktesting.NewTestContext(t)
|
_, ctx := ktesting.NewTestContext(t)
|
||||||
clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
|
clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
|
||||||
@ -3217,6 +3783,69 @@ func TestUpdateJobRequeue(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestGetPodCreationInfoForIndependentIndexes(t *testing.T) {
|
||||||
|
logger, ctx := ktesting.NewTestContext(t)
|
||||||
|
now := time.Now()
|
||||||
|
clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
|
||||||
|
cases := map[string]struct {
|
||||||
|
indexesToAdd []int
|
||||||
|
podsWithDelayedDeletionPerIndex map[int]*v1.Pod
|
||||||
|
wantIndexesToAdd []int
|
||||||
|
wantRemainingTime time.Duration
|
||||||
|
}{
|
||||||
|
"simple index creation": {
|
||||||
|
indexesToAdd: []int{1, 3},
|
||||||
|
wantIndexesToAdd: []int{1, 3},
|
||||||
|
},
|
||||||
|
"subset of indexes can be recreated now": {
|
||||||
|
indexesToAdd: []int{1, 3},
|
||||||
|
podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{
|
||||||
|
1: buildPod().indexFailureCount("0").index("1").customDeletionTimestamp(now).Pod,
|
||||||
|
},
|
||||||
|
wantIndexesToAdd: []int{3},
|
||||||
|
},
|
||||||
|
"subset of indexes can be recreated now as the pods failed long time ago": {
|
||||||
|
indexesToAdd: []int{1, 3},
|
||||||
|
podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{
|
||||||
|
1: buildPod().indexFailureCount("0").customDeletionTimestamp(now).Pod,
|
||||||
|
3: buildPod().indexFailureCount("0").customDeletionTimestamp(now.Add(-DefaultJobPodFailureBackOff)).Pod,
|
||||||
|
},
|
||||||
|
wantIndexesToAdd: []int{3},
|
||||||
|
},
|
||||||
|
"no indexes can be recreated now, need to wait default pod failure backoff": {
|
||||||
|
indexesToAdd: []int{1, 2, 3},
|
||||||
|
podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{
|
||||||
|
1: buildPod().indexFailureCount("1").customDeletionTimestamp(now).Pod,
|
||||||
|
2: buildPod().indexFailureCount("0").customDeletionTimestamp(now).Pod,
|
||||||
|
3: buildPod().indexFailureCount("2").customDeletionTimestamp(now).Pod,
|
||||||
|
},
|
||||||
|
wantRemainingTime: DefaultJobPodFailureBackOff,
|
||||||
|
},
|
||||||
|
"no indexes can be recreated now, need to wait but 1s already passed": {
|
||||||
|
indexesToAdd: []int{1, 2, 3},
|
||||||
|
podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{
|
||||||
|
1: buildPod().indexFailureCount("1").customDeletionTimestamp(now.Add(-time.Second)).Pod,
|
||||||
|
2: buildPod().indexFailureCount("0").customDeletionTimestamp(now.Add(-time.Second)).Pod,
|
||||||
|
3: buildPod().indexFailureCount("2").customDeletionTimestamp(now.Add(-time.Second)).Pod,
|
||||||
|
},
|
||||||
|
wantRemainingTime: DefaultJobPodFailureBackOff - time.Second,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
for name, tc := range cases {
|
||||||
|
t.Run(name, func(t *testing.T) {
|
||||||
|
fakeClock := clocktesting.NewFakeClock(now)
|
||||||
|
manager, _ := newControllerFromClientWithClock(ctx, clientset, controller.NoResyncPeriodFunc, fakeClock)
|
||||||
|
gotIndexesToAdd, gotRemainingTime := manager.getPodCreationInfoForIndependentIndexes(logger, tc.indexesToAdd, tc.podsWithDelayedDeletionPerIndex)
|
||||||
|
if diff := cmp.Diff(tc.wantIndexesToAdd, gotIndexesToAdd); diff != "" {
|
||||||
|
t.Fatalf("Unexpected indexes to add: %s", diff)
|
||||||
|
}
|
||||||
|
if diff := cmp.Diff(tc.wantRemainingTime, gotRemainingTime); diff != "" {
|
||||||
|
t.Fatalf("Unexpected remaining time: %s", diff)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestJobPodLookup(t *testing.T) {
|
func TestJobPodLookup(t *testing.T) {
|
||||||
_, ctx := ktesting.NewTestContext(t)
|
_, ctx := ktesting.NewTestContext(t)
|
||||||
clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
|
clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
|
||||||
@ -4541,10 +5170,27 @@ func (pb podBuilder) clearLabels() podBuilder {
}

func (pb podBuilder) index(ix string) podBuilder {
return pb.annotation(batch.JobCompletionIndexAnnotation, ix)
}

func (pb podBuilder) indexFailureCount(count string) podBuilder {
return pb.annotation(batch.JobIndexFailureCountAnnotation, count)
}

func (pb podBuilder) indexIgnoredFailureCount(count string) podBuilder {
return pb.annotation(batch.JobIndexIgnoredFailureCountAnnotation, count)
}

func (pb podBuilder) annotation(key, value string) podBuilder {
if pb.Annotations == nil {
pb.Annotations = make(map[string]string)
}
pb.Annotations[batch.JobCompletionIndexAnnotation] = ix
pb.Annotations[key] = value
return pb
}

func (pb podBuilder) status(s v1.PodStatus) podBuilder {
pb.Status = s
return pb
}
@ -4568,6 +5214,15 @@ func (pb podBuilder) deletionTimestamp() podBuilder {
return pb
}

func (pb podBuilder) customDeletionTimestamp(t time.Time) podBuilder {
pb.DeletionTimestamp = &metav1.Time{Time: t}
return pb
}

func completionModePtr(m batch.CompletionMode) *batch.CompletionMode {
return &m
}

func setDurationDuringTest(val *time.Duration, newVal time.Duration) func() {
origVal := *val
*val = newVal
@ -21,20 +21,24 @@ import (

batch "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apiserver/pkg/util/feature"
"k8s.io/kubernetes/pkg/features"
)

// matchPodFailurePolicy returns information about matching a given failed pod
// against the pod failure policy rules. The information is represented as an
// optional job failure message (present in case the pod matched a 'FailJob'
// rule), a boolean indicating if the failure should be counted towards
// backoffLimit (it should not be counted if the pod matched an 'Ignore' rule),
// and a pointer to the matched pod failure policy action.
// matchPodFailurePolicy returns information about matching a given failed pod
// against the pod failure policy rules. The information is represented as an
// - optional job failure message (present in case the pod matched a 'FailJob' rule),
// - a boolean indicating if the failure should be counted towards backoffLimit
// (and backoffLimitPerIndex if specified). It should not be counted
// if the pod matched an 'Ignore' rule,
// - a pointer to the matched pod failure policy action.
func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod *v1.Pod) (*string, bool, *batch.PodFailurePolicyAction) {
if podFailurePolicy == nil {
return nil, true, nil
}
ignore := batch.PodFailurePolicyActionIgnore
failJob := batch.PodFailurePolicyActionFailJob
failIndex := batch.PodFailurePolicyActionFailIndex
count := batch.PodFailurePolicyActionCount
for index, podFailurePolicyRule := range podFailurePolicy.Rules {
if podFailurePolicyRule.OnExitCodes != nil {
@ -42,6 +46,10 @@ func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod *
switch podFailurePolicyRule.Action {
case batch.PodFailurePolicyActionIgnore:
return nil, false, &ignore
case batch.PodFailurePolicyActionFailIndex:
if feature.DefaultFeatureGate.Enabled(features.JobBackoffLimitPerIndex) {
return nil, true, &failIndex
}
case batch.PodFailurePolicyActionCount:
return nil, true, &count
case batch.PodFailurePolicyActionFailJob:
@ -55,6 +63,10 @@ func matchPodFailurePolicy(podFailurePolicy *batch.PodFailurePolicy, failedPod *
switch podFailurePolicyRule.Action {
case batch.PodFailurePolicyActionIgnore:
return nil, false, &ignore
case batch.PodFailurePolicyActionFailIndex:
if feature.DefaultFeatureGate.Enabled(features.JobBackoffLimitPerIndex) {
return nil, true, &failIndex
}
case batch.PodFailurePolicyActionCount:
return nil, true, &count
case batch.PodFailurePolicyActionFailJob:
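As a usage illustration (not part of this change), a Job author opting into per-index failure handling could attach a FailIndex rule like the sketch below. The exit code 42 and the surrounding program are made-up example values; as the switch cases above show, the rule only takes effect while the JobBackoffLimitPerIndex feature gate is enabled.

package main

import (
	"fmt"

	batch "k8s.io/api/batch/v1"
)

func main() {
	// FailIndex: when a container exits with one of the listed codes, only the
	// pod's completion index is marked as failed (the failure is still counted).
	policy := batch.PodFailurePolicy{
		Rules: []batch.PodFailurePolicyRule{
			{
				Action: batch.PodFailurePolicyActionFailIndex,
				OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
					Operator: batch.PodFailurePolicyOnExitCodesOpIn,
					Values:   []int32{42},
				},
			},
		},
	}
	fmt.Println(policy.Rules[0].Action)
}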
|
@ -23,7 +23,10 @@ import (
|
|||||||
batch "k8s.io/api/batch/v1"
|
batch "k8s.io/api/batch/v1"
|
||||||
v1 "k8s.io/api/core/v1"
|
v1 "k8s.io/api/core/v1"
|
||||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||||
|
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
||||||
|
featuregatetesting "k8s.io/component-base/featuregate/testing"
|
||||||
_ "k8s.io/kubernetes/pkg/apis/core/install"
|
_ "k8s.io/kubernetes/pkg/apis/core/install"
|
||||||
|
"k8s.io/kubernetes/pkg/features"
|
||||||
"k8s.io/utils/pointer"
|
"k8s.io/utils/pointer"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -34,14 +37,16 @@ func TestMatchPodFailurePolicy(t *testing.T) {
|
|||||||
}
|
}
|
||||||
ignore := batch.PodFailurePolicyActionIgnore
|
ignore := batch.PodFailurePolicyActionIgnore
|
||||||
failJob := batch.PodFailurePolicyActionFailJob
|
failJob := batch.PodFailurePolicyActionFailJob
|
||||||
|
failIndex := batch.PodFailurePolicyActionFailIndex
|
||||||
count := batch.PodFailurePolicyActionCount
|
count := batch.PodFailurePolicyActionCount
|
||||||
|
|
||||||
testCases := map[string]struct {
|
testCases := map[string]struct {
|
||||||
podFailurePolicy *batch.PodFailurePolicy
|
enableJobBackoffLimitPerIndex bool
|
||||||
failedPod *v1.Pod
|
podFailurePolicy *batch.PodFailurePolicy
|
||||||
wantJobFailureMessage *string
|
failedPod *v1.Pod
|
||||||
wantCountFailed bool
|
wantJobFailureMessage *string
|
||||||
wantAction *batch.PodFailurePolicyAction
|
wantCountFailed bool
|
||||||
|
wantAction *batch.PodFailurePolicyAction
|
||||||
}{
|
}{
|
||||||
"unknown action for rule matching by exit codes - skip rule with unknown action": {
|
"unknown action for rule matching by exit codes - skip rule with unknown action": {
|
||||||
podFailurePolicy: &batch.PodFailurePolicy{
|
podFailurePolicy: &batch.PodFailurePolicy{
|
||||||
@ -292,6 +297,68 @@ func TestMatchPodFailurePolicy(t *testing.T) {
|
|||||||
wantJobFailureMessage: nil,
|
wantJobFailureMessage: nil,
|
||||||
wantCountFailed: true,
|
wantCountFailed: true,
|
||||||
},
|
},
|
||||||
|
"FailIndex rule matched for exit codes; JobBackoffLimitPerIndex enabled": {
|
||||||
|
enableJobBackoffLimitPerIndex: true,
|
||||||
|
podFailurePolicy: &batch.PodFailurePolicy{
|
||||||
|
Rules: []batch.PodFailurePolicyRule{
|
||||||
|
{
|
||||||
|
Action: batch.PodFailurePolicyActionFailIndex,
|
||||||
|
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||||
|
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||||
|
Values: []int32{1, 2, 3},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
failedPod: &v1.Pod{
|
||||||
|
ObjectMeta: validPodObjectMeta,
|
||||||
|
Status: v1.PodStatus{
|
||||||
|
Phase: v1.PodFailed,
|
||||||
|
ContainerStatuses: []v1.ContainerStatus{
|
||||||
|
{
|
||||||
|
State: v1.ContainerState{
|
||||||
|
Terminated: &v1.ContainerStateTerminated{
|
||||||
|
ExitCode: 2,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantCountFailed: true,
|
||||||
|
wantAction: &failIndex,
|
||||||
|
},
|
||||||
|
"FailIndex rule matched for exit codes; JobBackoffLimitPerIndex disabled": {
|
||||||
|
enableJobBackoffLimitPerIndex: false,
|
||||||
|
podFailurePolicy: &batch.PodFailurePolicy{
|
||||||
|
Rules: []batch.PodFailurePolicyRule{
|
||||||
|
{
|
||||||
|
Action: batch.PodFailurePolicyActionFailIndex,
|
||||||
|
OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{
|
||||||
|
Operator: batch.PodFailurePolicyOnExitCodesOpIn,
|
||||||
|
Values: []int32{1, 2, 3},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
failedPod: &v1.Pod{
|
||||||
|
ObjectMeta: validPodObjectMeta,
|
||||||
|
Status: v1.PodStatus{
|
||||||
|
Phase: v1.PodFailed,
|
||||||
|
ContainerStatuses: []v1.ContainerStatus{
|
||||||
|
{
|
||||||
|
State: v1.ContainerState{
|
||||||
|
Terminated: &v1.ContainerStateTerminated{
|
||||||
|
ExitCode: 2,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantCountFailed: true,
|
||||||
|
wantAction: nil,
|
||||||
|
},
|
||||||
"pod failure policy with NotIn operator and value 0": {
|
"pod failure policy with NotIn operator and value 0": {
|
||||||
podFailurePolicy: &batch.PodFailurePolicy{
|
podFailurePolicy: &batch.PodFailurePolicy{
|
||||||
Rules: []batch.PodFailurePolicyRule{
|
Rules: []batch.PodFailurePolicyRule{
|
||||||
@ -406,6 +473,66 @@ func TestMatchPodFailurePolicy(t *testing.T) {
|
|||||||
wantCountFailed: true,
|
wantCountFailed: true,
|
||||||
wantAction: &count,
|
wantAction: &count,
|
||||||
},
|
},
|
||||||
|
"FailIndex rule matched for pod conditions; JobBackoffLimitPerIndex enabled": {
|
||||||
|
enableJobBackoffLimitPerIndex: true,
|
||||||
|
podFailurePolicy: &batch.PodFailurePolicy{
|
||||||
|
Rules: []batch.PodFailurePolicyRule{
|
||||||
|
{
|
||||||
|
Action: batch.PodFailurePolicyActionFailIndex,
|
||||||
|
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
|
||||||
|
{
|
||||||
|
Type: v1.DisruptionTarget,
|
||||||
|
Status: v1.ConditionTrue,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
failedPod: &v1.Pod{
|
||||||
|
ObjectMeta: validPodObjectMeta,
|
||||||
|
Status: v1.PodStatus{
|
||||||
|
Phase: v1.PodFailed,
|
||||||
|
Conditions: []v1.PodCondition{
|
||||||
|
{
|
||||||
|
Type: v1.DisruptionTarget,
|
||||||
|
Status: v1.ConditionTrue,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantCountFailed: true,
|
||||||
|
wantAction: &failIndex,
|
||||||
|
},
|
||||||
|
"FailIndex rule matched for pod conditions; JobBackoffLimitPerIndex disabled": {
|
||||||
|
enableJobBackoffLimitPerIndex: false,
|
||||||
|
podFailurePolicy: &batch.PodFailurePolicy{
|
||||||
|
Rules: []batch.PodFailurePolicyRule{
|
||||||
|
{
|
||||||
|
Action: batch.PodFailurePolicyActionFailIndex,
|
||||||
|
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
|
||||||
|
{
|
||||||
|
Type: v1.DisruptionTarget,
|
||||||
|
Status: v1.ConditionTrue,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
failedPod: &v1.Pod{
|
||||||
|
ObjectMeta: validPodObjectMeta,
|
||||||
|
Status: v1.PodStatus{
|
||||||
|
Phase: v1.PodFailed,
|
||||||
|
Conditions: []v1.PodCondition{
|
||||||
|
{
|
||||||
|
Type: v1.DisruptionTarget,
|
||||||
|
Status: v1.ConditionTrue,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantCountFailed: true,
|
||||||
|
wantAction: nil,
|
||||||
|
},
|
||||||
"ignore rule matched for pod conditions": {
|
"ignore rule matched for pod conditions": {
|
||||||
podFailurePolicy: &batch.PodFailurePolicy{
|
podFailurePolicy: &batch.PodFailurePolicy{
|
||||||
Rules: []batch.PodFailurePolicyRule{
|
Rules: []batch.PodFailurePolicyRule{
|
||||||
@ -709,6 +836,7 @@ func TestMatchPodFailurePolicy(t *testing.T) {
|
|||||||
}
|
}
|
||||||
for name, tc := range testCases {
|
for name, tc := range testCases {
|
||||||
t.Run(name, func(t *testing.T) {
|
t.Run(name, func(t *testing.T) {
|
||||||
|
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)()
|
||||||
jobFailMessage, countFailed, action := matchPodFailurePolicy(tc.podFailurePolicy, tc.failedPod)
|
jobFailMessage, countFailed, action := matchPodFailurePolicy(tc.podFailurePolicy, tc.failedPod)
|
||||||
if diff := cmp.Diff(tc.wantJobFailureMessage, jobFailMessage); diff != "" {
|
if diff := cmp.Diff(tc.wantJobFailureMessage, jobFailMessage); diff != "" {
|
||||||
t.Errorf("Unexpected job failure message: %s", diff)
|
t.Errorf("Unexpected job failure message: %s", diff)
|
||||||
|
@ -682,6 +682,633 @@ func TestJobPodFailurePolicy(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestBackoffLimitPerIndex_DelayedPodDeletion tests that the pod deletion is delayed
// until the replacement pod is created, so that the replacement pod has the
// index-failure-count annotation bumped, when BackoffLimitPerIndex is used.
// A simplified sketch of the annotation bump follows the test body below.
func TestBackoffLimitPerIndex_DelayedPodDeletion(t *testing.T) {
|
||||||
|
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
|
||||||
|
|
||||||
|
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)()
|
||||||
|
closeFn, restConfig, clientSet, ns := setup(t, "backoff-limit-per-index-failed")
|
||||||
|
defer closeFn()
|
||||||
|
ctx, cancel := startJobControllerAndWaitForCaches(restConfig)
|
||||||
|
defer func() {
|
||||||
|
cancel()
|
||||||
|
}()
|
||||||
|
|
||||||
|
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
|
||||||
|
Spec: batchv1.JobSpec{
|
||||||
|
Parallelism: pointer.Int32(1),
|
||||||
|
Completions: pointer.Int32(1),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(1),
|
||||||
|
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to create Job: %v", err)
|
||||||
|
}
|
||||||
|
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
|
||||||
|
Active: 1,
|
||||||
|
Ready: pointer.Int32(0),
|
||||||
|
})
|
||||||
|
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0), "", pointer.String(""))
|
||||||
|
|
||||||
|
// First pod from index 0 failed.
|
||||||
|
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil {
|
||||||
|
t.Fatal("Failed trying to fail pod with index 0")
|
||||||
|
}
|
||||||
|
// Delete the failed pod
|
||||||
|
pod, err := getJobPodForIndex(ctx, clientSet, jobObj, 0, func(_ *v1.Pod) bool { return true })
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("failed to get terminal pod for index: %v", 0)
|
||||||
|
}
|
||||||
|
if err := clientSet.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil {
|
||||||
|
t.Fatalf("failed to delete pod: %v, error: %v", klog.KObj(pod), err)
|
||||||
|
}
|
||||||
|
|
||||||
|
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
|
||||||
|
Active: 1,
|
||||||
|
Failed: 1,
|
||||||
|
Ready: pointer.Int32(0),
|
||||||
|
})
|
||||||
|
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0), "", pointer.String(""))
|
||||||
|
|
||||||
|
// Verify the replacement pod is created and has the index-failure-count
|
||||||
|
// annotation bumped.
|
||||||
|
replacement, err := getActivePodForIndex(ctx, clientSet, jobObj, 0)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to get active replacement pod for index: %v, error: %v", 0, err)
|
||||||
|
}
|
||||||
|
gotIndexFailureCount, err := getIndexFailureCount(replacement)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed read the index failure count annotation for pod: %v, error: %v", klog.KObj(replacement), err)
|
||||||
|
}
|
||||||
|
if diff := cmp.Diff(1, gotIndexFailureCount); diff != "" {
|
||||||
|
t.Errorf("Unexpected index failure count for the replacement pod: %s", diff)
|
||||||
|
}
|
||||||
|
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 0); err != nil {
|
||||||
|
t.Fatal("Failed trying to fail pod with index 0")
|
||||||
|
}
|
||||||
|
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
|
||||||
|
Active: 0,
|
||||||
|
Succeeded: 1,
|
||||||
|
Failed: 1,
|
||||||
|
Ready: pointer.Int32(0),
|
||||||
|
})
|
||||||
|
validateJobSucceeded(ctx, t, clientSet, jobObj)
|
||||||
|
}
|
||||||
|
|
||||||
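The annotation handling that the test above exercises can be shown with a small, self-contained sketch. bumpIndexFailureCount is a hypothetical helper written only for this illustration (the controller uses its own addIndexFailureCountAnnotation); it assumes the JobIndexFailureCountAnnotation constant from k8s.io/api/batch/v1.

package main

import (
	"fmt"
	"strconv"

	batchv1 "k8s.io/api/batch/v1"
	v1 "k8s.io/api/core/v1"
)

// bumpIndexFailureCount copies the failure count from the last failed pod of an
// index into the replacement pod's annotation, incremented by one.
func bumpIndexFailureCount(lastFailed, replacement *v1.Pod) {
	count := 0
	if lastFailed != nil {
		if v, err := strconv.Atoi(lastFailed.Annotations[batchv1.JobIndexFailureCountAnnotation]); err == nil {
			count = v
		}
	}
	if replacement.Annotations == nil {
		replacement.Annotations = map[string]string{}
	}
	replacement.Annotations[batchv1.JobIndexFailureCountAnnotation] = strconv.Itoa(count + 1)
}

func main() {
	prev := &v1.Pod{}
	prev.Annotations = map[string]string{batchv1.JobIndexFailureCountAnnotation: "0"}
	next := &v1.Pod{}
	bumpIndexFailureCount(prev, next)
	fmt.Println(next.Annotations[batchv1.JobIndexFailureCountAnnotation]) // "1"
}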
|
// TestBackoffLimitPerIndex_Reenabling tests handling of pod failures when
|
||||||
|
// reenabling the BackoffLimitPerIndex feature.
|
||||||
|
func TestBackoffLimitPerIndex_Reenabling(t *testing.T) {
|
||||||
|
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
|
||||||
|
|
||||||
|
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)()
|
||||||
|
closeFn, restConfig, clientSet, ns := setup(t, "backoff-limit-per-index-reenabled")
|
||||||
|
defer closeFn()
|
||||||
|
ctx, cancel := startJobControllerAndWaitForCaches(restConfig)
|
||||||
|
defer cancel()
|
||||||
|
resetMetrics()
|
||||||
|
|
||||||
|
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
|
||||||
|
Spec: batchv1.JobSpec{
|
||||||
|
Parallelism: pointer.Int32(3),
|
||||||
|
Completions: pointer.Int32(3),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(0),
|
||||||
|
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to create Job: %v", err)
|
||||||
|
}
|
||||||
|
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
|
||||||
|
Active: 3,
|
||||||
|
Ready: pointer.Int32(0),
|
||||||
|
})
|
||||||
|
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", pointer.String(""))
|
||||||
|
|
||||||
|
// First pod from index 0 failed
|
||||||
|
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil {
|
||||||
|
t.Fatal("Failed trying to fail pod with index 0")
|
||||||
|
}
|
||||||
|
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
|
||||||
|
Active: 2,
|
||||||
|
Failed: 1,
|
||||||
|
Ready: pointer.Int32(0),
|
||||||
|
})
|
||||||
|
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1, 2), "", pointer.String("0"))
|
||||||
|
|
||||||
|
// Disable the feature
|
||||||
|
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, false)()
|
||||||
|
|
||||||
|
// First pod from index 1 failed
|
||||||
|
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
|
||||||
|
t.Fatal("Failed trying to fail pod with index 1")
|
||||||
|
}
|
||||||
|
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
|
||||||
|
Active: 3,
|
||||||
|
Failed: 2,
|
||||||
|
Ready: pointer.Int32(0),
|
||||||
|
})
|
||||||
|
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", nil)
|
||||||
|
|
||||||
|
// Reenable the feature
|
||||||
|
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)()
|
||||||
|
|
||||||
|
// First pod from index 2 failed
|
||||||
|
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil {
|
||||||
|
t.Fatal("Failed trying to fail pod with index 2")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify the indexes 0 and 1 are active as the failed pods don't have
|
||||||
|
// finalizers at this point, so they are ignored.
|
||||||
|
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
|
||||||
|
Active: 2,
|
||||||
|
Failed: 3,
|
||||||
|
Ready: pointer.Int32(0),
|
||||||
|
})
|
||||||
|
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", pointer.String("2"))
|
||||||
|
|
||||||
|
// mark remaining pods as Succeeded and verify Job status
|
||||||
|
if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 2); err != nil {
|
||||||
|
t.Fatalf("Failed setting phase %q on Job Pod: %q", v1.PodSucceeded, err)
|
||||||
|
}
|
||||||
|
validateJobFailed(ctx, t, clientSet, jobObj)
|
||||||
|
validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestBackoffLimitPerIndex_JobPodsCreatedWithExponentialBackoff tests that the
// pods are recreated with exponential backoff delay computed independently
// per index. Scenario:
// - fail index 0
// - fail index 0
// - fail index 1
// - succeed index 0
// - fail index 1
// - succeed index 1
func TestBackoffLimitPerIndex_JobPodsCreatedWithExponentialBackoff(t *testing.T) {
|
||||||
|
defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)()
|
||||||
|
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, 2*time.Second))
|
||||||
|
|
||||||
|
closeFn, restConfig, clientSet, ns := setup(t, "simple")
|
||||||
|
defer closeFn()
|
||||||
|
ctx, cancel := startJobControllerAndWaitForCaches(restConfig)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
|
||||||
|
Spec: batchv1.JobSpec{
|
||||||
|
Completions: pointer.Int32(2),
|
||||||
|
Parallelism: pointer.Int32(2),
|
||||||
|
BackoffLimitPerIndex: pointer.Int32(2),
|
||||||
|
CompletionMode: completionModePtr(batchv1.IndexedCompletion),
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Could not create job: %v", err)
|
||||||
|
}
|
||||||
|
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
|
||||||
|
Active: 2,
|
||||||
|
Ready: pointer.Int32(0),
|
||||||
|
})
|
||||||
|
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", pointer.String(""))
|
||||||
|
|
||||||
|
// Fail the first pod for index 0
|
||||||
|
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil {
|
||||||
|
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
|
||||||
|
}
|
||||||
|
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
|
||||||
|
Active: 2,
|
||||||
|
Failed: 1,
|
||||||
|
Ready: pointer.Int32(0),
|
||||||
|
})
|
||||||
|
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", pointer.String(""))
|
||||||
|
|
||||||
|
// Fail the second pod for index 0
|
||||||
|
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil {
|
||||||
|
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
|
||||||
|
}
|
||||||
|
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
|
||||||
|
Active: 2,
|
||||||
|
Failed: 2,
|
||||||
|
Ready: pointer.Int32(0),
|
||||||
|
})
|
||||||
|
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", pointer.String(""))
|
||||||
|
|
||||||
|
// Fail the first pod for index 1
|
||||||
|
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
|
||||||
|
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
|
||||||
|
}
|
||||||
|
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
|
||||||
|
Active: 2,
|
||||||
|
Failed: 3,
|
||||||
|
Ready: pointer.Int32(0),
|
||||||
|
})
|
||||||
|
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", pointer.String(""))
|
||||||
|
|
||||||
|
// Succeed the third pod for index 0
|
||||||
|
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 0); err != nil {
|
||||||
|
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err)
|
||||||
|
}
|
||||||
|
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
|
||||||
|
Active: 1,
|
||||||
|
Failed: 3,
|
||||||
|
Succeeded: 1,
|
||||||
|
Ready: pointer.Int32(0),
|
||||||
|
})
|
||||||
|
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1), "0", pointer.String(""))
|
||||||
|
|
||||||
|
// Fail the second pod for index 1
|
||||||
|
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
|
||||||
|
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
|
||||||
|
}
|
||||||
|
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
|
||||||
|
Active: 1,
|
||||||
|
Failed: 4,
|
||||||
|
Succeeded: 1,
|
||||||
|
Ready: pointer.Int32(0),
|
||||||
|
})
|
||||||
|
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1), "0", pointer.String(""))
|
||||||
|
|
||||||
|
// Succeed the third pod for index 1
|
||||||
|
if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
|
||||||
|
t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err)
|
||||||
|
}
|
||||||
|
validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
|
||||||
|
Active: 0,
|
||||||
|
Failed: 4,
|
||||||
|
Succeeded: 2,
|
||||||
|
Ready: pointer.Int32(0),
|
||||||
|
})
|
||||||
|
validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New[int](), "0,1", pointer.String(""))
|
||||||
|
validateJobSucceeded(ctx, t, clientSet, jobObj)
|
||||||
|
|
||||||
|
for index := 0; index < int(*jobObj.Spec.Completions); index++ {
|
||||||
|
podsForIndex, err := getJobPodsForIndex(ctx, clientSet, jobObj, index, func(_ *v1.Pod) bool { return true })
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Failed to list job %q pods for index %v, error: %v", klog.KObj(jobObj), index, err)
|
||||||
|
}
|
||||||
|
validateExpotentialBackoffDelay(t, jobcontroller.DefaultJobPodFailureBackOff, podsForIndex)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
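
// The loop above checks, for every index separately, that replacement pods
// were not created sooner than an exponentially growing delay allows. A rough
// sketch of that expectation (the helper below is hypothetical, for
// illustration only): the minimum delay before the next replacement pod of an
// index doubles with every further consecutive failure of that index, capped
// at a maximum backoff.
func expectedMinDelayForIndex(consecutiveFailures int, defaultBackoff, maxBackoff time.Duration) time.Duration {
	if consecutiveFailures == 0 {
		return 0
	}
	delay := defaultBackoff
	for i := 1; i < consecutiveFailures; i++ {
		delay *= 2 // with the 2s default set above: 2s, 4s, 8s, ...
		if delay >= maxBackoff {
			return maxBackoff
		}
	}
	return delay
}

// With DefaultJobPodFailureBackOff lowered to 2*time.Second in the test above,
// the second replacement pod of an index is expected no earlier than 4s after
// that index's previous failure; validateExpotentialBackoffDelay (added further
// below in this change) checks such lower and upper bounds per index.
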
// TestBackoffLimitPerIndex tests handling of job and its pods when
// backoff limit per index is used.
func TestBackoffLimitPerIndex(t *testing.T) {
	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))

	type podTerminationWithExpectations struct {
		index                          int
		status                         v1.PodStatus
		wantActive                     int
		wantFailed                     int
		wantSucceeded                  int
		wantActiveIndexes              sets.Set[int]
		wantCompletedIndexes           string
		wantFailedIndexes              *string
		wantReplacementPodFailureCount *int
	}

	podTemplateSpec := v1.PodTemplateSpec{
		Spec: v1.PodSpec{
			Containers: []v1.Container{
				{
					Name:                     "main-container",
					Image:                    "foo",
					ImagePullPolicy:          v1.PullIfNotPresent,
					TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
				},
			},
		},
	}
	testCases := map[string]struct {
		job                  batchv1.Job
		podTerminations      []podTerminationWithExpectations
		wantJobConditionType batchv1.JobConditionType
	}{
		"job succeeded": {
			job: batchv1.Job{
				Spec: batchv1.JobSpec{
					Parallelism:          pointer.Int32(2),
					Completions:          pointer.Int32(2),
					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
					BackoffLimitPerIndex: pointer.Int32(1),
					Template:             podTemplateSpec,
				},
			},
			podTerminations: []podTerminationWithExpectations{
				{
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:                     2,
					wantFailed:                     1,
					wantActiveIndexes:              sets.New(0, 1),
					wantFailedIndexes:              pointer.String(""),
					wantReplacementPodFailureCount: pointer.Int(1),
				},
			},
			wantJobConditionType: batchv1.JobComplete,
		},
		"job index fails due to exceeding backoff limit per index": {
			job: batchv1.Job{
				Spec: batchv1.JobSpec{
					Parallelism:          pointer.Int32(2),
					Completions:          pointer.Int32(2),
					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
					BackoffLimitPerIndex: pointer.Int32(2),
					Template:             podTemplateSpec,
				},
			},
			podTerminations: []podTerminationWithExpectations{
				{
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:                     2,
					wantFailed:                     1,
					wantActiveIndexes:              sets.New(0, 1),
					wantFailedIndexes:              pointer.String(""),
					wantReplacementPodFailureCount: pointer.Int(1),
				},
				{
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:                     2,
					wantFailed:                     2,
					wantActiveIndexes:              sets.New(0, 1),
					wantFailedIndexes:              pointer.String(""),
					wantReplacementPodFailureCount: pointer.Int(2),
				},
				{
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:        1,
					wantFailed:        3,
					wantActiveIndexes: sets.New(1),
					wantFailedIndexes: pointer.String("0"),
				},
			},
			wantJobConditionType: batchv1.JobFailed,
		},
		"job index fails due to exceeding the global backoff limit first": {
			job: batchv1.Job{
				Spec: batchv1.JobSpec{
					Parallelism:          pointer.Int32(3),
					Completions:          pointer.Int32(3),
					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
					BackoffLimitPerIndex: pointer.Int32(1),
					BackoffLimit:         pointer.Int32(2),
					Template:             podTemplateSpec,
				},
			},
			podTerminations: []podTerminationWithExpectations{
				{
					index: 0,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:        3,
					wantFailed:        1,
					wantActiveIndexes: sets.New(0, 1, 2),
					wantFailedIndexes: pointer.String(""),
				},
				{
					index: 1,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:        3,
					wantFailed:        2,
					wantActiveIndexes: sets.New(0, 1, 2),
					wantFailedIndexes: pointer.String(""),
				},
				{
					index: 2,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantFailed:        5,
					wantFailedIndexes: pointer.String(""),
				},
			},
			wantJobConditionType: batchv1.JobFailed,
		},
		"job continues execution after a failed index, the job is marked Failed due to the failed index": {
			job: batchv1.Job{
				Spec: batchv1.JobSpec{
					Parallelism:          pointer.Int32(2),
					Completions:          pointer.Int32(2),
					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
					BackoffLimitPerIndex: pointer.Int32(0),
					Template:             podTemplateSpec,
				},
			},
			podTerminations: []podTerminationWithExpectations{
				{
					index: 0,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:        1,
					wantFailed:        1,
					wantActiveIndexes: sets.New(1),
					wantFailedIndexes: pointer.String("0"),
				},
				{
					index: 1,
					status: v1.PodStatus{
						Phase: v1.PodSucceeded,
					},
					wantFailed:           1,
					wantSucceeded:        1,
					wantFailedIndexes:    pointer.String("0"),
					wantCompletedIndexes: "1",
				},
			},
			wantJobConditionType: batchv1.JobFailed,
		},
		"job execution terminated early due to exceeding max failed indexes": {
			job: batchv1.Job{
				Spec: batchv1.JobSpec{
					Parallelism:          pointer.Int32(3),
					Completions:          pointer.Int32(3),
					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
					BackoffLimitPerIndex: pointer.Int32(0),
					MaxFailedIndexes:     pointer.Int32(1),
					Template:             podTemplateSpec,
				},
			},
			podTerminations: []podTerminationWithExpectations{
				{
					index: 0,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:        2,
					wantFailed:        1,
					wantActiveIndexes: sets.New(1, 2),
					wantFailedIndexes: pointer.String("0"),
				},
				{
					index: 1,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:        0,
					wantFailed:        3,
					wantFailedIndexes: pointer.String("0,1"),
				},
			},
			wantJobConditionType: batchv1.JobFailed,
		},
		"pod failure matching pod failure policy rule with FailIndex action": {
			job: batchv1.Job{
				Spec: batchv1.JobSpec{
					Parallelism:          pointer.Int32(2),
					Completions:          pointer.Int32(2),
					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
					BackoffLimitPerIndex: pointer.Int32(1),
					Template:             podTemplateSpec,
					PodFailurePolicy: &batchv1.PodFailurePolicy{
						Rules: []batchv1.PodFailurePolicyRule{
							{
								Action: batchv1.PodFailurePolicyActionFailIndex,
								OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
									Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
									Values:   []int32{13},
								},
							},
							{
								Action: batchv1.PodFailurePolicyActionFailIndex,
								OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
									{
										Type:   v1.DisruptionTarget,
										Status: v1.ConditionTrue,
									},
								},
							},
						},
					},
				},
			},
			podTerminations: []podTerminationWithExpectations{
				{
					index: 0,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
						ContainerStatuses: []v1.ContainerStatus{
							{
								State: v1.ContainerState{
									Terminated: &v1.ContainerStateTerminated{
										ExitCode: 13,
									},
								},
							},
						},
					},
					wantActive:        1,
					wantFailed:        1,
					wantActiveIndexes: sets.New(1),
					wantFailedIndexes: pointer.String("0"),
				},
				{
					index: 1,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
						Conditions: []v1.PodCondition{
							{
								Type:   v1.DisruptionTarget,
								Status: v1.ConditionTrue,
							},
						},
					},
					wantFailed:        2,
					wantFailedIndexes: pointer.String("0,1"),
				},
			},
			wantJobConditionType: batchv1.JobFailed,
		},
	}
	for name, test := range testCases {
		t.Run(name, func(t *testing.T) {
			resetMetrics()
			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, true)()
			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)()

			closeFn, restConfig, clientSet, ns := setup(t, "simple")
			defer closeFn()
			ctx, cancel := startJobControllerAndWaitForCaches(restConfig)
			defer func() {
				cancel()
			}()
			jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &test.job)
			if err != nil {
				t.Fatalf("Error %q while creating the job %q", err, jobObj.Name)
			}
			validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
				Active: int(*test.job.Spec.Parallelism),
				Ready:  pointer.Int32(0),
			})
			for _, podTermination := range test.podTerminations {
				pod, err := getActivePodForIndex(ctx, clientSet, jobObj, podTermination.index)
				if err != nil {
					t.Fatalf("listing Job Pods: %q", err)
				}
				pod.Status = podTermination.status
				if _, err = clientSet.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, pod, metav1.UpdateOptions{}); err != nil {
					t.Fatalf("Error updating the pod %q: %q", klog.KObj(pod), err)
				}
				validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
					Active:    podTermination.wantActive,
					Succeeded: podTermination.wantSucceeded,
					Failed:    podTermination.wantFailed,
					Ready:     pointer.Int32(0),
				})
				validateIndexedJobPods(ctx, t, clientSet, jobObj, podTermination.wantActiveIndexes, podTermination.wantCompletedIndexes, podTermination.wantFailedIndexes)
				if podTermination.wantReplacementPodFailureCount != nil {
					replacement, err := getActivePodForIndex(ctx, clientSet, jobObj, podTermination.index)
					if err != nil {
						t.Fatalf("Failed to get active replacement pod for index: %v, error: %v", podTermination.index, err)
					}
					gotReplacementPodFailureCount, err := getIndexFailureCount(replacement)
					if err != nil {
						t.Fatalf("Failed read the index failure count annotation for pod: %v, error: %v", klog.KObj(replacement), err)
					}
					if *podTermination.wantReplacementPodFailureCount != gotReplacementPodFailureCount {
						t.Fatalf("Unexpected value of the index failure count annotation. Want: %v, got: %v", *podTermination.wantReplacementPodFailureCount, gotReplacementPodFailureCount)
					}
				}
			}

			remainingActive := test.podTerminations[len(test.podTerminations)-1].wantActive
			if remainingActive > 0 {
				if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, remainingActive); err != nil {
					t.Fatalf("Failed setting phase %q on Job Pod: %q", v1.PodSucceeded, err)
				}
			}
			validateJobCondition(ctx, t, clientSet, jobObj, test.wantJobConditionType)
			validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
		})
	}
}
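
// exampleBackoffLimitPerIndexSpec is a hypothetical spec, shown only to gather
// in one place the fields the test cases above exercise (backoffLimitPerIndex,
// maxFailedIndexes, Indexed completion and the FailIndex pod failure policy
// action); none of the test cases uses exactly this combination.
func exampleBackoffLimitPerIndexSpec() batchv1.JobSpec {
	return batchv1.JobSpec{
		Parallelism:          pointer.Int32(3),
		Completions:          pointer.Int32(3),
		CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
		BackoffLimitPerIndex: pointer.Int32(1), // each index may be retried once
		MaxFailedIndexes:     pointer.Int32(1), // a second failed index terminates the Job early
		PodFailurePolicy: &batchv1.PodFailurePolicy{
			Rules: []batchv1.PodFailurePolicyRule{
				{
					// exit code 13 fails the index right away, regardless of remaining retries
					Action: batchv1.PodFailurePolicyActionFailIndex,
					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
						Values:   []int32{13},
					},
				},
			},
		},
	}
}
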
func getIndexFailureCount(p *v1.Pod) (int, error) {
	if p.Annotations == nil {
		return 0, errors.New("no annotations found")
	}
	v, ok := p.Annotations[batchv1.JobIndexFailureCountAnnotation]
	if !ok {
		return 0, fmt.Errorf("annotation %s not found", batchv1.JobIndexFailureCountAnnotation)
	}
	return strconv.Atoi(v)
}

func completionModePtr(cm batchv1.CompletionMode) *batchv1.CompletionMode {
	return &cm
}
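
// A minimal usage sketch for getIndexFailureCount above (the pod literal and
// the function name are illustrative only): replacement pods carry the number
// of failures already counted against their index in the
// JobIndexFailureCountAnnotation, so the second attempt for an index is
// annotated with "1".
func exampleIndexFailureCount() (int, error) {
	p := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Annotations: map[string]string{
				batchv1.JobIndexFailureCountAnnotation: "1",
			},
		},
	}
	return getIndexFailureCount(p) // 1, nil
}
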
// TestNonParallelJob tests that a Job that only executes one Pod. The test
// recreates the Job controller at some points to make sure a new controller
// is able to pickup.
@@ -999,7 +1626,7 @@ func TestIndexedJob(t *testing.T) {
		Active: 3,
		Ready:  pointer.Int32(0),
	})
-	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "")
+	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", nil)

	// One Pod succeeds.
	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
@@ -1010,7 +1637,7 @@ func TestIndexedJob(t *testing.T) {
		Succeeded: 1,
		Ready:     pointer.Int32(0),
	})
-	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1")
+	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil)

	// One Pod fails, which should be recreated.
	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil {
@@ -1022,7 +1649,7 @@ func TestIndexedJob(t *testing.T) {
		Succeeded: 1,
		Ready:     pointer.Int32(0),
	})
-	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1")
+	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil)

	// Remaining Pods succeed.
	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 3); err != nil {
@@ -1034,7 +1661,7 @@ func TestIndexedJob(t *testing.T) {
		Succeeded: 4,
		Ready:     pointer.Int32(0),
	})
-	validateIndexedJobPods(ctx, t, clientSet, jobObj, nil, "0-3")
+	validateIndexedJobPods(ctx, t, clientSet, jobObj, nil, "0-3", nil)
	validateJobSucceeded(ctx, t, clientSet, jobObj)
	validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
	validateTerminatedPodsTrackingFinalizerMetric(t, 5)
@@ -1208,7 +1835,7 @@ func TestElasticIndexedJob(t *testing.T) {
				Failed:    update.wantFailed,
				Ready:     pointer.Int32(0),
			})
-			validateIndexedJobPods(ctx, t, clientSet, jobObj, update.wantRemainingIndexes, update.wantSucceededIndexes)
+			validateIndexedJobPods(ctx, t, clientSet, jobObj, update.wantRemainingIndexes, update.wantSucceededIndexes, nil)
		}

		validateJobSucceeded(ctx, t, clientSet, jobObj)
@@ -1424,10 +2051,14 @@ func TestJobPodsCreatedWithExponentialBackoff(t *testing.T) {
	if len(jobPods) != 3 {
		t.Fatalf("Expected to get %v pods, received %v", 4, len(jobPods))
	}
+	validateExpotentialBackoffDelay(t, jobcontroller.DefaultJobPodFailureBackOff, jobPods)
+}

+func validateExpotentialBackoffDelay(t *testing.T, defaultPodFailureBackoff time.Duration, pods []*v1.Pod) {
+	t.Helper()
	creationTime := []time.Time{}
	finishTime := []time.Time{}
-	for _, pod := range jobPods {
+	for _, pod := range pods {
		creationTime = append(creationTime, pod.CreationTimestamp.Time)
		if len(pod.Status.ContainerStatuses) > 0 {
			finishTime = append(finishTime, pod.Status.ContainerStatuses[0].State.Terminated.FinishedAt.Time)
@@ -1441,25 +2072,24 @@ func TestJobPodsCreatedWithExponentialBackoff(t *testing.T) {
		return finishTime[i].Before(finishTime[j])
	})

-	if creationTime[1].Sub(finishTime[0]).Seconds() < jobcontroller.DefaultJobPodFailureBackOff.Seconds() {
-		t.Fatalf("Second pod should be created at least %v seconds after the first pod", jobcontroller.DefaultJobPodFailureBackOff)
+	diff := creationTime[1].Sub(finishTime[0])
+
+	if diff < defaultPodFailureBackoff {
+		t.Fatalf("Second pod should be created at least %v seconds after the first pod, time difference: %v", defaultPodFailureBackoff, diff)
	}

-	if creationTime[1].Sub(finishTime[0]).Seconds() >= 2*jobcontroller.DefaultJobPodFailureBackOff.Seconds() {
-		t.Fatalf("Second pod should be created before %v seconds after the first pod", 2*jobcontroller.DefaultJobPodFailureBackOff)
+	if diff >= 2*defaultPodFailureBackoff {
+		t.Fatalf("Second pod should be created before %v seconds after the first pod, time difference: %v", 2*defaultPodFailureBackoff, diff)
	}

-	diff := creationTime[2].Sub(finishTime[1]).Seconds()
+	diff = creationTime[2].Sub(finishTime[1])

-	// The third pod should not be created before 4 seconds
-	if diff < 2*jobcontroller.DefaultJobPodFailureBackOff.Seconds() {
-		t.Fatalf("Third pod should be created at least %v seconds after the second pod", 2*jobcontroller.DefaultJobPodFailureBackOff)
+	if diff < 2*defaultPodFailureBackoff {
+		t.Fatalf("Third pod should be created at least %v seconds after the second pod, time difference: %v", 2*defaultPodFailureBackoff, diff)
	}

-	// The third pod should be created within 8 seconds
-	// This check rules out double counting
-	if diff >= 4*jobcontroller.DefaultJobPodFailureBackOff.Seconds() {
-		t.Fatalf("Third pod should be created before %v seconds after the second pod", 4*jobcontroller.DefaultJobPodFailureBackOff)
+	if diff >= 4*defaultPodFailureBackoff {
+		t.Fatalf("Third pod should be created before %v seconds after the second pod, time difference: %v", 4*defaultPodFailureBackoff, diff)
	}
}

@@ -1815,7 +2445,7 @@ func validateFinishedPodsNoFinalizer(ctx context.Context, t *testing.T, clientSe
// validateIndexedJobPods validates indexes and hostname of
// active and completed Pods of an Indexed Job.
// Call after validateJobPodsStatus
-func validateIndexedJobPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job, wantActive sets.Set[int], gotCompleted string) {
+func validateIndexedJobPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job, wantActive sets.Set[int], gotCompleted string, wantFailed *string) {
	t.Helper()
	updatedJob, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{})
	if err != nil {
@@ -1824,6 +2454,9 @@ func validateIndexedJobPods(ctx context.Context, t *testing.T, clientSet clients
	if updatedJob.Status.CompletedIndexes != gotCompleted {
		t.Errorf("Got completed indexes %q, want %q", updatedJob.Status.CompletedIndexes, gotCompleted)
	}
+	if diff := cmp.Diff(wantFailed, updatedJob.Status.FailedIndexes); diff != "" {
+		t.Errorf("Got unexpected failed indexes: %s", diff)
+	}
	pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		t.Fatalf("Failed to list Job Pods: %v", err)
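
// A small sketch of why the new wantFailed parameter is a *string rather than
// a plain string (the helper below is hypothetical): a nil pointer matches
// Jobs that do not use backoff limit per index at all, which is what the
// updated TestIndexedJob and TestElasticIndexedJob call sites pass, while
// pointer.String("") matches a Job that uses the feature but has no failed
// index yet. cmp.Diff treats the two as different.
func failedIndexesStatesDiffer() bool {
	var notUsed *string                     // Status.FailedIndexes never set
	enabledNoFailures := pointer.String("") // feature enabled, no failed index yet
	return cmp.Diff(notUsed, enabledNoFailures) != "" // true
}
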
@@ -2005,6 +2638,17 @@ func setJobPhaseForIndex(ctx context.Context, clientSet clientset.Interface, job
		}
		if pix, err := getCompletionIndex(&pod); err == nil && pix == ix {
			pod.Status.Phase = phase
+			if phase == v1.PodFailed || phase == v1.PodSucceeded {
+				pod.Status.ContainerStatuses = []v1.ContainerStatus{
+					{
+						State: v1.ContainerState{
+							Terminated: &v1.ContainerStateTerminated{
+								FinishedAt: metav1.Now(),
+							},
+						},
+					},
+				}
+			}
			_, err := clientSet.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, &pod, metav1.UpdateOptions{})
			if err != nil {
				return fmt.Errorf("updating pod %s status: %w", pod.Name, err)
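
// setJobPhaseForIndex now also records a termination timestamp, so that the
// exponential backoff assertions have a finish time to measure the recreation
// delay from. A hypothetical helper showing which field that validation reads:
func podFinishTime(p *v1.Pod) time.Time {
	if len(p.Status.ContainerStatuses) > 0 {
		if terminated := p.Status.ContainerStatuses[0].State.Terminated; terminated != nil {
			return terminated.FinishedAt.Time
		}
	}
	return time.Time{}
}
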
@@ -2015,6 +2659,44 @@ func setJobPhaseForIndex(ctx context.Context, clientSet clientset.Interface, job
	return errors.New("no pod matching index found")
}
+
+func getActivePodForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, ix int) (*v1.Pod, error) {
+	return getJobPodForIndex(ctx, clientSet, jobObj, ix, func(p *v1.Pod) bool {
+		return !podutil.IsPodTerminal(p)
+	})
+}
+
+func getJobPodForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, ix int, filter func(*v1.Pod) bool) (*v1.Pod, error) {
+	pods, err := getJobPodsForIndex(ctx, clientSet, jobObj, ix, filter)
+	if err != nil {
+		return nil, err
+	}
+	if len(pods) == 0 {
+		return nil, fmt.Errorf("Pod not found for index: %v", ix)
+	}
+	return pods[0], nil
+}
+
+func getJobPodsForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, ix int, filter func(*v1.Pod) bool) ([]*v1.Pod, error) {
+	pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return nil, fmt.Errorf("listing Job Pods: %w", err)
+	}
+	var result []*v1.Pod
+	for _, pod := range pods.Items {
+		pod := pod
+		if !metav1.IsControlledBy(&pod, jobObj) {
+			continue
+		}
+		if !filter(&pod) {
+			continue
+		}
+		if pix, err := getCompletionIndex(&pod); err == nil && pix == ix {
+			result = append(result, &pod)
+		}
+	}
+	return result, nil
+}

func getCompletionIndex(p *v1.Pod) (int, error) {
	if p.Annotations == nil {
		return 0, errors.New("no annotations found")