Merge pull request #115349 from danielvegamyhre/job-controller-changes

Update previous succeeded indexes for Indexed jobs unconditionally
This commit is contained in:
Kubernetes Prow Robot 2023-01-31 15:51:04 -08:00 committed by GitHub
commit bd63a912d6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 126 additions and 29 deletions

View File

@ -52,7 +52,7 @@ type orderedIntervals []interval
// empty list if this Job is not tracked with finalizers. The new list includes // empty list if this Job is not tracked with finalizers. The new list includes
// the indexes that succeeded since the last sync. // the indexes that succeeded since the last sync.
func calculateSucceededIndexes(job *batch.Job, pods []*v1.Pod) (orderedIntervals, orderedIntervals) { func calculateSucceededIndexes(job *batch.Job, pods []*v1.Pod) (orderedIntervals, orderedIntervals) {
prevIntervals := succeededIndexesFromJob(job) prevIntervals := succeededIndexesFromString(job.Status.CompletedIndexes, int(*job.Spec.Completions))
newSucceeded := sets.NewInt() newSucceeded := sets.NewInt()
for _, p := range pods { for _, p := range pods {
ix := getCompletionIndex(p.Annotations) ix := getCompletionIndex(p.Annotations)
@ -148,20 +148,19 @@ func (oi orderedIntervals) has(ix int) bool {
return oi[hi].First <= ix return oi[hi].First <= ix
} }
func succeededIndexesFromJob(job *batch.Job) orderedIntervals { func succeededIndexesFromString(completedIndexes string, completions int) orderedIntervals {
if job.Status.CompletedIndexes == "" { if completedIndexes == "" {
return nil return nil
} }
var result orderedIntervals var result orderedIntervals
var lastInterval *interval var lastInterval *interval
completions := int(*job.Spec.Completions) for _, intervalStr := range strings.Split(completedIndexes, ",") {
for _, intervalStr := range strings.Split(job.Status.CompletedIndexes, ",") {
limitsStr := strings.Split(intervalStr, "-") limitsStr := strings.Split(intervalStr, "-")
var inter interval var inter interval
var err error var err error
inter.First, err = strconv.Atoi(limitsStr[0]) inter.First, err = strconv.Atoi(limitsStr[0])
if err != nil { if err != nil {
klog.InfoS("Corrupted completed indexes interval, ignoring", "job", klog.KObj(job), "interval", intervalStr, "err", err) klog.InfoS("Corrupted completed indexes interval, ignoring", "interval", intervalStr, "err", err)
continue continue
} }
if inter.First >= completions { if inter.First >= completions {
@ -170,7 +169,7 @@ func succeededIndexesFromJob(job *batch.Job) orderedIntervals {
if len(limitsStr) > 1 { if len(limitsStr) > 1 {
inter.Last, err = strconv.Atoi(limitsStr[1]) inter.Last, err = strconv.Atoi(limitsStr[1])
if err != nil { if err != nil {
klog.InfoS("Corrupted completed indexes interval, ignoring", "job", klog.KObj(job), "interval", intervalStr, "err", err) klog.InfoS("Corrupted completed indexes interval, ignoring", "interval", intervalStr, "err", err)
continue continue
} }
if inter.Last >= completions { if inter.Last >= completions {

View File

@ -968,6 +968,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
uidsWithFinalizer.Insert(uid) uidsWithFinalizer.Insert(uid)
} }
} }
// Shallow copy, as it will only be used to detect changes in the counters. // Shallow copy, as it will only be used to detect changes in the counters.
oldCounters := job.Status oldCounters := job.Status
if cleanUncountedPodsWithoutFinalizers(&job.Status, uidsWithFinalizer) { if cleanUncountedPodsWithoutFinalizers(&job.Status, uidsWithFinalizer) {
@ -1041,10 +1042,14 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
break break
} }
} }
if len(newSucceededIndexes) > 0 { if isIndexed {
succeededIndexes = succeededIndexes.withOrderedIndexes(newSucceededIndexes) succeededIndexes = succeededIndexes.withOrderedIndexes(newSucceededIndexes)
succeededIndexesStr := succeededIndexes.String()
if succeededIndexesStr != job.Status.CompletedIndexes {
needsFlush = true
}
job.Status.Succeeded = int32(succeededIndexes.total()) job.Status.Succeeded = int32(succeededIndexes.total())
job.Status.CompletedIndexes = succeededIndexes.String() job.Status.CompletedIndexes = succeededIndexesStr
} }
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) { if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
if finishedCond != nil && finishedCond.Type == batch.JobFailureTarget { if finishedCond != nil && finishedCond.Type == batch.JobFailureTarget {
@ -1680,7 +1685,7 @@ func isPodFailed(p *v1.Pod, job *batch.Job) bool {
return true return true
} }
// Count deleted Pods as failures to account for orphan Pods that // Count deleted Pods as failures to account for orphan Pods that
// never have a chance to reach the Failed phase. // never have a chance to reach the Failed phase.
return p.DeletionTimestamp != nil && p.Status.Phase != v1.PodSucceeded return p.DeletionTimestamp != nil && p.Status.Phase != v1.PodSucceeded
} }
@ -1695,9 +1700,24 @@ func findConditionByType(list []batch.JobCondition, cType batch.JobConditionType
func recordJobPodFinished(job *batch.Job, oldCounters batch.JobStatus) { func recordJobPodFinished(job *batch.Job, oldCounters batch.JobStatus) {
completionMode := completionModeStr(job) completionMode := completionModeStr(job)
diff := job.Status.Succeeded - oldCounters.Succeeded var diff int
// Updating succeeded metric must be handled differently
// for Indexed Jobs to handle the case where the job has
// been scaled down by reducing completions & parallelism
// in tandem, and now a previously completed index is
// now out of range (i.e. index >= spec.Completions).
if isIndexedJob(job) {
if job.Status.CompletedIndexes != oldCounters.CompletedIndexes {
diff = succeededIndexesFromString(job.Status.CompletedIndexes, int(*job.Spec.Completions)).total() - succeededIndexesFromString(oldCounters.CompletedIndexes, int(*job.Spec.Completions)).total()
}
} else {
diff = int(job.Status.Succeeded) - int(oldCounters.Succeeded)
}
metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Succeeded).Add(float64(diff)) metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Succeeded).Add(float64(diff))
diff = job.Status.Failed - oldCounters.Failed
// Update failed metric.
diff = int(job.Status.Failed - oldCounters.Failed)
metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed).Add(float64(diff)) metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed).Add(float64(diff))
} }

View File

@ -1043,16 +1043,18 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
indexedCompletion := batch.IndexedCompletion indexedCompletion := batch.IndexedCompletion
mockErr := errors.New("mock error") mockErr := errors.New("mock error")
cases := map[string]struct { cases := map[string]struct {
job batch.Job job batch.Job
pods []*v1.Pod pods []*v1.Pod
finishedCond *batch.JobCondition finishedCond *batch.JobCondition
expectedRmFinalizers sets.String expectedRmFinalizers sets.String
needsFlush bool needsFlush bool
statusUpdateErr error statusUpdateErr error
podControlErr error podControlErr error
wantErr error wantErr error
wantRmFinalizers int wantRmFinalizers int
wantStatusUpdates []batch.JobStatus wantStatusUpdates []batch.JobStatus
wantSucceededPodsMetric int
wantFailedPodsMetric int
}{ }{
"no updates": {}, "no updates": {},
"new active": { "new active": {
@ -1093,6 +1095,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
Failed: 3, Failed: 3,
}, },
}, },
wantSucceededPodsMetric: 2,
wantFailedPodsMetric: 3,
}, },
"past and new finished pods": { "past and new finished pods": {
job: batch.Job{ job: batch.Job{
@ -1133,6 +1137,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
Failed: 6, Failed: 6,
}, },
}, },
wantSucceededPodsMetric: 3,
wantFailedPodsMetric: 3,
}, },
"expecting removed finalizers": { "expecting removed finalizers": {
job: batch.Job{ job: batch.Job{
@ -1172,6 +1178,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
Failed: 6, Failed: 6,
}, },
}, },
wantSucceededPodsMetric: 3,
wantFailedPodsMetric: 3,
}, },
"succeeding job": { "succeeding job": {
pods: []*v1.Pod{ pods: []*v1.Pod{
@ -1195,6 +1203,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
CompletionTime: &succeededCond.LastTransitionTime, CompletionTime: &succeededCond.LastTransitionTime,
}, },
}, },
wantSucceededPodsMetric: 1,
wantFailedPodsMetric: 1,
}, },
"failing job": { "failing job": {
pods: []*v1.Pod{ pods: []*v1.Pod{
@ -1219,6 +1229,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
Conditions: []batch.JobCondition{*failedCond}, Conditions: []batch.JobCondition{*failedCond},
}, },
}, },
wantSucceededPodsMetric: 1,
wantFailedPodsMetric: 2,
}, },
"deleted job": { "deleted job": {
job: batch.Job{ job: batch.Job{
@ -1251,6 +1263,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
Failed: 1, Failed: 1,
}, },
}, },
wantSucceededPodsMetric: 1,
wantFailedPodsMetric: 1,
}, },
"status update error": { "status update error": {
pods: []*v1.Pod{ pods: []*v1.Pod{
@ -1339,6 +1353,62 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
}, },
}, },
wantSucceededPodsMetric: 2,
},
"indexed job prev successful pods outside current completions index range with no new succeeded pods": {
job: batch.Job{
Spec: batch.JobSpec{
CompletionMode: &indexedCompletion,
Completions: pointer.Int32(2),
Parallelism: pointer.Int32(2),
},
Status: batch.JobStatus{
Active: 2,
Succeeded: 1,
CompletedIndexes: "3",
},
},
pods: []*v1.Pod{
buildPod().phase(v1.PodRunning).trackingFinalizer().index("0").Pod,
buildPod().phase(v1.PodRunning).trackingFinalizer().index("1").Pod,
},
wantRmFinalizers: 0,
wantStatusUpdates: []batch.JobStatus{
{
Active: 2,
Succeeded: 0,
CompletedIndexes: "",
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
},
},
},
"indexed job prev successful pods outside current completions index range with new succeeded pods in range": {
job: batch.Job{
Spec: batch.JobSpec{
CompletionMode: &indexedCompletion,
Completions: pointer.Int32(2),
Parallelism: pointer.Int32(2),
},
Status: batch.JobStatus{
Active: 2,
Succeeded: 1,
CompletedIndexes: "3",
},
},
pods: []*v1.Pod{
buildPod().phase(v1.PodRunning).trackingFinalizer().index("0").Pod,
buildPod().phase(v1.PodSucceeded).trackingFinalizer().index("1").Pod,
},
wantRmFinalizers: 1,
wantStatusUpdates: []batch.JobStatus{
{
Active: 2,
Succeeded: 1,
CompletedIndexes: "1",
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
},
},
wantSucceededPodsMetric: 1,
}, },
"indexed job new failed pods": { "indexed job new failed pods": {
job: batch.Job{ job: batch.Job{
@ -1371,6 +1441,7 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
}, },
}, },
wantFailedPodsMetric: 3,
}, },
"indexed job past and new pods": { "indexed job past and new pods": {
job: batch.Job{ job: batch.Job{
@ -1409,6 +1480,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
}, },
}, },
wantSucceededPodsMetric: 1,
wantFailedPodsMetric: 2,
}, },
"too many finished": { "too many finished": {
job: batch.Job{ job: batch.Job{
@ -1449,6 +1522,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
Failed: 1, Failed: 1,
}, },
}, },
wantSucceededPodsMetric: 499,
wantFailedPodsMetric: 1,
}, },
"too many indexed finished": { "too many indexed finished": {
job: batch.Job{ job: batch.Job{
@ -1472,6 +1547,7 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
Succeeded: 500, Succeeded: 500,
}, },
}, },
wantSucceededPodsMetric: 500,
}, },
"pod flips from failed to succeeded": { "pod flips from failed to succeeded": {
job: batch.Job{ job: batch.Job{
@ -1498,6 +1574,7 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
Conditions: []batch.JobCondition{*failedCond}, Conditions: []batch.JobCondition{*failedCond},
}, },
}, },
wantFailedPodsMetric: 2,
}, },
} }
for name, tc := range cases { for name, tc := range cases {
@ -1517,7 +1594,10 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
job.Status.UncountedTerminatedPods = &batch.UncountedTerminatedPods{} job.Status.UncountedTerminatedPods = &batch.UncountedTerminatedPods{}
} }
uncounted := newUncountedTerminatedPods(*job.Status.UncountedTerminatedPods) uncounted := newUncountedTerminatedPods(*job.Status.UncountedTerminatedPods)
succeededIndexes := succeededIndexesFromJob(job) var succeededIndexes orderedIntervals
if isIndexedJob(job) {
succeededIndexes = succeededIndexesFromString(job.Status.CompletedIndexes, int(*job.Spec.Completions))
}
err := manager.trackJobStatusAndRemoveFinalizers(context.TODO(), job, tc.pods, succeededIndexes, *uncounted, tc.expectedRmFinalizers, tc.finishedCond, tc.needsFlush) err := manager.trackJobStatusAndRemoveFinalizers(context.TODO(), job, tc.pods, succeededIndexes, *uncounted, tc.expectedRmFinalizers, tc.finishedCond, tc.needsFlush)
if !errors.Is(err, tc.wantErr) { if !errors.Is(err, tc.wantErr) {
t.Errorf("Got error %v, want %v", err, tc.wantErr) t.Errorf("Got error %v, want %v", err, tc.wantErr)
@ -1535,17 +1615,15 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
if err != nil { if err != nil {
t.Fatalf("Obtaining succeeded job_pods_finished_total: %v", err) t.Fatalf("Obtaining succeeded job_pods_finished_total: %v", err)
} }
newSucceeded := job.Status.Succeeded - tc.job.Status.Succeeded if float64(tc.wantSucceededPodsMetric) != v {
if float64(newSucceeded) != v { t.Errorf("Metric reports %.0f succeeded pods, want %d", v, tc.wantSucceededPodsMetric)
t.Errorf("Metric reports %.0f succeeded pods, want %d", v, newSucceeded)
} }
v, err = metricstestutil.GetCounterMetricValue(metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed)) v, err = metricstestutil.GetCounterMetricValue(metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed))
if err != nil { if err != nil {
t.Fatalf("Obtaining failed job_pods_finished_total: %v", err) t.Fatalf("Obtaining failed job_pods_finished_total: %v", err)
} }
newFailed := job.Status.Failed - tc.job.Status.Failed if float64(tc.wantFailedPodsMetric) != v {
if float64(newFailed) != v { t.Errorf("Metric reports %.0f failed pods, want %d", v, tc.wantFailedPodsMetric)
t.Errorf("Metric reports %.0f failed pods, want %d", v, newFailed)
} }
} }
}) })