mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-22 11:21:47 +00:00
update prev succeeded indexes for indexed jobs unconditionally
This commit is contained in:
parent
6f22d1f1ab
commit
2a81337e7c
@ -52,7 +52,7 @@ type orderedIntervals []interval
|
|||||||
// empty list if this Job is not tracked with finalizers. The new list includes
|
// empty list if this Job is not tracked with finalizers. The new list includes
|
||||||
// the indexes that succeeded since the last sync.
|
// the indexes that succeeded since the last sync.
|
||||||
func calculateSucceededIndexes(job *batch.Job, pods []*v1.Pod) (orderedIntervals, orderedIntervals) {
|
func calculateSucceededIndexes(job *batch.Job, pods []*v1.Pod) (orderedIntervals, orderedIntervals) {
|
||||||
prevIntervals := succeededIndexesFromJob(job)
|
prevIntervals := succeededIndexesFromString(job.Status.CompletedIndexes, int(*job.Spec.Completions))
|
||||||
newSucceeded := sets.NewInt()
|
newSucceeded := sets.NewInt()
|
||||||
for _, p := range pods {
|
for _, p := range pods {
|
||||||
ix := getCompletionIndex(p.Annotations)
|
ix := getCompletionIndex(p.Annotations)
|
||||||
@ -148,20 +148,19 @@ func (oi orderedIntervals) has(ix int) bool {
|
|||||||
return oi[hi].First <= ix
|
return oi[hi].First <= ix
|
||||||
}
|
}
|
||||||
|
|
||||||
func succeededIndexesFromJob(job *batch.Job) orderedIntervals {
|
func succeededIndexesFromString(completedIndexes string, completions int) orderedIntervals {
|
||||||
if job.Status.CompletedIndexes == "" {
|
if completedIndexes == "" {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
var result orderedIntervals
|
var result orderedIntervals
|
||||||
var lastInterval *interval
|
var lastInterval *interval
|
||||||
completions := int(*job.Spec.Completions)
|
for _, intervalStr := range strings.Split(completedIndexes, ",") {
|
||||||
for _, intervalStr := range strings.Split(job.Status.CompletedIndexes, ",") {
|
|
||||||
limitsStr := strings.Split(intervalStr, "-")
|
limitsStr := strings.Split(intervalStr, "-")
|
||||||
var inter interval
|
var inter interval
|
||||||
var err error
|
var err error
|
||||||
inter.First, err = strconv.Atoi(limitsStr[0])
|
inter.First, err = strconv.Atoi(limitsStr[0])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
klog.InfoS("Corrupted completed indexes interval, ignoring", "job", klog.KObj(job), "interval", intervalStr, "err", err)
|
klog.InfoS("Corrupted completed indexes interval, ignoring", "interval", intervalStr, "err", err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if inter.First >= completions {
|
if inter.First >= completions {
|
||||||
@ -170,7 +169,7 @@ func succeededIndexesFromJob(job *batch.Job) orderedIntervals {
|
|||||||
if len(limitsStr) > 1 {
|
if len(limitsStr) > 1 {
|
||||||
inter.Last, err = strconv.Atoi(limitsStr[1])
|
inter.Last, err = strconv.Atoi(limitsStr[1])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
klog.InfoS("Corrupted completed indexes interval, ignoring", "job", klog.KObj(job), "interval", intervalStr, "err", err)
|
klog.InfoS("Corrupted completed indexes interval, ignoring", "interval", intervalStr, "err", err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if inter.Last >= completions {
|
if inter.Last >= completions {
|
||||||
|
@ -968,6 +968,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
|
|||||||
uidsWithFinalizer.Insert(uid)
|
uidsWithFinalizer.Insert(uid)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Shallow copy, as it will only be used to detect changes in the counters.
|
// Shallow copy, as it will only be used to detect changes in the counters.
|
||||||
oldCounters := job.Status
|
oldCounters := job.Status
|
||||||
if cleanUncountedPodsWithoutFinalizers(&job.Status, uidsWithFinalizer) {
|
if cleanUncountedPodsWithoutFinalizers(&job.Status, uidsWithFinalizer) {
|
||||||
@ -1041,10 +1042,14 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(newSucceededIndexes) > 0 {
|
if isIndexed {
|
||||||
succeededIndexes = succeededIndexes.withOrderedIndexes(newSucceededIndexes)
|
succeededIndexes = succeededIndexes.withOrderedIndexes(newSucceededIndexes)
|
||||||
|
succeededIndexesStr := succeededIndexes.String()
|
||||||
|
if succeededIndexesStr != job.Status.CompletedIndexes {
|
||||||
|
needsFlush = true
|
||||||
|
}
|
||||||
job.Status.Succeeded = int32(succeededIndexes.total())
|
job.Status.Succeeded = int32(succeededIndexes.total())
|
||||||
job.Status.CompletedIndexes = succeededIndexes.String()
|
job.Status.CompletedIndexes = succeededIndexesStr
|
||||||
}
|
}
|
||||||
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
|
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
|
||||||
if finishedCond != nil && finishedCond.Type == batch.JobFailureTarget {
|
if finishedCond != nil && finishedCond.Type == batch.JobFailureTarget {
|
||||||
@ -1680,7 +1685,7 @@ func isPodFailed(p *v1.Pod, job *batch.Job) bool {
|
|||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
// Count deleted Pods as failures to account for orphan Pods that
|
// Count deleted Pods as failures to account for orphan Pods that
|
||||||
// never have a chance to reach the Failed phase.
|
// never have a chance to reach the Failed phase.
|
||||||
return p.DeletionTimestamp != nil && p.Status.Phase != v1.PodSucceeded
|
return p.DeletionTimestamp != nil && p.Status.Phase != v1.PodSucceeded
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1695,9 +1700,24 @@ func findConditionByType(list []batch.JobCondition, cType batch.JobConditionType
|
|||||||
|
|
||||||
func recordJobPodFinished(job *batch.Job, oldCounters batch.JobStatus) {
|
func recordJobPodFinished(job *batch.Job, oldCounters batch.JobStatus) {
|
||||||
completionMode := completionModeStr(job)
|
completionMode := completionModeStr(job)
|
||||||
diff := job.Status.Succeeded - oldCounters.Succeeded
|
var diff int
|
||||||
|
|
||||||
|
// Updating succeeded metric must be handled differently
|
||||||
|
// for Indexed Jobs to handle the case where the job has
|
||||||
|
// been scaled down by reducing completions & parallelism
|
||||||
|
// in tandem, and now a previously completed index is
|
||||||
|
// now out of range (i.e. index >= spec.Completions).
|
||||||
|
if isIndexedJob(job) {
|
||||||
|
if job.Status.CompletedIndexes != oldCounters.CompletedIndexes {
|
||||||
|
diff = succeededIndexesFromString(job.Status.CompletedIndexes, int(*job.Spec.Completions)).total() - succeededIndexesFromString(oldCounters.CompletedIndexes, int(*job.Spec.Completions)).total()
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
diff = int(job.Status.Succeeded) - int(oldCounters.Succeeded)
|
||||||
|
}
|
||||||
metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Succeeded).Add(float64(diff))
|
metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Succeeded).Add(float64(diff))
|
||||||
diff = job.Status.Failed - oldCounters.Failed
|
|
||||||
|
// Update failed metric.
|
||||||
|
diff = int(job.Status.Failed - oldCounters.Failed)
|
||||||
metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed).Add(float64(diff))
|
metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed).Add(float64(diff))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1043,16 +1043,18 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
indexedCompletion := batch.IndexedCompletion
|
indexedCompletion := batch.IndexedCompletion
|
||||||
mockErr := errors.New("mock error")
|
mockErr := errors.New("mock error")
|
||||||
cases := map[string]struct {
|
cases := map[string]struct {
|
||||||
job batch.Job
|
job batch.Job
|
||||||
pods []*v1.Pod
|
pods []*v1.Pod
|
||||||
finishedCond *batch.JobCondition
|
finishedCond *batch.JobCondition
|
||||||
expectedRmFinalizers sets.String
|
expectedRmFinalizers sets.String
|
||||||
needsFlush bool
|
needsFlush bool
|
||||||
statusUpdateErr error
|
statusUpdateErr error
|
||||||
podControlErr error
|
podControlErr error
|
||||||
wantErr error
|
wantErr error
|
||||||
wantRmFinalizers int
|
wantRmFinalizers int
|
||||||
wantStatusUpdates []batch.JobStatus
|
wantStatusUpdates []batch.JobStatus
|
||||||
|
wantSucceededPodsMetric int
|
||||||
|
wantFailedPodsMetric int
|
||||||
}{
|
}{
|
||||||
"no updates": {},
|
"no updates": {},
|
||||||
"new active": {
|
"new active": {
|
||||||
@ -1093,6 +1095,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
Failed: 3,
|
Failed: 3,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
wantSucceededPodsMetric: 2,
|
||||||
|
wantFailedPodsMetric: 3,
|
||||||
},
|
},
|
||||||
"past and new finished pods": {
|
"past and new finished pods": {
|
||||||
job: batch.Job{
|
job: batch.Job{
|
||||||
@ -1133,6 +1137,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
Failed: 6,
|
Failed: 6,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
wantSucceededPodsMetric: 3,
|
||||||
|
wantFailedPodsMetric: 3,
|
||||||
},
|
},
|
||||||
"expecting removed finalizers": {
|
"expecting removed finalizers": {
|
||||||
job: batch.Job{
|
job: batch.Job{
|
||||||
@ -1172,6 +1178,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
Failed: 6,
|
Failed: 6,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
wantSucceededPodsMetric: 3,
|
||||||
|
wantFailedPodsMetric: 3,
|
||||||
},
|
},
|
||||||
"succeeding job": {
|
"succeeding job": {
|
||||||
pods: []*v1.Pod{
|
pods: []*v1.Pod{
|
||||||
@ -1195,6 +1203,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
CompletionTime: &succeededCond.LastTransitionTime,
|
CompletionTime: &succeededCond.LastTransitionTime,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
wantSucceededPodsMetric: 1,
|
||||||
|
wantFailedPodsMetric: 1,
|
||||||
},
|
},
|
||||||
"failing job": {
|
"failing job": {
|
||||||
pods: []*v1.Pod{
|
pods: []*v1.Pod{
|
||||||
@ -1219,6 +1229,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
Conditions: []batch.JobCondition{*failedCond},
|
Conditions: []batch.JobCondition{*failedCond},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
wantSucceededPodsMetric: 1,
|
||||||
|
wantFailedPodsMetric: 2,
|
||||||
},
|
},
|
||||||
"deleted job": {
|
"deleted job": {
|
||||||
job: batch.Job{
|
job: batch.Job{
|
||||||
@ -1251,6 +1263,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
Failed: 1,
|
Failed: 1,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
wantSucceededPodsMetric: 1,
|
||||||
|
wantFailedPodsMetric: 1,
|
||||||
},
|
},
|
||||||
"status update error": {
|
"status update error": {
|
||||||
pods: []*v1.Pod{
|
pods: []*v1.Pod{
|
||||||
@ -1339,6 +1353,62 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
wantSucceededPodsMetric: 2,
|
||||||
|
},
|
||||||
|
"indexed job prev successful pods outside current completions index range with no new succeeded pods": {
|
||||||
|
job: batch.Job{
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
CompletionMode: &indexedCompletion,
|
||||||
|
Completions: pointer.Int32(2),
|
||||||
|
Parallelism: pointer.Int32(2),
|
||||||
|
},
|
||||||
|
Status: batch.JobStatus{
|
||||||
|
Active: 2,
|
||||||
|
Succeeded: 1,
|
||||||
|
CompletedIndexes: "3",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []*v1.Pod{
|
||||||
|
buildPod().phase(v1.PodRunning).trackingFinalizer().index("0").Pod,
|
||||||
|
buildPod().phase(v1.PodRunning).trackingFinalizer().index("1").Pod,
|
||||||
|
},
|
||||||
|
wantRmFinalizers: 0,
|
||||||
|
wantStatusUpdates: []batch.JobStatus{
|
||||||
|
{
|
||||||
|
Active: 2,
|
||||||
|
Succeeded: 0,
|
||||||
|
CompletedIndexes: "",
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"indexed job prev successful pods outside current completions index range with new succeeded pods in range": {
|
||||||
|
job: batch.Job{
|
||||||
|
Spec: batch.JobSpec{
|
||||||
|
CompletionMode: &indexedCompletion,
|
||||||
|
Completions: pointer.Int32(2),
|
||||||
|
Parallelism: pointer.Int32(2),
|
||||||
|
},
|
||||||
|
Status: batch.JobStatus{
|
||||||
|
Active: 2,
|
||||||
|
Succeeded: 1,
|
||||||
|
CompletedIndexes: "3",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
pods: []*v1.Pod{
|
||||||
|
buildPod().phase(v1.PodRunning).trackingFinalizer().index("0").Pod,
|
||||||
|
buildPod().phase(v1.PodSucceeded).trackingFinalizer().index("1").Pod,
|
||||||
|
},
|
||||||
|
wantRmFinalizers: 1,
|
||||||
|
wantStatusUpdates: []batch.JobStatus{
|
||||||
|
{
|
||||||
|
Active: 2,
|
||||||
|
Succeeded: 1,
|
||||||
|
CompletedIndexes: "1",
|
||||||
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
wantSucceededPodsMetric: 1,
|
||||||
},
|
},
|
||||||
"indexed job new failed pods": {
|
"indexed job new failed pods": {
|
||||||
job: batch.Job{
|
job: batch.Job{
|
||||||
@ -1371,6 +1441,7 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
wantFailedPodsMetric: 3,
|
||||||
},
|
},
|
||||||
"indexed job past and new pods": {
|
"indexed job past and new pods": {
|
||||||
job: batch.Job{
|
job: batch.Job{
|
||||||
@ -1409,6 +1480,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
wantSucceededPodsMetric: 1,
|
||||||
|
wantFailedPodsMetric: 2,
|
||||||
},
|
},
|
||||||
"too many finished": {
|
"too many finished": {
|
||||||
job: batch.Job{
|
job: batch.Job{
|
||||||
@ -1449,6 +1522,8 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
Failed: 1,
|
Failed: 1,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
wantSucceededPodsMetric: 499,
|
||||||
|
wantFailedPodsMetric: 1,
|
||||||
},
|
},
|
||||||
"too many indexed finished": {
|
"too many indexed finished": {
|
||||||
job: batch.Job{
|
job: batch.Job{
|
||||||
@ -1472,6 +1547,7 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
Succeeded: 500,
|
Succeeded: 500,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
wantSucceededPodsMetric: 500,
|
||||||
},
|
},
|
||||||
"pod flips from failed to succeeded": {
|
"pod flips from failed to succeeded": {
|
||||||
job: batch.Job{
|
job: batch.Job{
|
||||||
@ -1498,6 +1574,7 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
Conditions: []batch.JobCondition{*failedCond},
|
Conditions: []batch.JobCondition{*failedCond},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
wantFailedPodsMetric: 2,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
for name, tc := range cases {
|
for name, tc := range cases {
|
||||||
@ -1517,7 +1594,10 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
job.Status.UncountedTerminatedPods = &batch.UncountedTerminatedPods{}
|
job.Status.UncountedTerminatedPods = &batch.UncountedTerminatedPods{}
|
||||||
}
|
}
|
||||||
uncounted := newUncountedTerminatedPods(*job.Status.UncountedTerminatedPods)
|
uncounted := newUncountedTerminatedPods(*job.Status.UncountedTerminatedPods)
|
||||||
succeededIndexes := succeededIndexesFromJob(job)
|
var succeededIndexes orderedIntervals
|
||||||
|
if isIndexedJob(job) {
|
||||||
|
succeededIndexes = succeededIndexesFromString(job.Status.CompletedIndexes, int(*job.Spec.Completions))
|
||||||
|
}
|
||||||
err := manager.trackJobStatusAndRemoveFinalizers(context.TODO(), job, tc.pods, succeededIndexes, *uncounted, tc.expectedRmFinalizers, tc.finishedCond, tc.needsFlush)
|
err := manager.trackJobStatusAndRemoveFinalizers(context.TODO(), job, tc.pods, succeededIndexes, *uncounted, tc.expectedRmFinalizers, tc.finishedCond, tc.needsFlush)
|
||||||
if !errors.Is(err, tc.wantErr) {
|
if !errors.Is(err, tc.wantErr) {
|
||||||
t.Errorf("Got error %v, want %v", err, tc.wantErr)
|
t.Errorf("Got error %v, want %v", err, tc.wantErr)
|
||||||
@ -1535,17 +1615,15 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("Obtaining succeeded job_pods_finished_total: %v", err)
|
t.Fatalf("Obtaining succeeded job_pods_finished_total: %v", err)
|
||||||
}
|
}
|
||||||
newSucceeded := job.Status.Succeeded - tc.job.Status.Succeeded
|
if float64(tc.wantSucceededPodsMetric) != v {
|
||||||
if float64(newSucceeded) != v {
|
t.Errorf("Metric reports %.0f succeeded pods, want %d", v, tc.wantSucceededPodsMetric)
|
||||||
t.Errorf("Metric reports %.0f succeeded pods, want %d", v, newSucceeded)
|
|
||||||
}
|
}
|
||||||
v, err = metricstestutil.GetCounterMetricValue(metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed))
|
v, err = metricstestutil.GetCounterMetricValue(metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("Obtaining failed job_pods_finished_total: %v", err)
|
t.Fatalf("Obtaining failed job_pods_finished_total: %v", err)
|
||||||
}
|
}
|
||||||
newFailed := job.Status.Failed - tc.job.Status.Failed
|
if float64(tc.wantFailedPodsMetric) != v {
|
||||||
if float64(newFailed) != v {
|
t.Errorf("Metric reports %.0f failed pods, want %d", v, tc.wantFailedPodsMetric)
|
||||||
t.Errorf("Metric reports %.0f failed pods, want %d", v, newFailed)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
Loading…
Reference in New Issue
Block a user