Merge pull request #130061 from mimowo/make-backoffperindex-stable

Graduate Backoff Limit Per Index as stable
This commit is contained in:
Kubernetes Prow Robot 2025-02-28 13:37:02 -08:00 committed by GitHub
commit d04883c90c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 106 additions and 48 deletions

View File

@ -4690,7 +4690,7 @@
"type": "integer"
},
"backoffLimitPerIndex": {
"description": "Specifies the limit for the number of retries within an index before marking this index as failed. When enabled the number of failures per index is kept in the pod's batch.kubernetes.io/job-index-failure-count annotation. It can only be set when Job's completionMode=Indexed, and the Pod's restart policy is Never. The field is immutable. This field is beta-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).",
"description": "Specifies the limit for the number of retries within an index before marking this index as failed. When enabled the number of failures per index is kept in the pod's batch.kubernetes.io/job-index-failure-count annotation. It can only be set when Job's completionMode=Indexed, and the Pod's restart policy is Never. The field is immutable.",
"format": "int32",
"type": "integer"
},
@ -4712,7 +4712,7 @@
"type": "boolean"
},
"maxFailedIndexes": {
"description": "Specifies the maximal number of failed indexes before marking the Job as failed, when backoffLimitPerIndex is set. Once the number of failed indexes exceeds this number the entire Job is marked as Failed and its execution is terminated. When left as null the job continues execution of all of its indexes and is marked with the `Complete` Job condition. It can only be specified when backoffLimitPerIndex is set. It can be null or up to completions. It is required and must be less than or equal to 10^4 when is completions greater than 10^5. This field is beta-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).",
"description": "Specifies the maximal number of failed indexes before marking the Job as failed, when backoffLimitPerIndex is set. Once the number of failed indexes exceeds this number the entire Job is marked as Failed and its execution is terminated. When left as null the job continues execution of all of its indexes and is marked with the `Complete` Job condition. It can only be specified when backoffLimitPerIndex is set. It can be null or up to completions. It is required and must be less than or equal to 10^4 when is completions greater than 10^5.",
"format": "int32",
"type": "integer"
},
@ -4788,7 +4788,7 @@
"type": "integer"
},
"failedIndexes": {
"description": "FailedIndexes holds the failed indexes when spec.backoffLimitPerIndex is set. The indexes are represented in the text format analogous as for the `completedIndexes` field, ie. they are kept as decimal integers separated by commas. The numbers are listed in increasing order. Three or more consecutive numbers are compressed and represented by the first and last element of the series, separated by a hyphen. For example, if the failed indexes are 1, 3, 4, 5 and 7, they are represented as \"1,3-5,7\". The set of failed indexes cannot overlap with the set of completed indexes.\n\nThis field is beta-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).",
"description": "FailedIndexes holds the failed indexes when spec.backoffLimitPerIndex is set. The indexes are represented in the text format analogous as for the `completedIndexes` field, ie. they are kept as decimal integers separated by commas. The numbers are listed in increasing order. Three or more consecutive numbers are compressed and represented by the first and last element of the series, separated by a hyphen. For example, if the failed indexes are 1, 3, 4, 5 and 7, they are represented as \"1,3-5,7\". The set of failed indexes cannot overlap with the set of completed indexes.",
"type": "string"
},
"ready": {
@ -4897,7 +4897,7 @@
"description": "PodFailurePolicyRule describes how a pod failure is handled when the requirements are met. One of onExitCodes and onPodConditions, but not both, can be used in each rule.",
"properties": {
"action": {
"description": "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are:\n\n- FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- FailIndex: indicates that the pod's index is marked as Failed and will\n not be restarted.\n This value is beta-level. It can be used when the\n `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.",
"description": "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are:\n\n- FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- FailIndex: indicates that the pod's index is marked as Failed and will\n not be restarted.\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.",
"type": "string"
},
"onExitCodes": {

View File

@ -331,7 +331,7 @@
"type": "integer"
},
"backoffLimitPerIndex": {
"description": "Specifies the limit for the number of retries within an index before marking this index as failed. When enabled the number of failures per index is kept in the pod's batch.kubernetes.io/job-index-failure-count annotation. It can only be set when Job's completionMode=Indexed, and the Pod's restart policy is Never. The field is immutable. This field is beta-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).",
"description": "Specifies the limit for the number of retries within an index before marking this index as failed. When enabled the number of failures per index is kept in the pod's batch.kubernetes.io/job-index-failure-count annotation. It can only be set when Job's completionMode=Indexed, and the Pod's restart policy is Never. The field is immutable.",
"format": "int32",
"type": "integer"
},
@ -353,7 +353,7 @@
"type": "boolean"
},
"maxFailedIndexes": {
"description": "Specifies the maximal number of failed indexes before marking the Job as failed, when backoffLimitPerIndex is set. Once the number of failed indexes exceeds this number the entire Job is marked as Failed and its execution is terminated. When left as null the job continues execution of all of its indexes and is marked with the `Complete` Job condition. It can only be specified when backoffLimitPerIndex is set. It can be null or up to completions. It is required and must be less than or equal to 10^4 when is completions greater than 10^5. This field is beta-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).",
"description": "Specifies the maximal number of failed indexes before marking the Job as failed, when backoffLimitPerIndex is set. Once the number of failed indexes exceeds this number the entire Job is marked as Failed and its execution is terminated. When left as null the job continues execution of all of its indexes and is marked with the `Complete` Job condition. It can only be specified when backoffLimitPerIndex is set. It can be null or up to completions. It is required and must be less than or equal to 10^4 when is completions greater than 10^5.",
"format": "int32",
"type": "integer"
},
@ -455,7 +455,7 @@
"type": "integer"
},
"failedIndexes": {
"description": "FailedIndexes holds the failed indexes when spec.backoffLimitPerIndex is set. The indexes are represented in the text format analogous as for the `completedIndexes` field, ie. they are kept as decimal integers separated by commas. The numbers are listed in increasing order. Three or more consecutive numbers are compressed and represented by the first and last element of the series, separated by a hyphen. For example, if the failed indexes are 1, 3, 4, 5 and 7, they are represented as \"1,3-5,7\". The set of failed indexes cannot overlap with the set of completed indexes.\n\nThis field is beta-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).",
"description": "FailedIndexes holds the failed indexes when spec.backoffLimitPerIndex is set. The indexes are represented in the text format analogous as for the `completedIndexes` field, ie. they are kept as decimal integers separated by commas. The numbers are listed in increasing order. Three or more consecutive numbers are compressed and represented by the first and last element of the series, separated by a hyphen. For example, if the failed indexes are 1, 3, 4, 5 and 7, they are represented as \"1,3-5,7\". The set of failed indexes cannot overlap with the set of completed indexes.",
"type": "string"
},
"ready": {
@ -592,7 +592,7 @@
"properties": {
"action": {
"default": "",
"description": "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are:\n\n- FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- FailIndex: indicates that the pod's index is marked as Failed and will\n not be restarted.\n This value is beta-level. It can be used when the\n `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.",
"description": "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are:\n\n- FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- FailIndex: indicates that the pod's index is marked as Failed and will\n not be restarted.\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.",
"type": "string"
},
"onExitCodes": {

View File

@ -132,7 +132,6 @@ const (
// This is an action which might be taken on a pod failure - mark the
// Job's index as failed to avoid restarts within this index. This action
// can only be used when backoffLimitPerIndex is set.
// This value is beta-level.
PodFailurePolicyActionFailIndex PodFailurePolicyAction = "FailIndex"
// This is an action which might be taken on a pod failure - the counter towards
@ -226,8 +225,6 @@ type PodFailurePolicyRule struct {
// running pods are terminated.
// - FailIndex: indicates that the pod's index is marked as Failed and will
// not be restarted.
// This value is beta-level. It can be used when the
// `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).
// - Ignore: indicates that the counter towards the .backoffLimit is not
// incremented and a replacement pod is created.
// - Count: indicates that the pod is handled in the default way - the
@ -363,8 +360,6 @@ type JobSpec struct {
// batch.kubernetes.io/job-index-failure-count annotation. It can only
// be set when Job's completionMode=Indexed, and the Pod's restart
// policy is Never. The field is immutable.
// This field is beta-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (enabled by default).
// +optional
BackoffLimitPerIndex *int32
@ -376,8 +371,6 @@ type JobSpec struct {
// It can only be specified when backoffLimitPerIndex is set.
// It can be null or up to completions. It is required and must be
// less than or equal to 10^4 when completions is greater than 10^5.
// This field is beta-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (enabled by default).
// +optional
MaxFailedIndexes *int32
@ -571,8 +564,6 @@ type JobStatus struct {
// represented as "1,3-5,7".
// The set of failed indexes cannot overlap with the set of completed indexes.
//
// This field is beta-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (enabled by default).
// +optional
FailedIndexes *string

View File

@ -2437,6 +2437,10 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
}
for name, tc := range cases {
t.Run(name, func(t *testing.T) {
if !tc.enableJobBackoffLimitPerIndex {
// TODO: this will be removed in 1.36
featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, feature.DefaultFeatureGate, utilversion.MustParse("1.32"))
}
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobSuccessPolicy, tc.enableJobSuccessPolicy)
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, tc.enableJobPodReplacementPolicy)
@ -5171,6 +5175,10 @@ func TestSyncJobWithJobSuccessPolicy(t *testing.T) {
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
if !tc.enableBackoffLimitPerIndex {
// TODO: this will be removed in 1.36
featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, feature.DefaultFeatureGate, utilversion.MustParse("1.32"))
}
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableBackoffLimitPerIndex)
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobSuccessPolicy, tc.enableJobSuccessPolicy)
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, tc.enableJobPodReplacementPolicy)
@ -5849,6 +5857,10 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
if !tc.enableJobBackoffLimitPerIndex {
// TODO: this will be removed in 1.36
featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, feature.DefaultFeatureGate, utilversion.MustParse("1.32"))
}
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, tc.enableJobPodReplacementPolicy)
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobManagedBy, tc.enableJobManagedBy)

View File

@ -23,6 +23,7 @@ import (
batch "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilversion "k8s.io/apimachinery/pkg/util/version"
utilfeature "k8s.io/apiserver/pkg/util/feature"
featuregatetesting "k8s.io/component-base/featuregate/testing"
_ "k8s.io/kubernetes/pkg/apis/core/install"
@ -867,6 +868,10 @@ func TestMatchPodFailurePolicy(t *testing.T) {
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
if !tc.enableJobBackoffLimitPerIndex {
// TODO: this will be removed in 1.36
featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, utilfeature.DefaultFeatureGate, utilversion.MustParse("1.32"))
}
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)
jobFailMessage, countFailed, action := matchPodFailurePolicy(tc.podFailurePolicy, tc.failedPod)
if diff := cmp.Diff(tc.wantJobFailureMessage, jobFailMessage); diff != "" {

View File

@ -415,6 +415,7 @@ var defaultVersionedKubernetesFeatureGates = map[featuregate.Feature]featuregate
JobBackoffLimitPerIndex: {
{Version: version.MustParse("1.28"), Default: false, PreRelease: featuregate.Alpha},
{Version: version.MustParse("1.29"), Default: true, PreRelease: featuregate.Beta},
{Version: version.MustParse("1.33"), Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.36
},
JobManagedBy: {

View File

@ -17731,14 +17731,14 @@ func schema_k8sio_api_batch_v1_JobSpec(ref common.ReferenceCallback) common.Open
},
"backoffLimitPerIndex": {
SchemaProps: spec.SchemaProps{
Description: "Specifies the limit for the number of retries within an index before marking this index as failed. When enabled the number of failures per index is kept in the pod's batch.kubernetes.io/job-index-failure-count annotation. It can only be set when Job's completionMode=Indexed, and the Pod's restart policy is Never. The field is immutable. This field is beta-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).",
Description: "Specifies the limit for the number of retries within an index before marking this index as failed. When enabled the number of failures per index is kept in the pod's batch.kubernetes.io/job-index-failure-count annotation. It can only be set when Job's completionMode=Indexed, and the Pod's restart policy is Never. The field is immutable.",
Type: []string{"integer"},
Format: "int32",
},
},
"maxFailedIndexes": {
SchemaProps: spec.SchemaProps{
Description: "Specifies the maximal number of failed indexes before marking the Job as failed, when backoffLimitPerIndex is set. Once the number of failed indexes exceeds this number the entire Job is marked as Failed and its execution is terminated. When left as null the job continues execution of all of its indexes and is marked with the `Complete` Job condition. It can only be specified when backoffLimitPerIndex is set. It can be null or up to completions. It is required and must be less than or equal to 10^4 when is completions greater than 10^5. This field is beta-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).",
Description: "Specifies the maximal number of failed indexes before marking the Job as failed, when backoffLimitPerIndex is set. Once the number of failed indexes exceeds this number the entire Job is marked as Failed and its execution is terminated. When left as null the job continues execution of all of its indexes and is marked with the `Complete` Job condition. It can only be specified when backoffLimitPerIndex is set. It can be null or up to completions. It is required and must be less than or equal to 10^4 when is completions greater than 10^5.",
Type: []string{"integer"},
Format: "int32",
},
@ -17886,7 +17886,7 @@ func schema_k8sio_api_batch_v1_JobStatus(ref common.ReferenceCallback) common.Op
},
"failedIndexes": {
SchemaProps: spec.SchemaProps{
Description: "FailedIndexes holds the failed indexes when spec.backoffLimitPerIndex is set. The indexes are represented in the text format analogous as for the `completedIndexes` field, ie. they are kept as decimal integers separated by commas. The numbers are listed in increasing order. Three or more consecutive numbers are compressed and represented by the first and last element of the series, separated by a hyphen. For example, if the failed indexes are 1, 3, 4, 5 and 7, they are represented as \"1,3-5,7\". The set of failed indexes cannot overlap with the set of completed indexes.\n\nThis field is beta-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).",
Description: "FailedIndexes holds the failed indexes when spec.backoffLimitPerIndex is set. The indexes are represented in the text format analogous as for the `completedIndexes` field, ie. they are kept as decimal integers separated by commas. The numbers are listed in increasing order. Three or more consecutive numbers are compressed and represented by the first and last element of the series, separated by a hyphen. For example, if the failed indexes are 1, 3, 4, 5 and 7, they are represented as \"1,3-5,7\". The set of failed indexes cannot overlap with the set of completed indexes.",
Type: []string{"string"},
Format: "",
},
@ -18065,7 +18065,7 @@ func schema_k8sio_api_batch_v1_PodFailurePolicyRule(ref common.ReferenceCallback
Properties: map[string]spec.Schema{
"action": {
SchemaProps: spec.SchemaProps{
Description: "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are:\n\n- FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- FailIndex: indicates that the pod's index is marked as Failed and will\n not be restarted.\n This value is beta-level. It can be used when the\n `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.\n\nPossible enum values:\n - `\"Count\"` This is an action which might be taken on a pod failure - the pod failure is handled in the default way - the counter towards .backoffLimit, represented by the job's .status.failed field, is incremented.\n - `\"FailIndex\"` This is an action which might be taken on a pod failure - mark the Job's index as failed to avoid restarts within this index. This action can only be used when backoffLimitPerIndex is set. This value is beta-level.\n - `\"FailJob\"` This is an action which might be taken on a pod failure - mark the pod's job as Failed and terminate all running pods.\n - `\"Ignore\"` This is an action which might be taken on a pod failure - the counter towards .backoffLimit, represented by the job's .status.failed field, is not incremented and a replacement pod is created.",
Description: "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are:\n\n- FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- FailIndex: indicates that the pod's index is marked as Failed and will\n not be restarted.\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.\n\nPossible enum values:\n - `\"Count\"` This is an action which might be taken on a pod failure - the pod failure is handled in the default way - the counter towards .backoffLimit, represented by the job's .status.failed field, is incremented.\n - `\"FailIndex\"` This is an action which might be taken on a pod failure - mark the Job's index as failed to avoid restarts within this index. This action can only be used when backoffLimitPerIndex is set.\n - `\"FailJob\"` This is an action which might be taken on a pod failure - mark the pod's job as Failed and terminate all running pods.\n - `\"Ignore\"` This is an action which might be taken on a pod failure - the counter towards .backoffLimit, represented by the job's .status.failed field, is not incremented and a replacement pod is created.",
Default: "",
Type: []string{"string"},
Format: "",

View File

@ -26,6 +26,7 @@ import (
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/validation/field"
utilversion "k8s.io/apimachinery/pkg/util/version"
genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
"k8s.io/apiserver/pkg/registry/rest"
utilfeature "k8s.io/apiserver/pkg/util/feature"
@ -512,6 +513,10 @@ func TestJobStrategy_PrepareForUpdate(t *testing.T) {
for name, tc := range cases {
t.Run(name, func(t *testing.T) {
if !tc.enableJobBackoffLimitPerIndex {
// TODO: this will be removed in 1.36
featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, utilfeature.DefaultFeatureGate, utilversion.MustParse("1.32"))
}
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.JobPodReplacementPolicy, tc.enableJobPodReplacementPolicy)
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.JobSuccessPolicy, tc.enableJobSuccessPolicy)
@ -893,6 +898,10 @@ func TestJobStrategy_PrepareForCreate(t *testing.T) {
for name, tc := range cases {
t.Run(name, func(t *testing.T) {
if !tc.enableJobBackoffLimitPerIndex {
// TODO: this will be removed in 1.36
featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, utilfeature.DefaultFeatureGate, utilversion.MustParse("1.32"))
}
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.JobPodReplacementPolicy, tc.enableJobPodReplacementPolicy)
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.JobManagedBy, tc.enableJobManageBy)
@ -1165,6 +1174,10 @@ func TestJobStrategy_ValidateUpdate(t *testing.T) {
}
for name, tc := range cases {
t.Run(name, func(t *testing.T) {
if !tc.enableJobBackoffLimitPerIndex {
// TODO: this will be removed in 1.36
featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, utilfeature.DefaultFeatureGate, utilversion.MustParse("1.32"))
}
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)
newJob := tc.job.DeepCopy()
tc.update(newJob)
@ -1775,6 +1788,10 @@ func TestJobStrategy_Validate(t *testing.T) {
}
for name, tc := range testcases {
t.Run(name, func(t *testing.T) {
if !tc.enableJobBackoffLimitPerIndex {
// TODO: this will be removed in 1.36
featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, utilfeature.DefaultFeatureGate, utilversion.MustParse("1.32"))
}
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)
errs := Strategy.Validate(ctx, tc.job)
if len(errs) != int(tc.wantWarningCount) {

View File

@ -238,8 +238,6 @@ message JobSpec {
// batch.kubernetes.io/job-index-failure-count annotation. It can only
// be set when Job's completionMode=Indexed, and the Pod's restart
// policy is Never. The field is immutable.
// This field is beta-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (enabled by default).
// +optional
optional int32 backoffLimitPerIndex = 12;
@ -251,8 +249,6 @@ message JobSpec {
// It can only be specified when backoffLimitPerIndex is set.
// It can be null or up to completions. It is required and must be
// less than or equal to 10^4 when completions is greater than 10^5.
// This field is beta-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (enabled by default).
// +optional
optional int32 maxFailedIndexes = 13;
@ -442,8 +438,6 @@ message JobStatus {
// represented as "1,3-5,7".
// The set of failed indexes cannot overlap with the set of completed indexes.
//
// This field is beta-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (enabled by default).
// +optional
optional string failedIndexes = 10;
@ -554,8 +548,6 @@ message PodFailurePolicyRule {
// running pods are terminated.
// - FailIndex: indicates that the pod's index is marked as Failed and will
// not be restarted.
// This value is beta-level. It can be used when the
// `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).
// - Ignore: indicates that the counter towards the .backoffLimit is not
// incremented and a replacement pod is created.
// - Count: indicates that the pod is handled in the default way - the

View File

@ -128,7 +128,6 @@ const (
// This is an action which might be taken on a pod failure - mark the
// Job's index as failed to avoid restarts within this index. This action
// can only be used when backoffLimitPerIndex is set.
// This value is beta-level.
PodFailurePolicyActionFailIndex PodFailurePolicyAction = "FailIndex"
// This is an action which might be taken on a pod failure - the counter towards
@ -223,8 +222,6 @@ type PodFailurePolicyRule struct {
// running pods are terminated.
// - FailIndex: indicates that the pod's index is marked as Failed and will
// not be restarted.
// This value is beta-level. It can be used when the
// `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).
// - Ignore: indicates that the counter towards the .backoffLimit is not
// incremented and a replacement pod is created.
// - Count: indicates that the pod is handled in the default way - the
@ -362,8 +359,6 @@ type JobSpec struct {
// batch.kubernetes.io/job-index-failure-count annotation. It can only
// be set when Job's completionMode=Indexed, and the Pod's restart
// policy is Never. The field is immutable.
// This field is beta-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (enabled by default).
// +optional
BackoffLimitPerIndex *int32 `json:"backoffLimitPerIndex,omitempty" protobuf:"varint,12,opt,name=backoffLimitPerIndex"`
@ -375,8 +370,6 @@ type JobSpec struct {
// It can only be specified when backoffLimitPerIndex is set.
// It can be null or up to completions. It is required and must be
// less than or equal to 10^4 when completions is greater than 10^5.
// This field is beta-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (enabled by default).
// +optional
MaxFailedIndexes *int32 `json:"maxFailedIndexes,omitempty" protobuf:"varint,13,opt,name=maxFailedIndexes"`
@ -571,8 +564,6 @@ type JobStatus struct {
// represented as "1,3-5,7".
// The set of failed indexes cannot overlap with the set of completed indexes.
//
// This field is beta-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (enabled by default).
// +optional
FailedIndexes *string `json:"failedIndexes,omitempty" protobuf:"bytes,10,opt,name=failedIndexes"`

View File

@ -118,8 +118,8 @@ var map_JobSpec = map[string]string{
"podFailurePolicy": "Specifies the policy of handling failed pods. In particular, it allows to specify the set of actions and conditions which need to be satisfied to take the associated action. If empty, the default behaviour applies - the counter of failed pods, represented by the jobs's .status.failed field, is incremented and it is checked against the backoffLimit. This field cannot be used in combination with restartPolicy=OnFailure.",
"successPolicy": "successPolicy specifies the policy when the Job can be declared as succeeded. If empty, the default behavior applies - the Job is declared as succeeded only when the number of succeeded pods equals to the completions. When the field is specified, it must be immutable and works only for the Indexed Jobs. Once the Job meets the SuccessPolicy, the lingering pods are terminated.\n\nThis field is beta-level. To use this field, you must enable the `JobSuccessPolicy` feature gate (enabled by default).",
"backoffLimit": "Specifies the number of retries before marking this job failed. Defaults to 6",
"backoffLimitPerIndex": "Specifies the limit for the number of retries within an index before marking this index as failed. When enabled the number of failures per index is kept in the pod's batch.kubernetes.io/job-index-failure-count annotation. It can only be set when Job's completionMode=Indexed, and the Pod's restart policy is Never. The field is immutable. This field is beta-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).",
"maxFailedIndexes": "Specifies the maximal number of failed indexes before marking the Job as failed, when backoffLimitPerIndex is set. Once the number of failed indexes exceeds this number the entire Job is marked as Failed and its execution is terminated. When left as null the job continues execution of all of its indexes and is marked with the `Complete` Job condition. It can only be specified when backoffLimitPerIndex is set. It can be null or up to completions. It is required and must be less than or equal to 10^4 when is completions greater than 10^5. This field is beta-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).",
"backoffLimitPerIndex": "Specifies the limit for the number of retries within an index before marking this index as failed. When enabled the number of failures per index is kept in the pod's batch.kubernetes.io/job-index-failure-count annotation. It can only be set when Job's completionMode=Indexed, and the Pod's restart policy is Never. The field is immutable.",
"maxFailedIndexes": "Specifies the maximal number of failed indexes before marking the Job as failed, when backoffLimitPerIndex is set. Once the number of failed indexes exceeds this number the entire Job is marked as Failed and its execution is terminated. When left as null the job continues execution of all of its indexes and is marked with the `Complete` Job condition. It can only be specified when backoffLimitPerIndex is set. It can be null or up to completions. It is required and must be less than or equal to 10^4 when is completions greater than 10^5.",
"selector": "A label query over pods that should match the pod count. Normally, the system sets this field for you. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors",
"manualSelector": "manualSelector controls generation of pod labels and pod selectors. Leave `manualSelector` unset unless you are certain what you are doing. When false or unset, the system pick labels unique to this job and appends those labels to the pod template. When true, the user is responsible for picking unique labels and specifying the selector. Failure to pick a unique label may cause this and other jobs to not function correctly. However, You may see `manualSelector=true` in jobs that were created with the old `extensions/v1beta1` API. More info: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#specifying-your-own-pod-selector",
"template": "Describes the pod that will be created when executing a job. The only allowed template.spec.restartPolicy values are \"Never\" or \"OnFailure\". More info: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/",
@ -144,7 +144,7 @@ var map_JobStatus = map[string]string{
"failed": "The number of pods which reached phase Failed. The value increases monotonically.",
"terminating": "The number of pods which are terminating (in phase Pending or Running and have a deletionTimestamp).\n\nThis field is beta-level. The job controller populates the field when the feature gate JobPodReplacementPolicy is enabled (enabled by default).",
"completedIndexes": "completedIndexes holds the completed indexes when .spec.completionMode = \"Indexed\" in a text format. The indexes are represented as decimal integers separated by commas. The numbers are listed in increasing order. Three or more consecutive numbers are compressed and represented by the first and last element of the series, separated by a hyphen. For example, if the completed indexes are 1, 3, 4, 5 and 7, they are represented as \"1,3-5,7\".",
"failedIndexes": "FailedIndexes holds the failed indexes when spec.backoffLimitPerIndex is set. The indexes are represented in the text format analogous as for the `completedIndexes` field, ie. they are kept as decimal integers separated by commas. The numbers are listed in increasing order. Three or more consecutive numbers are compressed and represented by the first and last element of the series, separated by a hyphen. For example, if the failed indexes are 1, 3, 4, 5 and 7, they are represented as \"1,3-5,7\". The set of failed indexes cannot overlap with the set of completed indexes.\n\nThis field is beta-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).",
"failedIndexes": "FailedIndexes holds the failed indexes when spec.backoffLimitPerIndex is set. The indexes are represented in the text format analogous as for the `completedIndexes` field, ie. they are kept as decimal integers separated by commas. The numbers are listed in increasing order. Three or more consecutive numbers are compressed and represented by the first and last element of the series, separated by a hyphen. For example, if the failed indexes are 1, 3, 4, 5 and 7, they are represented as \"1,3-5,7\". The set of failed indexes cannot overlap with the set of completed indexes.",
"uncountedTerminatedPods": "uncountedTerminatedPods holds the UIDs of Pods that have terminated but the job controller hasn't yet accounted for in the status counters.\n\nThe job controller creates pods with a finalizer. When a pod terminates (succeeded or failed), the controller does three steps to account for it in the job status:\n\n1. Add the pod UID to the arrays in this field. 2. Remove the pod finalizer. 3. Remove the pod UID from the arrays while increasing the corresponding\n counter.\n\nOld jobs might not be tracked using this field, in which case the field remains null. The structure is empty for finished jobs.",
"ready": "The number of active pods which have a Ready condition and are not terminating (without a deletionTimestamp).",
}
@ -195,7 +195,7 @@ func (PodFailurePolicyOnPodConditionsPattern) SwaggerDoc() map[string]string {
var map_PodFailurePolicyRule = map[string]string{
"": "PodFailurePolicyRule describes how a pod failure is handled when the requirements are met. One of onExitCodes and onPodConditions, but not both, can be used in each rule.",
"action": "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are:\n\n- FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- FailIndex: indicates that the pod's index is marked as Failed and will\n not be restarted.\n This value is beta-level. It can be used when the\n `JobBackoffLimitPerIndex` feature gate is enabled (enabled by default).\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.",
"action": "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are:\n\n- FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- FailIndex: indicates that the pod's index is marked as Failed and will\n not be restarted.\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.",
"onExitCodes": "Represents the requirement on the container exit codes.",
"onPodConditions": "Represents the requirement on the pod conditions. The requirement is represented as a list of pod condition patterns. The requirement is satisfied if at least one pattern matches an actual pod condition. At most 20 elements are allowed.",
}

View File

@ -1109,6 +1109,17 @@
and delete the job. Job MUST be deleted successfully.
release: v1.15
file: test/e2e/apps/job.go
- testname: Ensure that all indexes are executed for an indexed job with backoffLimitPerIndex
despite some failing
codename: '[sig-apps] Job should execute all indexes despite some failing when using
backoffLimitPerIndex [Conformance]'
description: Create an indexed job and ensure that all indexes are either failed
or succeeded, depending on the end state of the corresponding pods. Pods with
odd indexes fail, while the pods with even indexes succeed. Also, verify that
the number of failed pods doubles the number of failing indexes, as the backoffLimitPerIndex=1,
allowing for one pod recreation before marking that index as failed.
release: v1.33
file: test/e2e/apps/job.go
- testname: Jobs, manage lifecycle
codename: '[sig-apps] Job should manage the lifecycle of a job [Conformance]'
description: Attempt to create a suspended Job which MUST succeed. Attempt to patch
@ -1119,6 +1130,15 @@
via a label selector.
release: v1.25
file: test/e2e/apps/job.go
- testname: Mark indexes as failed when the FailIndex action is matched in podFailurePolicy
codename: '[sig-apps] Job should mark indexes as failed when the FailIndex action
is matched in podFailurePolicy [Conformance]'
description: Create an indexed job with backoffLimitPerIndex, and podFailurePolicy
with the FailIndex action. Verify the failed pods matching the pod failure policy
result in marking the corresponding indexes as failed without restarts, despite
backoffLimitPerIndex > 0.
release: v1.33
file: test/e2e/apps/job.go
- testname: Jobs, completion after task failure
codename: '[sig-apps] Job should run a job to completion when tasks sometimes fail
and are locally restarted [Conformance]'
@ -1126,6 +1146,14 @@
the Job MUST execute to completion.
release: v1.16
file: test/e2e/apps/job.go
- testname: Terminate job execution when the maxFailedIndexes is exceeded
codename: '[sig-apps] Job should terminate job execution when the number of failed
indexes exceeds maxFailedIndexes [Conformance]'
description: Create an indexed job with backoffLimitPerIndex and maxFailedIndexes.
Verify the job execution is terminated as soon as the number of failed indexes
exceeds maxFailedIndexes.
release: v1.33
file: test/e2e/apps/job.go
- testname: ReplicaSet, is created, Replaced and Patched
codename: '[sig-apps] ReplicaSet Replace and Patch tests [Conformance]'
description: Create a ReplicaSet (RS) with a single Pod. The Pod MUST be verified

View File

@ -593,13 +593,14 @@ done`}
})
/*
Testcase: Ensure that all indexes are executed for an indexed job with backoffLimitPerIndex despite some failing
Release: v1.33
Testname: Ensure that all indexes are executed for an indexed job with backoffLimitPerIndex despite some failing
Description: Create an indexed job and ensure that all indexes are either failed or succeeded, depending
on the end state of the corresponding pods. Pods with odd indexes fail, while the pods with even indexes
succeed. Also, verify that the number of failed pods doubles the number of failing indexes, as the
backoffLimitPerIndex=1, allowing for one pod recreation before marking that index as failed.
*/
ginkgo.It("should execute all indexes despite some failing when using backoffLimitPerIndex", func(ctx context.Context) {
framework.ConformanceIt("should execute all indexes despite some failing when using backoffLimitPerIndex", func(ctx context.Context) {
parallelism := int32(2)
completions := int32(4)
backoffLimit := int32(6) // default value
@ -627,12 +628,13 @@ done`}
})
/*
Testcase: Terminate job execution when the maxFailedIndexes is exceeded
Release: v1.33
Testname: Terminate job execution when the maxFailedIndexes is exceeded
Description: Create an indexed job with backoffLimitPerIndex and maxFailedIndexes.
Verify the job execution is terminated as soon as the number of failed
indexes exceeds maxFailedIndexes.
*/
ginkgo.It("should terminate job execution when the number of failed indexes exceeds maxFailedIndexes", func(ctx context.Context) {
framework.ConformanceIt("should terminate job execution when the number of failed indexes exceeds maxFailedIndexes", func(ctx context.Context) {
// we use parallelism=1 to make sure in the asserts only one pod was created
parallelism := int32(1)
completions := int32(4)
@ -712,13 +714,14 @@ done`}
})
/*
Testcase: Mark indexes as failed when the FailIndex action is matched in podFailurePolicy
Release: v1.33
Testname: Mark indexes as failed when the FailIndex action is matched in podFailurePolicy
Description: Create an indexed job with backoffLimitPerIndex, and podFailurePolicy
with the FailIndex action. Verify the failed pods matching the pod failure policy
result in marking the corresponding indexes as failed without restarts, despite
backoffLimitPerIndex > 0.
*/
ginkgo.It("should mark indexes as failed when the FailIndex action is matched in podFailurePolicy", func(ctx context.Context) {
framework.ConformanceIt("should mark indexes as failed when the FailIndex action is matched in podFailurePolicy", func(ctx context.Context) {
parallelism := int32(2)
completions := int32(2)
backoffLimit := int32(6) // default value

View File

@ -560,6 +560,10 @@
lockToDefault: false
preRelease: Beta
version: "1.29"
- default: true
lockToDefault: true
preRelease: GA
version: "1.33"
- name: JobManagedBy
versionedSpecs:
- default: false

View File

@ -37,6 +37,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
utilversion "k8s.io/apimachinery/pkg/util/version"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apimachinery/pkg/watch"
"k8s.io/apiserver/pkg/util/feature"
@ -824,6 +825,10 @@ func TestSuccessPolicy(t *testing.T) {
t.Run(name, func(t *testing.T) {
resetMetrics()
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobSuccessPolicy, tc.enableJobSuccessPolicy)
if !tc.enableBackoffLimitPerIndex {
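// JobBackoffLimitPerIndex is GA and locked to its default since 1.33, so
// disabling it requires emulating 1.32.
// TODO: remove this emulation-version override in 1.36.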
featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, feature.DefaultFeatureGate, utilversion.MustParse("1.32"))
}
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableBackoffLimitPerIndex)
ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
@ -1028,6 +1033,7 @@ func TestBackoffLimitPerIndex_DelayedPodDeletion(t *testing.T) {
// TestBackoffLimitPerIndex_Reenabling tests handling of pod failures when
// reenabling the BackoffLimitPerIndex feature.
func TestBackoffLimitPerIndex_Reenabling(t *testing.T) {
featuregatetesting.SetFeatureGateEmulationVersionDuringTest(t, feature.DefaultFeatureGate, utilversion.MustParse("1.32"))
t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)
@ -3425,6 +3431,10 @@ func BenchmarkLargeIndexedJob(b *testing.B) {
for name, tc := range cases {
b.Run(name, func(b *testing.B) {
enableJobBackoffLimitPerIndex := tc.backoffLimitPerIndex != nil
if !enableJobBackoffLimitPerIndex {
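// JobBackoffLimitPerIndex is GA and locked to its default since 1.33, so
// disabling it requires emulating 1.32.
// TODO: remove this emulation-version override in 1.36.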
featuregatetesting.SetFeatureGateEmulationVersionDuringTest(b, feature.DefaultFeatureGate, utilversion.MustParse("1.32"))
}
featuregatetesting.SetFeatureGateDuringTest(b, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, enableJobBackoffLimitPerIndex)
b.ResetTimer()
for n := 0; n < b.N; n++ {
@ -3512,6 +3522,10 @@ func BenchmarkLargeFailureHandling(b *testing.B) {
b.Run(name, func(b *testing.B) {
enableJobBackoffLimitPerIndex := tc.backoffLimitPerIndex != nil
timeout := ptr.Deref(tc.customTimeout, wait.ForeverTestTimeout)
if !enableJobBackoffLimitPerIndex {
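// JobBackoffLimitPerIndex is GA and locked to its default since 1.33, so
// disabling it requires emulating 1.32.
// TODO: remove this emulation-version override in 1.36.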
featuregatetesting.SetFeatureGateEmulationVersionDuringTest(b, feature.DefaultFeatureGate, utilversion.MustParse("1.32"))
}
featuregatetesting.SetFeatureGateDuringTest(b, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, enableJobBackoffLimitPerIndex)
b.ResetTimer()
for n := 0; n < b.N; n++ {