Mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-08-12 05:21:58 +00:00
Merge pull request #120175 from kannon92/move-pod-failure-policy-constant

move reasons to api package for job controller

Commit fd5f36e6a0
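With these reasons exported from the batch/v1 API package, consumers no longer need to hard-code the condition reason strings. A minimal sketch of what that enables, assuming a hand-built Job object and a hypothetical jobFailureReason helper (neither is part of this change):

```go
package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
)

// jobFailureReason returns the reason of the Job's Failed condition,
// or "" if the Job has not (yet) failed.
func jobFailureReason(job *batchv1.Job) string {
	for _, c := range job.Status.Conditions {
		if c.Type == batchv1.JobFailed && c.Status == corev1.ConditionTrue {
			return c.Reason
		}
	}
	return ""
}

func main() {
	// A Job that the controller marked as failed via its pod failure policy.
	job := &batchv1.Job{
		Status: batchv1.JobStatus{
			Conditions: []batchv1.JobCondition{{
				Type:   batchv1.JobFailed,
				Status: corev1.ConditionTrue,
				Reason: batchv1.JobReasonPodFailurePolicy,
			}},
		},
	}
	// Compare against the exported constants rather than string literals.
	switch jobFailureReason(job) {
	case batchv1.JobReasonPodFailurePolicy:
		fmt.Println("failed: a pod matched a FailJob pod failure policy rule")
	case batchv1.JobReasonBackoffLimitExceeded:
		fmt.Println("failed: backoff limit exceeded")
	case batchv1.JobReasonDeadlineExceeded:
		fmt.Println("failed: active deadline exceeded")
	}
}
```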
@@ -56,12 +56,6 @@ import (
 	"k8s.io/utils/pointer"
 )
 
-const (
-	// PodFailurePolicy reason indicates a job failure condition is added due to
-	// a failed pod matching a pod failure policy rule
-	jobConditionReasonPodFailurePolicy = "PodFailurePolicy"
-)
-
 // controllerKind contains the schema.GroupVersionKind for this controller type.
 var controllerKind = batch.SchemeGroupVersion.WithKind("Job")
 
@@ -85,6 +79,15 @@ var (
 	MaxPodCreateDeletePerSync = 500
 )
 
+const (
+	// MaxFailedIndexesExceeded indicates that an index of a job failed
+	// https://kep.k8s.io/3850
+	// In Beta, this should be moved to staging as an API field.
+	jobReasonMaxFailedIndexesExceeded string = "MaxFailedIndexesExceeded"
+	// FailedIndexes means Job has failed indexes.
+	jobReasonFailedIndexes string = "FailedIndexes"
+)
+
 // Controller ensures that all Job objects have corresponding pods to
 // run their configured workload.
 type Controller struct {
@@ -816,16 +819,16 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
 			jobCtx.finishedCondition = newFailedConditionForFailureTarget(failureTargetCondition, jm.clock.Now())
 		} else if failJobMessage := getFailJobMessage(&job, pods); failJobMessage != nil {
 			// Prepare the interim FailureTarget condition to record the failure message before the finalizers (allowing removal of the pods) are removed.
-			jobCtx.finishedCondition = newCondition(batch.JobFailureTarget, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, *failJobMessage, jm.clock.Now())
+			jobCtx.finishedCondition = newCondition(batch.JobFailureTarget, v1.ConditionTrue, batch.JobReasonPodFailurePolicy, *failJobMessage, jm.clock.Now())
 		}
 	}
 	if jobCtx.finishedCondition == nil {
 		if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) {
 			// check if the number of pod restart exceeds backoff (for restart OnFailure only)
 			// OR if the number of failed jobs increased since the last syncJob
-			jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "BackoffLimitExceeded", "Job has reached the specified backoff limit", jm.clock.Now())
+			jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, batch.JobReasonBackoffLimitExceeded, "Job has reached the specified backoff limit", jm.clock.Now())
 		} else if jm.pastActiveDeadline(&job) {
-			jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "DeadlineExceeded", "Job was active longer than specified deadline", jm.clock.Now())
+			jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, batch.JobReasonDeadlineExceeded, "Job was active longer than specified deadline", jm.clock.Now())
 		} else if job.Spec.ActiveDeadlineSeconds != nil && !jobSuspended(&job) {
 			syncDuration := time.Duration(*job.Spec.ActiveDeadlineSeconds)*time.Second - jm.clock.Since(job.Status.StartTime.Time)
 			logger.V(2).Info("Job has activeDeadlineSeconds configuration. Will sync this job again", "key", key, "nextSyncIn", syncDuration)
@@ -840,9 +843,9 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
 		jobCtx.failedIndexes = calculateFailedIndexes(logger, &job, pods)
 		if jobCtx.finishedCondition == nil {
 			if job.Spec.MaxFailedIndexes != nil && jobCtx.failedIndexes.total() > int(*job.Spec.MaxFailedIndexes) {
-				jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "MaxFailedIndexesExceeded", "Job has exceeded the specified maximal number of failed indexes", jm.clock.Now())
+				jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, jobReasonMaxFailedIndexesExceeded, "Job has exceeded the specified maximal number of failed indexes", jm.clock.Now())
 			} else if jobCtx.failedIndexes.total() > 0 && jobCtx.failedIndexes.total()+jobCtx.succeededIndexes.total() >= int(*job.Spec.Completions) {
-				jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, "FailedIndexes", "Job has failed indexes", jm.clock.Now())
+				jobCtx.finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, jobReasonFailedIndexes, "Job has failed indexes", jm.clock.Now())
 			}
 		}
 		jobCtx.podsWithDelayedDeletionPerIndex = getPodsWithDelayedDeletionPerIndex(logger, jobCtx)
@@ -1909,7 +1909,7 @@ func TestSyncJobPastDeadline(t *testing.T) {
 			expectedDeletions: 1,
 			expectedFailed: 1,
 			expectedCondition: batch.JobFailed,
-			expectedConditionReason: "DeadlineExceeded",
+			expectedConditionReason: batch.JobReasonDeadlineExceeded,
 		},
 		"activeDeadlineSeconds bigger than single pod execution": {
 			parallelism: 1,
@@ -1923,7 +1923,7 @@ func TestSyncJobPastDeadline(t *testing.T) {
 			expectedSucceeded: 1,
 			expectedFailed: 1,
 			expectedCondition: batch.JobFailed,
-			expectedConditionReason: "DeadlineExceeded",
+			expectedConditionReason: batch.JobReasonDeadlineExceeded,
 		},
 		"activeDeadlineSeconds times-out before any pod starts": {
 			parallelism: 1,
@@ -1932,7 +1932,7 @@ func TestSyncJobPastDeadline(t *testing.T) {
 			startTime: 10,
 			backoffLimit: 6,
 			expectedCondition: batch.JobFailed,
-			expectedConditionReason: "DeadlineExceeded",
+			expectedConditionReason: batch.JobReasonDeadlineExceeded,
 		},
 		"activeDeadlineSeconds with backofflimit reach": {
 			parallelism: 1,
@@ -1942,7 +1942,7 @@ func TestSyncJobPastDeadline(t *testing.T) {
 			failedPods: 1,
 			expectedFailed: 1,
 			expectedCondition: batch.JobFailed,
-			expectedConditionReason: "BackoffLimitExceeded",
+			expectedConditionReason: batch.JobReasonBackoffLimitExceeded,
 		},
 		"activeDeadlineSeconds is not triggered when Job is suspended": {
 			suspend: true,
@@ -2098,7 +2098,7 @@ func TestPastDeadlineJobFinished(t *testing.T) {
 		if err != nil {
 			return false, nil
 		}
-		if getCondition(j, batch.JobFailed, v1.ConditionTrue, "DeadlineExceeded") {
+		if getCondition(j, batch.JobFailed, v1.ConditionTrue, batch.JobReasonDeadlineExceeded) {
 			if manager.clock.Since(j.Status.StartTime.Time) < time.Duration(*j.Spec.ActiveDeadlineSeconds)*time.Second {
 				return true, errors.New("Job contains DeadlineExceeded condition earlier than expected")
 			}
@@ -2397,7 +2397,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
 				{
 					Type: batch.JobFailed,
 					Status: v1.ConditionTrue,
-					Reason: "PodFailurePolicy",
+					Reason: batch.JobReasonPodFailurePolicy,
 					Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
 				},
 			},
@@ -2425,7 +2425,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
 				{
 					Type: batch.JobFailureTarget,
 					Status: v1.ConditionTrue,
-					Reason: "PodFailurePolicy",
+					Reason: batch.JobReasonPodFailurePolicy,
 					Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
 				},
 			},
@@ -2452,7 +2452,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
 				{
 					Type: batch.JobFailed,
 					Status: v1.ConditionTrue,
-					Reason: "PodFailurePolicy",
+					Reason: batch.JobReasonPodFailurePolicy,
 					Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
 				},
 			},
@@ -2480,7 +2480,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
 				{
 					Type: batch.JobFailureTarget,
 					Status: v1.ConditionTrue,
-					Reason: "PodFailurePolicy",
+					Reason: batch.JobReasonPodFailurePolicy,
 					Message: "Container main-container for pod default/already-deleted-pod failed with exit code 5 matching FailJob rule at index 1",
 				},
 			},
@@ -2507,7 +2507,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
 				{
 					Type: batch.JobFailed,
 					Status: v1.ConditionTrue,
-					Reason: "PodFailurePolicy",
+					Reason: batch.JobReasonPodFailurePolicy,
 					Message: "Container main-container for pod default/already-deleted-pod failed with exit code 5 matching FailJob rule at index 1",
 				},
 			},
@@ -2596,7 +2596,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
 				{
 					Type: batch.JobFailed,
 					Status: v1.ConditionTrue,
-					Reason: "PodFailurePolicy",
+					Reason: batch.JobReasonPodFailurePolicy,
 					Message: "Container main-container for pod default/mypod-1 failed with exit code 5 matching FailJob rule at index 1",
 				},
 			},
@@ -2642,7 +2642,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
 				{
 					Type: batch.JobFailed,
 					Status: v1.ConditionTrue,
-					Reason: "PodFailurePolicy",
+					Reason: batch.JobReasonPodFailurePolicy,
 					Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
 				},
 			},
@@ -2695,7 +2695,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
 				{
 					Type: batch.JobFailed,
 					Status: v1.ConditionTrue,
-					Reason: "PodFailurePolicy",
+					Reason: batch.JobReasonPodFailurePolicy,
 					Message: "Container main-container for pod default/mypod-0 failed with exit code 42 matching FailJob rule at index 0",
 				},
 			},
@@ -2797,7 +2797,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
 				{
 					Type: batch.JobFailed,
 					Status: v1.ConditionTrue,
-					Reason: "PodFailurePolicy",
+					Reason: batch.JobReasonPodFailurePolicy,
 					Message: "Container init-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
 				},
 			},
@@ -2924,7 +2924,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
 				{
 					Type: batch.JobFailed,
 					Status: v1.ConditionTrue,
-					Reason: "BackoffLimitExceeded",
+					Reason: batch.JobReasonBackoffLimitExceeded,
 					Message: "Job has reached the specified backoff limit",
 				},
 			},
@@ -3185,7 +3185,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
 				{
 					Type: batch.JobFailed,
 					Status: v1.ConditionTrue,
-					Reason: "PodFailurePolicy",
+					Reason: batch.JobReasonPodFailurePolicy,
 					Message: "Pod default/mypod-0 has condition DisruptionTarget matching FailJob rule at index 0",
 				},
 			},
@@ -3571,13 +3571,13 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 				{
 					Type: batch.JobFailureTarget,
 					Status: v1.ConditionTrue,
-					Reason: "PodFailurePolicy",
+					Reason: batch.JobReasonPodFailurePolicy,
 					Message: "Container x for pod default/mypod-0 failed with exit code 3 matching FailJob rule at index 0",
 				},
 				{
 					Type: batch.JobFailed,
 					Status: v1.ConditionTrue,
-					Reason: "PodFailurePolicy",
+					Reason: batch.JobReasonPodFailurePolicy,
 					Message: "Container x for pod default/mypod-0 failed with exit code 3 matching FailJob rule at index 0",
 				},
 			},
@@ -3660,7 +3660,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 				{
 					Type: batch.JobFailed,
 					Status: v1.ConditionTrue,
-					Reason: "BackoffLimitExceeded",
+					Reason: batch.JobReasonBackoffLimitExceeded,
 					Message: "Job has reached the specified backoff limit",
 				},
 			},
@@ -3695,7 +3695,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 				{
 					Type: batch.JobFailed,
 					Status: v1.ConditionTrue,
-					Reason: "FailedIndexes",
+					Reason: jobReasonFailedIndexes,
 					Message: "Job has failed indexes",
 				},
 			},
@@ -3733,7 +3733,7 @@ func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) {
 				{
 					Type: batch.JobFailed,
 					Status: v1.ConditionTrue,
-					Reason: "MaxFailedIndexesExceeded",
+					Reason: jobReasonMaxFailedIndexesExceeded,
 					Message: "Job has exceeded the specified maximal number of failed indexes",
 				},
 			},
@@ -535,6 +535,19 @@ const (
 	JobFailureTarget JobConditionType = "FailureTarget"
 )
 
+const (
+	// JobReasonPodFailurePolicy reason indicates a job failure condition is added due to
+	// a failed pod matching a pod failure policy rule
+	// https://kep.k8s.io/3329
+	// This is currently a beta field.
+	JobReasonPodFailurePolicy string = "PodFailurePolicy"
+	// JobReasonBackoffLimitExceeded reason indicates that pods within a job have failed a number of
+	// times higher than backOffLimit times.
+	JobReasonBackoffLimitExceeded string = "BackoffLimitExceeded"
+	// JobReasonDeadlineExceeded means job duration is past ActiveDeadline
+	JobReasonDeadlineExceeded string = "DeadlineExceeded"
+)
+
 // JobCondition describes current state of a job.
 type JobCondition struct {
 	// Type of job condition, Complete or Failed.