job: Ignore namespace termination errors when creating pods or jobs

Instead of reporting an event or displaying an error, simply exit
when the namespace is being terminated. This reduces the amount of
controller churn on namespace shutdown. While we could technically
exit the entire processing loop early for very large jobs,
we should wait for more evidence that this is an issue before changing
that logic substantially.
This commit is contained in:
Clayton Coleman 2019-10-20 16:25:44 -04:00
parent 8f74c8970b
commit c6e34e58c5
No known key found for this signature in database
GPG Key ID: 3D16906B4F1C5CB3
3 changed files with 23 additions and 10 deletions

View File

@ -20,6 +20,7 @@ go_library(
"//staging/src/k8s.io/api/batch/v1:go_default_library", "//staging/src/k8s.io/api/batch/v1:go_default_library",
"//staging/src/k8s.io/api/batch/v1beta1:go_default_library", "//staging/src/k8s.io/api/batch/v1beta1:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library", "//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library",

View File

@ -39,6 +39,7 @@ import (
batchv1 "k8s.io/api/batch/v1" batchv1 "k8s.io/api/batch/v1"
batchv1beta1 "k8s.io/api/batch/v1beta1" batchv1beta1 "k8s.io/api/batch/v1beta1"
"k8s.io/api/core/v1" "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/types"
@ -333,7 +334,11 @@ func syncOne(sj *batchv1beta1.CronJob, js []batchv1.Job, now time.Time, jc jobCo
} }
jobResp, err := jc.CreateJob(sj.Namespace, jobReq) jobResp, err := jc.CreateJob(sj.Namespace, jobReq)
if err != nil { if err != nil {
recorder.Eventf(sj, v1.EventTypeWarning, "FailedCreate", "Error creating job: %v", err) // If the namespace is being torn down, we can safely ignore
// this error since all subsequent creations will fail.
if !errors.HasStatusCause(err, v1.NamespaceTerminatingCause) {
recorder.Eventf(sj, v1.EventTypeWarning, "FailedCreate", "Error creating job: %v", err)
}
return return
} }
klog.V(4).Infof("Created Job %s for %s", jobResp.Name, nameForLog) klog.V(4).Infof("Created Job %s for %s", jobResp.Name, nameForLog)

View File

@ -771,15 +771,22 @@ func (jm *JobController) manageJob(activePods []*v1.Pod, succeeded int32, job *b
go func() { go func() {
defer wait.Done() defer wait.Done()
err := jm.podControl.CreatePodsWithControllerRef(job.Namespace, &job.Spec.Template, job, metav1.NewControllerRef(job, controllerKind)) err := jm.podControl.CreatePodsWithControllerRef(job.Namespace, &job.Spec.Template, job, metav1.NewControllerRef(job, controllerKind))
if err != nil && errors.IsTimeout(err) { if err != nil {
// Pod is created but its initialization has timed out. if errors.HasStatusCause(err, v1.NamespaceTerminatingCause) {
// If the initialization is successful eventually, the // If the namespace is being torn down, we can safely ignore
// controller will observe the creation via the informer. // this error since all subsequent creations will fail.
// If the initialization fails, or if the pod keeps return
// uninitialized for a long time, the informer will not }
// receive any update, and the controller will create a new if errors.IsTimeout(err) {
// pod when the expectation expires. // Pod is created but its initialization has timed out.
return // If the initialization is successful eventually, the
// controller will observe the creation via the informer.
// If the initialization fails, or if the pod keeps
// uninitialized for a long time, the informer will not
// receive any update, and the controller will create a new
// pod when the expectation expires.
return
}
} }
if err != nil { if err != nil {
defer utilruntime.HandleError(err) defer utilruntime.HandleError(err)