Merge pull request #101292 from AliceZhang2016/job_controller_metrics

Graduate indexed job to beta
Kubernetes Prow Robot
2021-05-07 13:31:44 -07:00
committed by GitHub
14 changed files with 143 additions and 23 deletions

View File

@@ -170,7 +170,7 @@ func getCompletionIndex(annotations map[string]string) int {
if annotations == nil {
return unknownCompletionIndex
}
-v, ok := annotations[batch.JobCompletionIndexAnnotationAlpha]
+v, ok := annotations[batch.JobCompletionIndexAnnotation]
if !ok {
return unknownCompletionIndex
}
@@ -203,7 +203,7 @@ func addCompletionIndexEnvVariable(container *v1.Container) {
Name: completionIndexEnvName,
ValueFrom: &v1.EnvVarSource{
FieldRef: &v1.ObjectFieldSelector{
-FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotationAlpha),
+FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotation),
},
},
})
@@ -213,7 +213,7 @@ func addCompletionIndexAnnotation(template *v1.PodTemplateSpec, index int) {
if template.Annotations == nil {
template.Annotations = make(map[string]string, 1)
}
-template.Annotations[batch.JobCompletionIndexAnnotationAlpha] = strconv.Itoa(index)
+template.Annotations[batch.JobCompletionIndexAnnotation] = strconv.Itoa(index)
}
type byCompletionIndex []*v1.Pod
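For context on the annotation rename above: the controller publishes the completion index both as a pod annotation and, through the downward API, as the JOB_COMPLETION_INDEX environment variable (see the test expectation further down). A minimal, illustrative sketch of how a workload inside the pod might consume it; this program is not part of the change:

package main

import (
	"fmt"
	"os"
	"strconv"
)

func main() {
	// JOB_COMPLETION_INDEX is populated from the pod annotation via the
	// downward-API field reference wired up by addCompletionIndexEnvVariable.
	raw, ok := os.LookupEnv("JOB_COMPLETION_INDEX")
	if !ok {
		fmt.Println("not running as part of an Indexed Job")
		return
	}
	idx, err := strconv.Atoi(raw)
	if err != nil {
		fmt.Printf("unexpected index %q: %v\n", raw, err)
		return
	}
	fmt.Printf("processing shard %d\n", idx)
}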

View File

@@ -279,7 +279,7 @@ func hollowPodsWithIndexPhase(descs []indexPhase) []*v1.Pod {
}
if desc.Index != noIndex {
p.Annotations = map[string]string{
-batch.JobCompletionIndexAnnotationAlpha: desc.Index,
+batch.JobCompletionIndexAnnotation: desc.Index,
}
}
pods = append(pods, p)
@@ -297,7 +297,7 @@ func toIndexPhases(pods []*v1.Pod) []indexPhase {
for i, p := range pods {
index := noIndex
if p.Annotations != nil {
-index = p.Annotations[batch.JobCompletionIndexAnnotationAlpha]
+index = p.Annotations[batch.JobCompletionIndexAnnotation]
}
result[i] = indexPhase{index, p.Status.Phase}
}

View File

@@ -47,6 +47,7 @@ import (
"k8s.io/component-base/metrics/prometheus/ratelimiter"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/controller"
+"k8s.io/kubernetes/pkg/controller/job/metrics"
"k8s.io/kubernetes/pkg/features"
"k8s.io/utils/integer"
)
@@ -60,7 +61,8 @@ var (
// DefaultJobBackOff is the default backoff period, exported for the e2e test
DefaultJobBackOff = 10 * time.Second
// MaxJobBackOff is the max backoff period, exported for the e2e test
-MaxJobBackOff = 360 * time.Second
+MaxJobBackOff = 360 * time.Second
+maxPodCreateDeletePerSync = 500
)
// Controller ensures that all Job objects have corresponding pods to
@@ -139,6 +141,8 @@ func NewController(podInformer coreinformers.PodInformer, jobInformer batchinfor
jm.updateHandler = jm.updateJobStatus
jm.syncHandler = jm.syncJob
+metrics.Register()
return jm
}
@@ -440,7 +444,7 @@ func (jm *Controller) getPodsForJob(j *batch.Job) ([]*v1.Pod, error) {
// syncJob will sync the job with the given key if it has had its expectations fulfilled, meaning
// it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
// concurrently with the same key.
-func (jm *Controller) syncJob(key string) (bool, error) {
+func (jm *Controller) syncJob(key string) (forget bool, rErr error) {
startTime := time.Now()
defer func() {
klog.V(4).Infof("Finished syncing job %q (%v)", key, time.Since(startTime))
@@ -480,6 +484,21 @@ func (jm *Controller) syncJob(key string) (bool, error) {
return false, nil
}
+completionMode := string(batch.NonIndexedCompletion)
+if isIndexedJob(&job) {
+completionMode = string(batch.IndexedCompletion)
+}
+defer func() {
+result := "success"
+if rErr != nil {
+result = "error"
+}
+metrics.JobSyncDurationSeconds.WithLabelValues(completionMode, result).Observe(time.Since(startTime).Seconds())
+metrics.JobSyncNum.WithLabelValues(completionMode, result).Inc()
+}()
// Check the expectations of the job before counting active pods, otherwise a new pod can sneak in
// and update the expectations after we've retrieved active pods from the store. If a new pod enters
// the store after we've checked the expectation, the job sync is just deferred till the next relist.
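The signature change from (bool, error) to named returns (forget bool, rErr error) is what allows the deferred closure above to see the error syncJob actually returns and label the metrics accordingly. A small self-contained sketch of the same pattern, with illustrative names that are not part of the change:

package example

import (
	"time"

	"k8s.io/component-base/metrics"
)

// syncDuration stands in for JobSyncDurationSeconds, with the labels reduced
// to just "result" to keep the sketch small.
var syncDuration = metrics.NewHistogramVec(
	&metrics.HistogramOpts{Name: "example_sync_duration_seconds"},
	[]string{"result"},
)

// doSync uses a named error return so the deferred closure can inspect the
// value that is actually returned, whichever return path is taken.
func doSync() (rErr error) {
	start := time.Now()
	defer func() {
		result := "success"
		if rErr != nil {
			result = "error"
		}
		syncDuration.WithLabelValues(result).Observe(time.Since(start).Seconds())
	}()
	// ... work that may assign rErr or return an error directly ...
	return nil
}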
@@ -546,6 +565,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
job.Status.Conditions = append(job.Status.Conditions, newCondition(batch.JobFailed, v1.ConditionTrue, failureReason, failureMessage))
jobConditionsChanged = true
jm.recorder.Event(&job, v1.EventTypeWarning, failureReason, failureMessage)
+metrics.JobFinishedNum.WithLabelValues(completionMode, "failed").Inc()
} else {
if jobNeedsSync && job.DeletionTimestamp == nil {
active, manageJobErr = jm.manageJob(&job, activePods, succeeded, pods)
@@ -581,6 +601,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
now := metav1.Now()
job.Status.CompletionTime = &now
jm.recorder.Event(&job, v1.EventTypeNormal, "Completed", "Job completed")
+metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded").Inc()
} else if utilfeature.DefaultFeatureGate.Enabled(features.SuspendJob) && manageJobCalled {
// Update the conditions / emit events only if manageJob was called in
// this syncJob. Otherwise wait for the right syncJob call to make
@@ -613,7 +634,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
}
}
-forget := false
+forget = false
// Check if the number of jobs succeeded increased since the last check. If yes "forget" should be true
// This logic is linked to the issue: https://github.com/kubernetes/kubernetes/issues/56853 that aims to
// improve the Job backoff policy when parallelism > 1 and few Jobs failed but others succeed.
@@ -783,6 +804,9 @@ func (jm *Controller) manageJob(job *batch.Job, activePods []*v1.Pod, succeeded
rmAtLeast = 0
}
podsToDelete := activePodsForRemoval(job, activePods, int(rmAtLeast))
+if len(podsToDelete) > maxPodCreateDeletePerSync {
+podsToDelete = podsToDelete[:maxPodCreateDeletePerSync]
+}
if len(podsToDelete) > 0 {
jm.expectations.ExpectDeletions(jobKey, len(podsToDelete))
klog.V(4).InfoS("Too many pods running for job", "job", klog.KObj(job), "deleted", len(podsToDelete), "target", parallelism)
@@ -803,6 +827,10 @@ func (jm *Controller) manageJob(job *batch.Job, activePods []*v1.Pod, succeeded
return active, nil
}
+if diff > int32(maxPodCreateDeletePerSync) {
+diff = int32(maxPodCreateDeletePerSync)
+}
jm.expectations.ExpectCreations(jobKey, int(diff))
errCh := make(chan error, diff)
klog.V(4).Infof("Too few pods running job %q, need %d, creating %d", jobKey, wantActive, diff)
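Both of these hunks clamp the work done in a single sync to maxPodCreateDeletePerSync (500): excess deletions are trimmed from podsToDelete, excess creations are trimmed from diff, and the remainder is handled by later syncs once the recorded expectations are observed. A rough illustration of the clamping arithmetic, using an assumed backlog of 1200 pods and ignoring the controller's other batching and expectation bookkeeping:

package main

import "fmt"

// maxPodCreateDeletePerSync mirrors the new cap introduced in this change.
const maxPodCreateDeletePerSync = 500

// capPerSync mirrors the clamping applied above to both podsToDelete and diff.
func capPerSync(want int32) int32 {
	if want > int32(maxPodCreateDeletePerSync) {
		return int32(maxPodCreateDeletePerSync)
	}
	return want
}

func main() {
	remaining := int32(1200) // hypothetical backlog of pods still to create
	for syncNo := 1; remaining > 0; syncNo++ {
		batch := capPerSync(remaining)
		fmt.Printf("sync %d: create %d pods\n", syncNo, batch)
		remaining -= batch
	}
	// Prints batches of 500, 500 and then 200.
}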

View File

@@ -149,7 +149,7 @@ func setPodsStatusesWithIndexes(podIndexer cache.Indexer, job *batch.Job, status
p.Status = v1.PodStatus{Phase: s.Phase}
if s.Index != noIndex {
p.Annotations = map[string]string{
-batch.JobCompletionIndexAnnotationAlpha: s.Index,
+batch.JobCompletionIndexAnnotation: s.Index,
}
}
podIndexer.Add(p)
@@ -2176,7 +2176,7 @@ func checkJobCompletionEnvVariable(t *testing.T, spec *v1.PodSpec) {
Name: "JOB_COMPLETION_INDEX",
ValueFrom: &v1.EnvVarSource{
FieldRef: &v1.ObjectFieldSelector{
-FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotationAlpha),
+FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotation),
},
},
},

View File

@@ -0,0 +1,75 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics

import (
	"sync"

	"k8s.io/component-base/metrics"
	"k8s.io/component-base/metrics/legacyregistry"
)

// JobControllerSubsystem - subsystem name used for this controller.
const JobControllerSubsystem = "job_controller"

var (
	// JobSyncDurationSeconds tracks the latency of job syncs as
	// completion_mode = Indexed / NonIndexed and result = success / error.
	JobSyncDurationSeconds = metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Subsystem:      JobControllerSubsystem,
			Name:           "job_sync_duration_seconds",
			Help:           "The time it took to sync a job",
			StabilityLevel: metrics.ALPHA,
			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
		},
		[]string{"completion_mode", "result"},
	)
	// JobSyncNum tracks the number of job syncs as
	// completion_mode = Indexed / NonIndexed and result = success / error.
	JobSyncNum = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      JobControllerSubsystem,
			Name:           "job_sync_total",
			Help:           "The number of job syncs",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"completion_mode", "result"},
	)
	// JobFinishedNum tracks the number of jobs that finish as
	// completion_mode = Indexed / NonIndexed and result = failed / succeeded.
	JobFinishedNum = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      JobControllerSubsystem,
			Name:           "job_finished_total",
			Help:           "The number of finished job",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"completion_mode", "result"},
	)
)

var registerMetrics sync.Once

// Register registers Job controller metrics.
func Register() {
	registerMetrics.Do(func() {
		legacyregistry.MustRegister(JobSyncDurationSeconds)
		legacyregistry.MustRegister(JobSyncNum)
		legacyregistry.MustRegister(JobFinishedNum)
	})
}
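For reference, a minimal sketch (not part of the change) of how the controller code above exercises these metrics: Register is idempotent thanks to sync.Once, and each sync records one histogram observation and one counter increment labelled with the completion mode and result. With the job_controller subsystem, they are exported as job_controller_job_sync_duration_seconds, job_controller_job_sync_total and job_controller_job_finished_total.

package main

import (
	"time"

	"k8s.io/kubernetes/pkg/controller/job/metrics"
)

func main() {
	metrics.Register() // safe to call more than once; guarded by sync.Once

	start := time.Now()
	// ... a hypothetical successful sync of an Indexed Job ...
	metrics.JobSyncDurationSeconds.WithLabelValues("Indexed", "success").Observe(time.Since(start).Seconds())
	metrics.JobSyncNum.WithLabelValues("Indexed", "success").Inc()

	// When a Job finishes, the controller also increments the finished counter.
	metrics.JobFinishedNum.WithLabelValues("Indexed", "succeeded").Inc()
}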