Mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-09-24 03:18:57 +00:00
Merge pull request #101292 from AliceZhang2016/job_controller_metrics
Graduate indexed job to beta
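
For context, here is a minimal sketch (not part of this diff) of an Indexed Job built with the Go API types. It assumes the `CompletionMode` field and `IndexedCompletion` constant from `k8s.io/api/batch/v1` and the `k8s.io/utils/pointer` helpers; the container simply echoes the `JOB_COMPLETION_INDEX` variable that the controller changes below wire up.

```go
package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/pointer"
)

// indexedJob builds a Job that runs five pods, each given a unique
// completion index (0..4) by the job controller.
func indexedJob() *batchv1.Job {
	mode := batchv1.IndexedCompletion
	return &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{Name: "indexed-demo"},
		Spec: batchv1.JobSpec{
			CompletionMode: &mode,
			Completions:    pointer.Int32Ptr(5),
			Parallelism:    pointer.Int32Ptr(2),
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					RestartPolicy: corev1.RestartPolicyNever,
					Containers: []corev1.Container{{
						Name:    "worker",
						Image:   "busybox",
						Command: []string{"sh", "-c", "echo $JOB_COMPLETION_INDEX"},
					}},
				},
			},
		},
	}
}

func main() {
	fmt.Println("completion mode:", *indexedJob().Spec.CompletionMode)
}
```
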
@@ -170,7 +170,7 @@ func getCompletionIndex(annotations map[string]string) int {
 	if annotations == nil {
 		return unknownCompletionIndex
 	}
-	v, ok := annotations[batch.JobCompletionIndexAnnotationAlpha]
+	v, ok := annotations[batch.JobCompletionIndexAnnotation]
 	if !ok {
 		return unknownCompletionIndex
 	}
@@ -203,7 +203,7 @@ func addCompletionIndexEnvVariable(container *v1.Container) {
 		Name: completionIndexEnvName,
 		ValueFrom: &v1.EnvVarSource{
 			FieldRef: &v1.ObjectFieldSelector{
-				FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotationAlpha),
+				FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotation),
 			},
 		},
 	})
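
The downward-API wiring in the hunk above is what lets workload code read its own index via the `JOB_COMPLETION_INDEX` variable (asserted in the test hunk further down). A minimal worker-side sketch, illustrative only and not part of this change; the shard list is hypothetical:

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

func main() {
	// The job controller populates JOB_COMPLETION_INDEX from the pod's
	// completion-index annotation via the downward API.
	idx, err := strconv.Atoi(os.Getenv("JOB_COMPLETION_INDEX"))
	if err != nil {
		fmt.Fprintln(os.Stderr, "completion index not set:", err)
		os.Exit(1)
	}
	// Hypothetical sharding: each index picks a fixed slice of the work.
	shards := []string{"a-f", "g-m", "n-s", "t-z"}
	fmt.Printf("pod index %d processes shard %q\n", idx, shards[idx%len(shards)])
}
```
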
@@ -213,7 +213,7 @@ func addCompletionIndexAnnotation(template *v1.PodTemplateSpec, index int) {
 	if template.Annotations == nil {
 		template.Annotations = make(map[string]string, 1)
 	}
-	template.Annotations[batch.JobCompletionIndexAnnotationAlpha] = strconv.Itoa(index)
+	template.Annotations[batch.JobCompletionIndexAnnotation] = strconv.Itoa(index)
 }
 
 type byCompletionIndex []*v1.Pod
@@ -279,7 +279,7 @@ func hollowPodsWithIndexPhase(descs []indexPhase) []*v1.Pod {
 		}
 		if desc.Index != noIndex {
 			p.Annotations = map[string]string{
-				batch.JobCompletionIndexAnnotationAlpha: desc.Index,
+				batch.JobCompletionIndexAnnotation: desc.Index,
 			}
 		}
 		pods = append(pods, p)
@@ -297,7 +297,7 @@ func toIndexPhases(pods []*v1.Pod) []indexPhase {
 	for i, p := range pods {
 		index := noIndex
 		if p.Annotations != nil {
-			index = p.Annotations[batch.JobCompletionIndexAnnotationAlpha]
+			index = p.Annotations[batch.JobCompletionIndexAnnotation]
 		}
 		result[i] = indexPhase{index, p.Status.Phase}
 	}
@@ -47,6 +47,7 @@ import (
 	"k8s.io/component-base/metrics/prometheus/ratelimiter"
 	"k8s.io/klog/v2"
 	"k8s.io/kubernetes/pkg/controller"
+	"k8s.io/kubernetes/pkg/controller/job/metrics"
 	"k8s.io/kubernetes/pkg/features"
 	"k8s.io/utils/integer"
 )
@@ -60,7 +61,8 @@ var (
 	// DefaultJobBackOff is the default backoff period, exported for the e2e test
 	DefaultJobBackOff = 10 * time.Second
 	// MaxJobBackOff is the max backoff period, exported for the e2e test
-	MaxJobBackOff = 360 * time.Second
+	MaxJobBackOff = 360 * time.Second
+	maxPodCreateDeletePerSync = 500
 )
 
 // Controller ensures that all Job objects have corresponding pods to
@@ -139,6 +141,8 @@ func NewController(podInformer coreinformers.PodInformer, jobInformer batchinfor
 	jm.updateHandler = jm.updateJobStatus
 	jm.syncHandler = jm.syncJob
 
+	metrics.Register()
+
 	return jm
 }
 
@@ -440,7 +444,7 @@ func (jm *Controller) getPodsForJob(j *batch.Job) ([]*v1.Pod, error) {
 // syncJob will sync the job with the given key if it has had its expectations fulfilled, meaning
 // it did not expect to see any more of its pods created or deleted. This function is not meant to be invoked
 // concurrently with the same key.
-func (jm *Controller) syncJob(key string) (bool, error) {
+func (jm *Controller) syncJob(key string) (forget bool, rErr error) {
 	startTime := time.Now()
 	defer func() {
 		klog.V(4).Infof("Finished syncing job %q (%v)", key, time.Since(startTime))
@@ -480,6 +484,21 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 		return false, nil
 	}
 
+	completionMode := string(batch.NonIndexedCompletion)
+	if isIndexedJob(&job) {
+		completionMode = string(batch.IndexedCompletion)
+	}
+
+	defer func() {
+		result := "success"
+		if rErr != nil {
+			result = "error"
+		}
+
+		metrics.JobSyncDurationSeconds.WithLabelValues(completionMode, result).Observe(time.Since(startTime).Seconds())
+		metrics.JobSyncNum.WithLabelValues(completionMode, result).Inc()
+	}()
+
 	// Check the expectations of the job before counting active pods, otherwise a new pod can sneak in
 	// and update the expectations after we've retrieved active pods from the store. If a new pod enters
 	// the store after we've checked the expectation, the job sync is just deferred till the next relist.
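
The signature change above (named results `forget` and `rErr`) is what makes this deferred block work: a deferred closure observes the final value of a named return parameter regardless of which return path fired, so the sync result can be labeled once. A standalone sketch of that pattern, with generic names and plain output standing in for the metric calls:

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// sync demonstrates recording a per-call result from a deferred closure
// that reads the named return value rErr after the function returns.
func sync(fail bool) (rErr error) {
	start := time.Now()
	defer func() {
		result := "success"
		if rErr != nil {
			result = "error"
		}
		// Stand-in for JobSyncDurationSeconds.Observe / JobSyncNum.Inc above.
		fmt.Printf("observed sync: result=%s duration=%s\n", result, time.Since(start))
	}()

	if fail {
		return errors.New("simulated sync failure")
	}
	return nil
}

func main() {
	_ = sync(false)
	_ = sync(true)
}
```
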
@@ -546,6 +565,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 		job.Status.Conditions = append(job.Status.Conditions, newCondition(batch.JobFailed, v1.ConditionTrue, failureReason, failureMessage))
 		jobConditionsChanged = true
 		jm.recorder.Event(&job, v1.EventTypeWarning, failureReason, failureMessage)
+		metrics.JobFinishedNum.WithLabelValues(completionMode, "failed").Inc()
 	} else {
 		if jobNeedsSync && job.DeletionTimestamp == nil {
 			active, manageJobErr = jm.manageJob(&job, activePods, succeeded, pods)
@@ -581,6 +601,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 			now := metav1.Now()
 			job.Status.CompletionTime = &now
 			jm.recorder.Event(&job, v1.EventTypeNormal, "Completed", "Job completed")
+			metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded").Inc()
 		} else if utilfeature.DefaultFeatureGate.Enabled(features.SuspendJob) && manageJobCalled {
 			// Update the conditions / emit events only if manageJob was called in
 			// this syncJob. Otherwise wait for the right syncJob call to make
@@ -613,7 +634,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 		}
 	}
 
-	forget := false
+	forget = false
 	// Check if the number of jobs succeeded increased since the last check. If yes "forget" should be true
 	// This logic is linked to the issue: https://github.com/kubernetes/kubernetes/issues/56853 that aims to
 	// improve the Job backoff policy when parallelism > 1 and few Jobs failed but others succeed.
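
For context on how a `forget` result is typically consumed: the caller of a sync handler usually maps it onto a rate-limited workqueue, dropping the per-key backoff when progress was made and requeuing with backoff on error. The sketch below is a generic illustration using the client-go workqueue API, not the Job controller's actual requeue code (its caller is outside this diff):

```go
package main

import (
	"fmt"

	"k8s.io/client-go/util/workqueue"
)

// processNextItem pops one key, runs sync, and translates (forget, err)
// into Forget / AddRateLimited calls on the queue.
func processNextItem(queue workqueue.RateLimitingInterface, sync func(key string) (bool, error)) bool {
	key, shutdown := queue.Get()
	if shutdown {
		return false
	}
	defer queue.Done(key)

	forget, err := sync(key.(string))
	if err == nil {
		if forget {
			// Progress was made: reset the per-key backoff counter.
			queue.Forget(key)
		}
		return true
	}

	// Requeue with exponential backoff on error.
	fmt.Printf("error syncing %v: %v, requeuing\n", key, err)
	queue.AddRateLimited(key)
	return true
}

func main() {
	q := workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
	q.Add("default/example-job")
	processNextItem(q, func(key string) (bool, error) {
		fmt.Println("syncing", key)
		return true, nil
	})
}
```
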
@@ -783,6 +804,9 @@ func (jm *Controller) manageJob(job *batch.Job, activePods []*v1.Pod, succeeded
 			rmAtLeast = 0
 		}
 		podsToDelete := activePodsForRemoval(job, activePods, int(rmAtLeast))
+		if len(podsToDelete) > maxPodCreateDeletePerSync {
+			podsToDelete = podsToDelete[:maxPodCreateDeletePerSync]
+		}
 		if len(podsToDelete) > 0 {
 			jm.expectations.ExpectDeletions(jobKey, len(podsToDelete))
 			klog.V(4).InfoS("Too many pods running for job", "job", klog.KObj(job), "deleted", len(podsToDelete), "target", parallelism)
@@ -803,6 +827,10 @@ func (jm *Controller) manageJob(job *batch.Job, activePods []*v1.Pod, succeeded
 			return active, nil
 		}
 
+		if diff > int32(maxPodCreateDeletePerSync) {
+			diff = int32(maxPodCreateDeletePerSync)
+		}
+
 		jm.expectations.ExpectCreations(jobKey, int(diff))
 		errCh := make(chan error, diff)
 		klog.V(4).Infof("Too few pods running job %q, need %d, creating %d", jobKey, wantActive, diff)
@@ -149,7 +149,7 @@ func setPodsStatusesWithIndexes(podIndexer cache.Indexer, job *batch.Job, status
 		p.Status = v1.PodStatus{Phase: s.Phase}
 		if s.Index != noIndex {
 			p.Annotations = map[string]string{
-				batch.JobCompletionIndexAnnotationAlpha: s.Index,
+				batch.JobCompletionIndexAnnotation: s.Index,
 			}
 		}
 		podIndexer.Add(p)
@@ -2176,7 +2176,7 @@ func checkJobCompletionEnvVariable(t *testing.T, spec *v1.PodSpec) {
 		Name: "JOB_COMPLETION_INDEX",
 		ValueFrom: &v1.EnvVarSource{
 			FieldRef: &v1.ObjectFieldSelector{
-				FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotationAlpha),
+				FieldPath: fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotation),
 			},
 		},
 	},
pkg/controller/job/metrics/metrics.go (new file, 75 lines)
@@ -0,0 +1,75 @@
+/*
+Copyright 2021 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"sync"
+
+	"k8s.io/component-base/metrics"
+	"k8s.io/component-base/metrics/legacyregistry"
+)
+
+// JobControllerSubsystem - subsystem name used for this controller.
+const JobControllerSubsystem = "job_controller"
+
+var (
+	// JobSyncDurationSeconds tracks the latency of job syncs as
+	// completion_mode = Indexed / NonIndexed and result = success / error.
+	JobSyncDurationSeconds = metrics.NewHistogramVec(
+		&metrics.HistogramOpts{
+			Subsystem:      JobControllerSubsystem,
+			Name:           "job_sync_duration_seconds",
+			Help:           "The time it took to sync a job",
+			StabilityLevel: metrics.ALPHA,
+			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
+		},
+		[]string{"completion_mode", "result"},
+	)
+	// JobSyncNum tracks the number of job syncs as
+	// completion_mode = Indexed / NonIndexed and result = success / error.
+	JobSyncNum = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      JobControllerSubsystem,
+			Name:           "job_sync_total",
+			Help:           "The number of job syncs",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{"completion_mode", "result"},
+	)
+	// JobFinishedNum tracks the number of jobs that finish as
+	// completion_mode = Indexed / NonIndexed and result = failed / succeeded.
+	JobFinishedNum = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      JobControllerSubsystem,
+			Name:           "job_finished_total",
+			Help:           "The number of finished job",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{"completion_mode", "result"},
+	)
+)
+
+var registerMetrics sync.Once
+
+// Register registers Job controller metrics.
+func Register() {
+	registerMetrics.Do(func() {
+		legacyregistry.MustRegister(JobSyncDurationSeconds)
+		legacyregistry.MustRegister(JobSyncNum)
+		legacyregistry.MustRegister(JobFinishedNum)
+	})
+}
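
Taken together with the controller changes above, a minimal usage sketch of this package: Register once (guarded by sync.Once), then record sync latency, sync count, and finished-job counts with completion_mode and result labels. The label values mirror the strings used in syncJob; treat this as an illustration rather than controller code.

```go
package main

import (
	"time"

	"k8s.io/kubernetes/pkg/controller/job/metrics"
)

func main() {
	// Safe to call repeatedly; registration runs only once.
	metrics.Register()

	start := time.Now()
	completionMode := "Indexed" // i.e. string(batch.IndexedCompletion)
	result := "success"

	// Record one simulated sync plus one finished job.
	metrics.JobSyncDurationSeconds.WithLabelValues(completionMode, result).Observe(time.Since(start).Seconds())
	metrics.JobSyncNum.WithLabelValues(completionMode, result).Inc()
	metrics.JobFinishedNum.WithLabelValues(completionMode, "succeeded").Inc()
}
```
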