Add profile label to schedule_attempts_total metric

and e2e_scheduling_duration_seconds

Also adding result label to e2e_scheduling_duration_seconds. Previously, the metric was only updated for successful attempts

Signed-off-by: Aldo Culquicondor <acondor@google.com>
This commit is contained in:
Aldo Culquicondor
2020-06-16 15:50:57 -04:00
parent 9cd906e932
commit eb9711dc1f
6 changed files with 74 additions and 34 deletions

View File

@@ -506,13 +506,13 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
// Pod did not fit anywhere, so it is counted as a failure. If preemption
// succeeds, the pod should get counted as a success the next time we try to
// schedule it. (hopefully)
metrics.PodScheduleFailures.Inc()
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
} else if err == core.ErrNoNodesAvailable {
// No nodes available is counted as unschedulable rather than an error.
metrics.PodScheduleFailures.Inc()
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
} else {
klog.ErrorS(err, "Error selecting node for pod", "pod", klog.KObj(pod))
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
}
sched.recordSchedulingFailure(prof, podInfo, err, v1.PodReasonUnschedulable, nominatedNode)
return
@@ -526,7 +526,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
// Run "reserve" plugins.
if sts := prof.RunReservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() {
sched.recordSchedulingFailure(prof, assumedPodInfo, sts.AsError(), SchedulerError, "")
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
return
}
@@ -539,7 +539,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
// to a node and if so will not add it back to the unscheduled pods queue
// (otherwise this would cause an infinite loop).
sched.recordSchedulingFailure(prof, assumedPodInfo, err, SchedulerError, "")
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
// trigger un-reserve plugins to clean up state associated with the reserved Pod
prof.RunUnreservePlugins(schedulingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
return
@@ -550,10 +550,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
if runPermitStatus.Code() != framework.Wait && !runPermitStatus.IsSuccess() {
var reason string
if runPermitStatus.IsUnschedulable() {
metrics.PodScheduleFailures.Inc()
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
reason = v1.PodReasonUnschedulable
} else {
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
reason = SchedulerError
}
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
@@ -576,10 +576,10 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
if !waitOnPermitStatus.IsSuccess() {
var reason string
if waitOnPermitStatus.IsUnschedulable() {
metrics.PodScheduleFailures.Inc()
metrics.PodUnschedulable(prof.Name, metrics.SinceInSeconds(start))
reason = v1.PodReasonUnschedulable
} else {
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
reason = SchedulerError
}
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
@@ -595,7 +595,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
preBindStatus := prof.RunPreBindPlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
if !preBindStatus.IsSuccess() {
var reason string
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
reason = SchedulerError
if forgetErr := sched.Cache().ForgetPod(assumedPod); forgetErr != nil {
klog.Errorf("scheduler cache ForgetPod failed: %v", forgetErr)
@@ -607,9 +607,8 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
}
err := sched.bind(bindingCycleCtx, prof, assumedPod, scheduleResult.SuggestedHost, state)
metrics.E2eSchedulingLatency.Observe(metrics.SinceInSeconds(start))
if err != nil {
metrics.PodScheduleErrors.Inc()
metrics.PodScheduleError(prof.Name, metrics.SinceInSeconds(start))
// trigger un-reserve plugins to clean up state associated with the reserved Pod
prof.RunUnreservePlugins(bindingCycleCtx, state, assumedPod, scheduleResult.SuggestedHost)
sched.recordSchedulingFailure(prof, assumedPodInfo, fmt.Errorf("Binding rejected: %v", err), SchedulerError, "")
@@ -619,7 +618,7 @@ func (sched *Scheduler) scheduleOne(ctx context.Context) {
klog.InfoS("Successfully bound pod to node", "pod", klog.KObj(pod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
}
metrics.PodScheduleSuccesses.Inc()
metrics.PodScheduled(prof.Name, metrics.SinceInSeconds(start))
metrics.PodSchedulingAttempts.Observe(float64(podInfo.Attempts))
metrics.PodSchedulingDuration.Observe(metrics.SinceInSeconds(podInfo.InitialAttemptTimestamp))