Merge pull request #39491 from jayunit100/sched_Histogram_error

Automatic merge from submit-queue (batch tested with PRs 34488, 39511, 39619, 38342, 39491)

Surface the FitError message in the pod's condition via the PodConditionUpdater.

Fixes #20064. After a roundabout volley of ideas, we ended up using the existing pod Conditions for this rather than a first-class API object. This is just a quick sketch of the minimal skeleton implementation; it should pretty much "just work". I'll test it more later today.
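To make the intent concrete, here is a small standalone sketch (the node names, reason strings, and the surrounding `main` function are made up for illustration and are not the scheduler's real types): the predicate failure reasons reported for each node are folded into a histogram and rendered as a single human-readable string, which is what gets stored as the message of the pod's `PodScheduled` condition.

```go
package main

import (
	"fmt"
	"sort"
	"strings"
)

func main() {
	// Hypothetical per-node predicate failure reasons; names are made up for illustration.
	failedPredicates := map[string][]string{
		"node-1": {"NodeUnderMemoryPressure"},
		"node-2": {"NodeUnderDiskPressure"},
		"node-3": {"NodeUnderDiskPressure"},
	}

	// Fold the per-node reasons into a histogram: reason -> number of nodes it failed on.
	reasons := map[string]int{}
	for _, nodeReasons := range failedPredicates {
		for _, reason := range nodeReasons {
			reasons[reason]++
		}
	}

	// Render "Reason (count)" entries in a stable, sorted order.
	entries := make([]string, 0, len(reasons))
	for reason, count := range reasons {
		entries = append(entries, fmt.Sprintf("%v (%v)", reason, count))
	}
	sort.Strings(entries)

	// This single string is what ends up as the PodScheduled condition message.
	msg := "No nodes are available that match all of the following predicates: " +
		strings.Join(entries, ", ") + "."
	fmt.Println(msg)
	// Prints something like:
	// No nodes are available that match all of the following predicates: NodeUnderDiskPressure (2), NodeUnderMemoryPressure (1).
}
```

The real aggregation lives in `FitError.Error()` in the first diff below, and the resulting string is attached to the condition in `scheduleOne()` in the last diff.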


Release Note:
```
Histogram data of predicate failures is included in pod conditions and is therefore visible to users via kubectl commands.
```
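For context, a rough illustration of where that message surfaces (the reasons, counts, and exact wording below are hypothetical): the string becomes the `message` of the pod's `PodScheduled` condition, which can be inspected with, for example, `kubectl get pod <pod-name> -o yaml`:
```
status:
  conditions:
  - type: PodScheduled
    status: "False"
    reason: Unschedulable
    message: 'No nodes are available that match all of the following predicates: NodeUnderDiskPressure (2), NodeUnderMemoryPressure (1).'
```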
Kubernetes Submit Queue 2017-01-10 16:07:17 -08:00 committed by GitHub
commit add3a08a6d
3 changed files with 26 additions and 10 deletions


@@ -17,7 +17,6 @@ limitations under the License.
package scheduler
import (
"bytes"
"fmt"
"sort"
"strings"
@@ -45,10 +44,10 @@ type FitError struct {
var ErrNoNodesAvailable = fmt.Errorf("no nodes available to schedule pods")
+ const NoNodeAvailableMsg = "No nodes are available that match all of the following predicates:"
// Error returns detailed information of why the pod failed to fit on each node
func (f *FitError) Error() string {
- var buf bytes.Buffer
- buf.WriteString(fmt.Sprintf("pod (%s) failed to fit in any node\n", f.Pod.Name))
reasons := make(map[string]int)
for _, predicates := range f.FailedPredicates {
for _, pred := range predicates {
@@ -64,10 +63,8 @@ func (f *FitError) Error() string {
sort.Strings(reasonStrings)
return reasonStrings
}
reasonMsg := fmt.Sprintf("fit failure summary on nodes : %v", strings.Join(sortReasonsHistogram(), ", "))
buf.WriteString(reasonMsg)
return buf.String()
reasonMsg := fmt.Sprintf(NoNodeAvailableMsg+": %v.", strings.Join(sortReasonsHistogram(), ", "))
return reasonMsg
}
type genericScheduler struct {


@@ -21,6 +21,7 @@ import (
"math"
"reflect"
"strconv"
"strings"
"testing"
"time"
@@ -397,6 +398,23 @@ func makeNode(node string, milliCPU, memory int64) *v1.Node {
}
}
+ func TestHumanReadableFitError(t *testing.T) {
+ error := &FitError{
+ Pod: &v1.Pod{ObjectMeta: v1.ObjectMeta{Name: "2"}},
+ FailedPredicates: FailedPredicateMap{
+ "1": []algorithm.PredicateFailureReason{algorithmpredicates.ErrNodeUnderMemoryPressure},
+ "2": []algorithm.PredicateFailureReason{algorithmpredicates.ErrNodeUnderDiskPressure},
+ "3": []algorithm.PredicateFailureReason{algorithmpredicates.ErrNodeUnderDiskPressure},
+ },
+ }
+ if strings.Contains(error.Error(), "No nodes are available that match all of the following predicates") {
+ if strings.Contains(error.Error(), "NodeUnderDiskPressure (2)") && strings.Contains(error.Error(), "NodeUnderMemoryPressure (1)") {
+ return
+ }
+ }
+ t.Errorf("Error message doesn't have all the information content: [" + error.Error() + "]")
+ }
// The point of this test is to show that you:
// - get the same priority for a zero-request pod as for a pod with the defaults requests,
// both when the zero-request pod is already on the machine and when the zero-request pod


@@ -98,9 +98,10 @@ func (s *Scheduler) scheduleOne() {
s.config.Error(pod, err)
s.config.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", "%v", err)
s.config.PodConditionUpdater.Update(pod, &v1.PodCondition{
- Type: v1.PodScheduled,
- Status: v1.ConditionFalse,
- Reason: v1.PodReasonUnschedulable,
+ Type: v1.PodScheduled,
+ Status: v1.ConditionFalse,
+ Reason: v1.PodReasonUnschedulable,
+ Message: err.Error(),
})
return
}