Merge pull request #35932 from jayunit100/sched_events_spam_reduce

Automatic merge from submit-queue

Reduce spam in Events from the scheduler by counter aggregation of failure reasons

Fixes #35842
Part of overall #35555
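
Concretely (illustrative strings assembled from the format strings in the diff below, not text from the commit): instead of emitting one line per node, e.g.

    fit failure on node (machine1): Insufficient cpu, Insufficient memory

repeated for every node in the cluster, the FitError message now rolls all nodes up into a single sorted histogram line such as

    fit failure summary on nodes : Insufficient cpu (100), Insufficient memory (100)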
Kubernetes Submit Queue authored 2016-11-06 17:48:31 -08:00; committed by GitHub
commit f715b26d9c
2 changed files with 62 additions and 34 deletions

View File

@@ -49,14 +49,24 @@ var ErrNoNodesAvailable = fmt.Errorf("no nodes available to schedule pods")
 func (f *FitError) Error() string {
 	var buf bytes.Buffer
 	buf.WriteString(fmt.Sprintf("pod (%s) failed to fit in any node\n", f.Pod.Name))
-	for node, predicates := range f.FailedPredicates {
-		reasons := make([]string, 0)
+	reasons := make(map[string]int)
+	for _, predicates := range f.FailedPredicates {
 		for _, pred := range predicates {
-			reasons = append(reasons, pred.GetReason())
+			reasons[pred.GetReason()] += 1
 		}
-		reasonMsg := fmt.Sprintf("fit failure on node (%s): %s\n", node, strings.Join(reasons, ", "))
-		buf.WriteString(reasonMsg)
 	}
+
+	sortReasonsHistogram := func() []string {
+		reasonStrings := []string{}
+		for k, v := range reasons {
+			reasonStrings = append(reasonStrings, fmt.Sprintf("%v (%v)", k, v))
+		}
+		sort.Strings(reasonStrings)
+		return reasonStrings
+	}
+	reasonMsg := fmt.Sprintf("fit failure summary on nodes : %v", strings.Join(sortReasonsHistogram(), ", "))
+	buf.WriteString(reasonMsg)
 	return buf.String()
 }
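
The aggregation logic above is small enough to demonstrate in isolation. Below is a minimal, self-contained sketch (a hypothetical standalone program, not code from this commit) that applies the same count-then-sort approach to a simulated 100-node failure:

package main

import (
	"fmt"
	"sort"
	"strings"
)

func main() {
	// Simulate 100 nodes that each fail the same two predicates.
	reasons := make(map[string]int)
	for i := 0; i < 100; i++ {
		reasons["Insufficient cpu"] += 1
		reasons["Insufficient memory"] += 1
	}

	// Render each reason as "reason (count)" and sort for deterministic
	// output, mirroring sortReasonsHistogram above.
	reasonStrings := []string{}
	for k, v := range reasons {
		reasonStrings = append(reasonStrings, fmt.Sprintf("%v (%v)", k, v))
	}
	sort.Strings(reasonStrings)

	fmt.Printf("fit failure summary on nodes : %v\n", strings.Join(reasonStrings, ", "))
	// Prints: fit failure summary on nodes : Insufficient cpu (100), Insufficient memory (100)
}

The sort.Strings call matters: Go map iteration order is randomized, so sorting makes repeated scheduling failures produce byte-identical messages, which lets downstream event recording aggregate them instead of emitting many distinct Events.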

View File

@@ -18,6 +18,7 @@ package scheduler
 import (
 	"errors"
 	"fmt"
+	"reflect"
 	"testing"
 	"time"
@@ -331,49 +332,66 @@ func TestSchedulerFailedSchedulingReasons(t *testing.T) {
 	defer close(stop)
 	queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc)
 	scache := schedulercache.New(10*time.Minute, stop)
-	node := api.Node{
-		ObjectMeta: api.ObjectMeta{Name: "machine1"},
-		Status: api.NodeStatus{
-			Capacity: api.ResourceList{
-				api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)),
-				api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)),
-				api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
-			},
-			Allocatable: api.ResourceList{
-				api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)),
-				api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)),
-				api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
-			}},
+
+	// Design the baseline for the pods, and we will make nodes that don't fit it later.
+	var cpu = int64(4)
+	var mem = int64(500)
+	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{
+		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)),
+		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)),
+	}, api.ResourceList{
+		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)),
+		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)),
+	})
+	// Create several nodes which cannot schedule the above pod.
+	nodes := []*api.Node{}
+	for i := 0; i < 100; i++ {
+		node := api.Node{
+			ObjectMeta: api.ObjectMeta{Name: fmt.Sprintf("machine%v", i)},
+			Status: api.NodeStatus{
+				Capacity: api.ResourceList{
+					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)),
+					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)),
+					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
+				},
+				Allocatable: api.ResourceList{
+					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)),
+					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)),
+					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
+				}},
+		}
+		scache.AddNode(&node)
+		nodes = append(nodes, &node)
 	}
-	scache.AddNode(&node)
-	nodeLister := algorithm.FakeNodeLister([]*api.Node{&node})
+	nodeLister := algorithm.FakeNodeLister(nodes)
 	predicateMap := map[string]algorithm.FitPredicate{
 		"PodFitsResources": predicates.PodFitsResources,
 	}
+	// Create expected failure reasons for all the nodes. Hopefully they will get rolled up into a non-spammy summary.
+	failedPredicatesMap := FailedPredicateMap{}
+	for _, node := range nodes {
+		failedPredicatesMap[node.Name] = []algorithm.PredicateFailureReason{
+			predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000),
+			predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100),
+		}
+	}
 	scheduler, _, errChan := setupTestScheduler(queuedPodStore, scache, nodeLister, predicateMap)
-	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{
-		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)),
-		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)),
-	}, api.ResourceList{
-		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)),
-		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)),
-	})
 	queuedPodStore.Add(podWithTooBigResourceRequests)
 	scheduler.scheduleOne()
 	select {
 	case err := <-errChan:
 		expectErr := &FitError{
-			Pod: podWithTooBigResourceRequests,
-			FailedPredicates: FailedPredicateMap{node.Name: []algorithm.PredicateFailureReason{
-				predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000),
-				predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100),
-			}},
+			Pod:              podWithTooBigResourceRequests,
+			FailedPredicates: failedPredicatesMap,
 		}
+		if len(fmt.Sprint(expectErr)) > 150 {
+			t.Errorf("message is too spammy ! %v ", len(fmt.Sprint(expectErr)))
+		}
 		if !reflect.DeepEqual(expectErr, err) {
-			t.Errorf("err want=%+v, get=%+v", expectErr, err)
+			t.Errorf("\n err \nWANT=%+v,\nGOT=%+v", expectErr, err)
 		}
 	case <-time.After(wait.ForeverTestTimeout):
 		t.Fatalf("timeout after %v", wait.ForeverTestTimeout)