diff --git a/plugin/pkg/scheduler/generic_scheduler.go b/plugin/pkg/scheduler/generic_scheduler.go
index ba507f0b3fa..3cd648a8cde 100644
--- a/plugin/pkg/scheduler/generic_scheduler.go
+++ b/plugin/pkg/scheduler/generic_scheduler.go
@@ -49,14 +49,24 @@ var ErrNoNodesAvailable = fmt.Errorf("no nodes available to schedule pods")
 func (f *FitError) Error() string {
 	var buf bytes.Buffer
 	buf.WriteString(fmt.Sprintf("pod (%s) failed to fit in any node\n", f.Pod.Name))
-	for node, predicates := range f.FailedPredicates {
-		reasons := make([]string, 0)
+	reasons := make(map[string]int)
+	for _, predicates := range f.FailedPredicates {
 		for _, pred := range predicates {
-			reasons = append(reasons, pred.GetReason())
+			reasons[pred.GetReason()] += 1
 		}
-		reasonMsg := fmt.Sprintf("fit failure on node (%s): %s\n", node, strings.Join(reasons, ", "))
-		buf.WriteString(reasonMsg)
 	}
+
+	sortReasonsHistogram := func() []string {
+		reasonStrings := []string{}
+		for k, v := range reasons {
+			reasonStrings = append(reasonStrings, fmt.Sprintf("%v (%v)", k, v))
+		}
+		sort.Strings(reasonStrings)
+		return reasonStrings
+	}
+
+	reasonMsg := fmt.Sprintf("fit failure summary on nodes : %v", strings.Join(sortReasonsHistogram(), ", "))
+	buf.WriteString(reasonMsg)
 	return buf.String()
 }
 
diff --git a/plugin/pkg/scheduler/scheduler_test.go b/plugin/pkg/scheduler/scheduler_test.go
index 6ba817cca74..1cb118ee8c4 100644
--- a/plugin/pkg/scheduler/scheduler_test.go
+++ b/plugin/pkg/scheduler/scheduler_test.go
@@ -18,6 +18,7 @@ package scheduler
 
 import (
 	"errors"
+	"fmt"
 	"reflect"
 	"testing"
 	"time"
@@ -331,49 +332,66 @@ func TestSchedulerFailedSchedulingReasons(t *testing.T) {
 	defer close(stop)
 	queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc)
 	scache := schedulercache.New(10*time.Minute, stop)
-	node := api.Node{
-		ObjectMeta: api.ObjectMeta{Name: "machine1"},
-		Status: api.NodeStatus{
-			Capacity: api.ResourceList{
-				api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)),
-				api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)),
-				api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
-			},
-			Allocatable: api.ResourceList{
-				api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)),
-				api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)),
-				api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
-			}},
+
+	// Design the baseline for the pods, and we will make nodes that don't fit it later.
+	var cpu = int64(4)
+	var mem = int64(500)
+	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{
+		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)),
+		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)),
+	}, api.ResourceList{
+		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)),
+		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)),
+	})
+
+	// create several nodes which cannot schedule the above pod
+	nodes := []*api.Node{}
+	for i := 0; i < 100; i++ {
+		node := api.Node{
+			ObjectMeta: api.ObjectMeta{Name: fmt.Sprintf("machine%v", i)},
+			Status: api.NodeStatus{
+				Capacity: api.ResourceList{
+					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)),
+					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)),
+					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
+				},
+				Allocatable: api.ResourceList{
+					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)),
+					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)),
+					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
+				}},
+		}
+		scache.AddNode(&node)
+		nodes = append(nodes, &node)
 	}
-	scache.AddNode(&node)
-	nodeLister := algorithm.FakeNodeLister([]*api.Node{&node})
+	nodeLister := algorithm.FakeNodeLister(nodes)
 	predicateMap := map[string]algorithm.FitPredicate{
 		"PodFitsResources": predicates.PodFitsResources,
 	}
+	// Create expected failure reasons for all the nodes. Hopefully they will get rolled up into a non-spammy summary.
+	failedPredicatesMap := FailedPredicateMap{}
+	for _, node := range nodes {
+		failedPredicatesMap[node.Name] = []algorithm.PredicateFailureReason{
+			predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000),
+			predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100),
+		}
+	}
 	scheduler, _, errChan := setupTestScheduler(queuedPodStore, scache, nodeLister, predicateMap)
-	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{
-		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)),
-		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)),
-	}, api.ResourceList{
-		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)),
-		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)),
-	})
 	queuedPodStore.Add(podWithTooBigResourceRequests)
 	scheduler.scheduleOne()
-
 	select {
 	case err := <-errChan:
 		expectErr := &FitError{
-			Pod: podWithTooBigResourceRequests,
-			FailedPredicates: FailedPredicateMap{node.Name: []algorithm.PredicateFailureReason{
-				predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000),
-				predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100),
-			}},
+			Pod:              podWithTooBigResourceRequests,
+			FailedPredicates: failedPredicatesMap,
+		}
+		if len(fmt.Sprint(expectErr)) > 150 {
+			t.Errorf("message is too spammy ! %v ", len(fmt.Sprint(expectErr)))
 		}
 		if !reflect.DeepEqual(expectErr, err) {
-			t.Errorf("err want=%+v, get=%+v", expectErr, err)
+			t.Errorf("\n err \nWANT=%+v,\nGOT=%+v", expectErr, err)
 		}
 	case <-time.After(wait.ForeverTestTimeout):
 		t.Fatalf("timeout after %v", wait.ForeverTestTimeout)
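
The effect of the new `Error()` rollup can be sketched in isolation. The snippet below is an illustrative approximation only, not part of the patch: it uses plain reason strings in place of `algorithm.PredicateFailureReason`, and the `summarize` helper and the sample reason strings are made up for the example.

```go
package main

import (
	"fmt"
	"sort"
	"strings"
)

// summarize mirrors the histogram logic added to FitError.Error():
// count how many nodes reported each reason, then emit one sorted summary
// line instead of one "fit failure on node (...)" line per node.
func summarize(failedPredicates map[string][]string) string {
	reasons := make(map[string]int)
	for _, nodeReasons := range failedPredicates {
		for _, r := range nodeReasons {
			reasons[r]++
		}
	}
	reasonStrings := []string{}
	for k, v := range reasons {
		reasonStrings = append(reasonStrings, fmt.Sprintf("%v (%v)", k, v))
	}
	sort.Strings(reasonStrings)
	return "fit failure summary on nodes : " + strings.Join(reasonStrings, ", ")
}

func main() {
	// 100 nodes that all fail the same two predicates, as in the updated test.
	failed := map[string][]string{}
	for i := 0; i < 100; i++ {
		failed[fmt.Sprintf("machine%v", i)] = []string{"Insufficient cpu", "Insufficient memory"}
	}
	// Prints a single short line regardless of node count, roughly:
	// fit failure summary on nodes : Insufficient cpu (100), Insufficient memory (100)
	fmt.Println(summarize(failed))
}
```

The per-node detail stays available in `FailedPredicates`; only the rendered message is summarized, which is what the test's new `len(fmt.Sprint(expectErr)) > 150` check asserts.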