Merge pull request #35932 from jayunit100/sched_events_spam_reduce

Automatic merge from submit-queue

Reduce spam in Events from the scheduler by counter aggregation of failure reasons

Fixes #35842
Part of overall #35555
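
Concretely (illustrative strings assembled from the format strings in the diff below, not text from the commit): instead of emitting one line per node, e.g.

    fit failure on node (machine1): Insufficient cpu, Insufficient memory

repeated for every node in the cluster, the FitError message now rolls all nodes up into a single sorted histogram line such as

    fit failure summary on nodes : Insufficient cpu (100), Insufficient memory (100)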
Kubernetes Submit Queue authored 2016-11-06 17:48:31 -08:00; committed by GitHub
commit f715b26d9c
2 changed files with 62 additions and 34 deletions

View File

@@ -49,14 +49,24 @@ var ErrNoNodesAvailable = fmt.Errorf("no nodes available to schedule pods")
 func (f *FitError) Error() string {
 	var buf bytes.Buffer
 	buf.WriteString(fmt.Sprintf("pod (%s) failed to fit in any node\n", f.Pod.Name))
-	for node, predicates := range f.FailedPredicates {
-		reasons := make([]string, 0)
+	reasons := make(map[string]int)
+	for _, predicates := range f.FailedPredicates {
 		for _, pred := range predicates {
-			reasons = append(reasons, pred.GetReason())
+			reasons[pred.GetReason()] += 1
 		}
-		reasonMsg := fmt.Sprintf("fit failure on node (%s): %s\n", node, strings.Join(reasons, ", "))
-		buf.WriteString(reasonMsg)
 	}
+
+	sortReasonsHistogram := func() []string {
+		reasonStrings := []string{}
+		for k, v := range reasons {
+			reasonStrings = append(reasonStrings, fmt.Sprintf("%v (%v)", k, v))
+		}
+		sort.Strings(reasonStrings)
+		return reasonStrings
+	}
+	reasonMsg := fmt.Sprintf("fit failure summary on nodes : %v", strings.Join(sortReasonsHistogram(), ", "))
+	buf.WriteString(reasonMsg)
 	return buf.String()
 }
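
The aggregation logic above is small enough to demonstrate in isolation. Below is a minimal, self-contained sketch (a hypothetical standalone program, not code from this commit) that applies the same count-then-sort approach to a simulated 100-node failure:

package main

import (
	"fmt"
	"sort"
	"strings"
)

func main() {
	// Simulate 100 nodes that each fail the same two predicates.
	reasons := make(map[string]int)
	for i := 0; i < 100; i++ {
		reasons["Insufficient cpu"] += 1
		reasons["Insufficient memory"] += 1
	}

	// Render each reason as "reason (count)" and sort for deterministic
	// output, mirroring sortReasonsHistogram above.
	reasonStrings := []string{}
	for k, v := range reasons {
		reasonStrings = append(reasonStrings, fmt.Sprintf("%v (%v)", k, v))
	}
	sort.Strings(reasonStrings)

	fmt.Printf("fit failure summary on nodes : %v\n", strings.Join(reasonStrings, ", "))
	// Prints: fit failure summary on nodes : Insufficient cpu (100), Insufficient memory (100)
}

The sort.Strings call matters: Go map iteration order is randomized, so sorting makes repeated scheduling failures produce byte-identical messages, which lets downstream event recording aggregate them instead of emitting many distinct Events.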

View File

@@ -18,6 +18,7 @@ package scheduler
 import (
 	"errors"
 	"fmt"
+	"reflect"
 	"testing"
 	"time"
@@ -331,49 +332,66 @@ func TestSchedulerFailedSchedulingReasons(t *testing.T) {
 	defer close(stop)
 	queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc)
 	scache := schedulercache.New(10*time.Minute, stop)
-	node := api.Node{
-		ObjectMeta: api.ObjectMeta{Name: "machine1"},
-		Status: api.NodeStatus{
-			Capacity: api.ResourceList{
-				api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)),
-				api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)),
-				api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
-			},
-			Allocatable: api.ResourceList{
-				api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)),
-				api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)),
-				api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
-			}},
+
+	// Design the baseline for the pods, and we will make nodes that don't fit it later.
+	var cpu = int64(4)
+	var mem = int64(500)
+	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{
+		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)),
+		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)),
+	}, api.ResourceList{
+		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)),
+		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)),
+	})
+	// Create several nodes which cannot schedule the above pod.
+	nodes := []*api.Node{}
+	for i := 0; i < 100; i++ {
+		node := api.Node{
+			ObjectMeta: api.ObjectMeta{Name: fmt.Sprintf("machine%v", i)},
+			Status: api.NodeStatus{
+				Capacity: api.ResourceList{
+					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)),
+					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)),
+					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
+				},
+				Allocatable: api.ResourceList{
+					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)),
+					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)),
+					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
+				}},
+		}
+		scache.AddNode(&node)
+		nodes = append(nodes, &node)
 	}
-	scache.AddNode(&node)
-	nodeLister := algorithm.FakeNodeLister([]*api.Node{&node})
+	nodeLister := algorithm.FakeNodeLister(nodes)
 	predicateMap := map[string]algorithm.FitPredicate{
 		"PodFitsResources": predicates.PodFitsResources,
 	}
+	// Create expected failure reasons for all the nodes. Hopefully they will get rolled up into a non-spammy summary.
+	failedPredicatesMap := FailedPredicateMap{}
+	for _, node := range nodes {
+		failedPredicatesMap[node.Name] = []algorithm.PredicateFailureReason{
+			predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000),
+			predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100),
+		}
+	}
 	scheduler, _, errChan := setupTestScheduler(queuedPodStore, scache, nodeLister, predicateMap)
-	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{
-		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)),
-		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)),
-	}, api.ResourceList{
-		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)),
-		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)),
-	})
 	queuedPodStore.Add(podWithTooBigResourceRequests)
 	scheduler.scheduleOne()
 	select {
 	case err := <-errChan:
 		expectErr := &FitError{
-			Pod: podWithTooBigResourceRequests,
-			FailedPredicates: FailedPredicateMap{node.Name: []algorithm.PredicateFailureReason{
-				predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000),
-				predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100),
-			}},
+			Pod:              podWithTooBigResourceRequests,
+			FailedPredicates: failedPredicatesMap,
 		}
+		if len(fmt.Sprint(expectErr)) > 150 {
+			t.Errorf("message is too spammy ! %v ", len(fmt.Sprint(expectErr)))
+		}
 		if !reflect.DeepEqual(expectErr, err) {
-			t.Errorf("err want=%+v, get=%+v", expectErr, err)
+			t.Errorf("\n err \nWANT=%+v,\nGOT=%+v", expectErr, err)
 		}
 	case <-time.After(wait.ForeverTestTimeout):
 		t.Fatalf("timeout after %v", wait.ForeverTestTimeout)