mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-07-23 19:56:01 +00:00

Reduce spam in Events from scheduler by counter aggregation of failure reasons.

parent e9afbd5cdf
commit 5d5bc6759e
@@ -49,14 +49,24 @@ var ErrNoNodesAvailable = fmt.Errorf("no nodes available to schedule pods")
 
 func (f *FitError) Error() string {
 	var buf bytes.Buffer
 	buf.WriteString(fmt.Sprintf("pod (%s) failed to fit in any node\n", f.Pod.Name))
-	for node, predicates := range f.FailedPredicates {
-		reasons := make([]string, 0)
+	reasons := make(map[string]int)
+	for _, predicates := range f.FailedPredicates {
 		for _, pred := range predicates {
-			reasons = append(reasons, pred.GetReason())
+			reasons[pred.GetReason()] += 1
 		}
-		reasonMsg := fmt.Sprintf("fit failure on node (%s): %s\n", node, strings.Join(reasons, ", "))
+	}
+
+	sortReasonsHistogram := func() []string {
+		reasonStrings := []string{}
+		for k, v := range reasons {
+			reasonStrings = append(reasonStrings, fmt.Sprintf("%v (%v)", k, v))
+		}
+		sort.Strings(reasonStrings)
+		return reasonStrings
+	}
+
+	reasonMsg := fmt.Sprintf("fit failure summary on nodes : %v", strings.Join(sortReasonsHistogram(), ", "))
 	buf.WriteString(reasonMsg)
-	}
 	return buf.String()
 }
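
For readers skimming the hunk above, here is a minimal standalone sketch of the same counter-aggregation idea. The node names and reason strings are invented for illustration; only the histogram-plus-sort pattern mirrors the new FitError.Error().

// A minimal sketch (not part of this commit): aggregate per-node predicate
// failures into counts and emit one sorted summary line. Node names and
// reason strings below are made up.
package main

import (
	"fmt"
	"sort"
	"strings"
)

func main() {
	// Stand-in for FitError.FailedPredicates: node name -> failure reasons.
	failedPredicates := map[string][]string{
		"machine0": {"Insufficient cpu", "Insufficient memory"},
		"machine1": {"Insufficient cpu", "Insufficient memory"},
		"machine2": {"Insufficient cpu"},
	}

	// Count how many nodes hit each reason instead of emitting one line per node.
	reasons := make(map[string]int)
	for _, preds := range failedPredicates {
		for _, reason := range preds {
			reasons[reason]++
		}
	}

	// Render "reason (count)" entries and sort them so the summary is deterministic.
	reasonStrings := []string{}
	for k, v := range reasons {
		reasonStrings = append(reasonStrings, fmt.Sprintf("%v (%v)", k, v))
	}
	sort.Strings(reasonStrings)

	// Prints: fit failure summary on nodes : Insufficient cpu (3), Insufficient memory (2)
	fmt.Printf("fit failure summary on nodes : %v\n", strings.Join(reasonStrings, ", "))
}
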
@@ -18,6 +18,7 @@ package scheduler
 
 import (
 	"errors"
+	"fmt"
 	"reflect"
 	"testing"
 	"time"
@@ -331,49 +332,66 @@ func TestSchedulerFailedSchedulingReasons(t *testing.T) {
 	defer close(stop)
 	queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc)
 	scache := schedulercache.New(10*time.Minute, stop)
+
+	// Design the baseline for the pods, and we will make nodes that dont fit it later.
+	var cpu = int64(4)
+	var mem = int64(500)
+	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{
+		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)),
+		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)),
+	}, api.ResourceList{
+		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)),
+		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)),
+	})
+
+	// create several nodes which cannot schedule the above pod
+	nodes := []*api.Node{}
+	for i := 0; i < 100; i++ {
 		node := api.Node{
-			ObjectMeta: api.ObjectMeta{Name: "machine1"},
+			ObjectMeta: api.ObjectMeta{Name: fmt.Sprintf("machine%v", i)},
 			Status: api.NodeStatus{
 				Capacity: api.ResourceList{
-					api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)),
-					api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)),
+					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)),
+					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)),
 					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
 				},
 				Allocatable: api.ResourceList{
-					api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)),
-					api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)),
+					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)),
+					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)),
 					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
 				}},
 		}
 		scache.AddNode(&node)
-	nodeLister := algorithm.FakeNodeLister([]*api.Node{&node})
+		nodes = append(nodes, &node)
+	}
+	nodeLister := algorithm.FakeNodeLister(nodes)
 	predicateMap := map[string]algorithm.FitPredicate{
 		"PodFitsResources": predicates.PodFitsResources,
 	}
+
+	// Create expected failure reasons for all the nodes. Hopefully they will get rolled up into a non-spammy summary.
+	failedPredicatesMap := FailedPredicateMap{}
+	for _, node := range nodes {
+		failedPredicatesMap[node.Name] = []algorithm.PredicateFailureReason{
+			predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000),
+			predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100),
+		}
+	}
 	scheduler, _, errChan := setupTestScheduler(queuedPodStore, scache, nodeLister, predicateMap)
 
-	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{
-		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)),
-		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)),
-	}, api.ResourceList{
-		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)),
-		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)),
-	})
 	queuedPodStore.Add(podWithTooBigResourceRequests)
 	scheduler.scheduleOne()
 
 	select {
 	case err := <-errChan:
 		expectErr := &FitError{
 			Pod: podWithTooBigResourceRequests,
-			FailedPredicates: FailedPredicateMap{node.Name: []algorithm.PredicateFailureReason{
-				predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000),
-				predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100),
-			}},
+			FailedPredicates: failedPredicatesMap,
+		}
+		if len(fmt.Sprint(expectErr)) > 150 {
+			t.Errorf("message is too spammy ! %v ", len(fmt.Sprint(expectErr)))
 		}
 		if !reflect.DeepEqual(expectErr, err) {
-			t.Errorf("err want=%+v, get=%+v", expectErr, err)
+			t.Errorf("\n err \nWANT=%+v,\nGOT=%+v", expectErr, err)
 		}
 	case <-time.After(wait.ForeverTestTimeout):
 		t.Fatalf("timeout after %v", wait.ForeverTestTimeout)
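
With the 100 nodes above all failing for the same two reasons, the expected FitError now renders as a single short message, roughly "pod (bar) failed to fit in any node" plus one "fit failure summary on nodes : ..." line where each distinct reason is followed by its node count, rather than 100 per-node lines. That collapse is what lets the test assert the whole rendered error stays under 150 characters.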