Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-07-22 03:11:40 +00:00)
Merge pull request #35932 from jayunit100/sched_events_spam_reduce
Automatic merge from submit-queue. Reduce spam in Events from the scheduler by counter aggregation of failure reasons. Fixes #35842. Part of overall #35555.
This commit is contained in:
commit
f715b26d9c
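
For illustration (hypothetical pod/node names and reason strings, assuming two failing predicates on each of 100 nodes), the scheduler's failure message before this change looked roughly like:

pod (bar) failed to fit in any node
fit failure on node (machine0): Insufficient cpu, Insufficient memory
fit failure on node (machine1): Insufficient cpu, Insufficient memory
... (one line per node)

After this change, the same reasons are rolled up into a single counted summary, roughly:

pod (bar) failed to fit in any node
fit failure summary on nodes : Insufficient cpu (100), Insufficient memory (100)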
@@ -49,14 +49,24 @@ var ErrNoNodesAvailable = fmt.Errorf("no nodes available to schedule pods")
 
 func (f *FitError) Error() string {
 	var buf bytes.Buffer
 	buf.WriteString(fmt.Sprintf("pod (%s) failed to fit in any node\n", f.Pod.Name))
-	for node, predicates := range f.FailedPredicates {
-		reasons := make([]string, 0)
+	reasons := make(map[string]int)
+	for _, predicates := range f.FailedPredicates {
 		for _, pred := range predicates {
-			reasons = append(reasons, pred.GetReason())
+			reasons[pred.GetReason()] += 1
 		}
-		reasonMsg := fmt.Sprintf("fit failure on node (%s): %s\n", node, strings.Join(reasons, ", "))
-		buf.WriteString(reasonMsg)
 	}
+
+	sortReasonsHistogram := func() []string {
+		reasonStrings := []string{}
+		for k, v := range reasons {
+			reasonStrings = append(reasonStrings, fmt.Sprintf("%v (%v)", k, v))
+		}
+		sort.Strings(reasonStrings)
+		return reasonStrings
+	}
+
+	reasonMsg := fmt.Sprintf("fit failure summary on nodes : %v", strings.Join(sortReasonsHistogram(), ", "))
+	buf.WriteString(reasonMsg)
 	return buf.String()
 }
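
The aggregation above can be read in isolation. Here is a minimal standalone sketch of the same idea, using plain strings for the failure reasons instead of the scheduler's PredicateFailureReason type (the node names and reason strings are illustrative, not taken from the repository):

package main

import (
	"fmt"
	"sort"
	"strings"
)

func main() {
	// Pretend 100 nodes each rejected the pod for the same two reasons.
	failedPredicates := map[string][]string{}
	for i := 0; i < 100; i++ {
		failedPredicates[fmt.Sprintf("machine%v", i)] = []string{"Insufficient cpu", "Insufficient memory"}
	}

	// Count identical reasons instead of emitting one line per node.
	reasons := make(map[string]int)
	for _, preds := range failedPredicates {
		for _, reason := range preds {
			reasons[reason] += 1
		}
	}

	// Render the histogram as "reason (count)", sorted for a stable order.
	reasonStrings := []string{}
	for k, v := range reasons {
		reasonStrings = append(reasonStrings, fmt.Sprintf("%v (%v)", k, v))
	}
	sort.Strings(reasonStrings)

	// Prints: fit failure summary on nodes : Insufficient cpu (100), Insufficient memory (100)
	fmt.Printf("fit failure summary on nodes : %v\n", strings.Join(reasonStrings, ", "))
}

The sort keeps the summary stable across runs (Go randomizes map iteration order), which is why the change wraps the rendering in sortReasonsHistogram.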
@@ -18,6 +18,7 @@ package scheduler
 
 import (
 	"errors"
+	"fmt"
 	"reflect"
 	"testing"
 	"time"
@@ -331,49 +332,66 @@ func TestSchedulerFailedSchedulingReasons(t *testing.T) {
 	defer close(stop)
 	queuedPodStore := clientcache.NewFIFO(clientcache.MetaNamespaceKeyFunc)
 	scache := schedulercache.New(10*time.Minute, stop)
-	node := api.Node{
-		ObjectMeta: api.ObjectMeta{Name: "machine1"},
-		Status: api.NodeStatus{
-			Capacity: api.ResourceList{
-				api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)),
-				api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)),
-				api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
-			},
-			Allocatable: api.ResourceList{
-				api.ResourceCPU:    *(resource.NewQuantity(2, resource.DecimalSI)),
-				api.ResourceMemory: *(resource.NewQuantity(100, resource.DecimalSI)),
-				api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
-			}},
-	}
-	scache.AddNode(&node)
-	nodeLister := algorithm.FakeNodeLister([]*api.Node{&node})
+
+	// Design the baseline for the pods, and we will make nodes that dont fit it later.
+	var cpu = int64(4)
+	var mem = int64(500)
+	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{
+		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)),
+		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)),
+	}, api.ResourceList{
+		api.ResourceCPU:    *(resource.NewQuantity(cpu, resource.DecimalSI)),
+		api.ResourceMemory: *(resource.NewQuantity(mem, resource.DecimalSI)),
+	})
+
+	// create several nodes which cannot schedule the above pod
+	nodes := []*api.Node{}
+	for i := 0; i < 100; i++ {
+		node := api.Node{
+			ObjectMeta: api.ObjectMeta{Name: fmt.Sprintf("machine%v", i)},
+			Status: api.NodeStatus{
+				Capacity: api.ResourceList{
+					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)),
+					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)),
+					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
+				},
+				Allocatable: api.ResourceList{
+					api.ResourceCPU:    *(resource.NewQuantity(cpu/2, resource.DecimalSI)),
+					api.ResourceMemory: *(resource.NewQuantity(mem/5, resource.DecimalSI)),
+					api.ResourcePods:   *(resource.NewQuantity(10, resource.DecimalSI)),
+				}},
+		}
+		scache.AddNode(&node)
+		nodes = append(nodes, &node)
+	}
+	nodeLister := algorithm.FakeNodeLister(nodes)
 	predicateMap := map[string]algorithm.FitPredicate{
 		"PodFitsResources": predicates.PodFitsResources,
 	}
 
+	// Create expected failure reasons for all the nodes. Hopefully they will get rolled up into a non-spammy summary.
+	failedPredicatesMap := FailedPredicateMap{}
+	for _, node := range nodes {
+		failedPredicatesMap[node.Name] = []algorithm.PredicateFailureReason{
+			predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000),
+			predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100),
+		}
+	}
 	scheduler, _, errChan := setupTestScheduler(queuedPodStore, scache, nodeLister, predicateMap)
-
-	podWithTooBigResourceRequests := podWithResources("bar", "", api.ResourceList{
-		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)),
-		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)),
-	}, api.ResourceList{
-		api.ResourceCPU:    *(resource.NewQuantity(4, resource.DecimalSI)),
-		api.ResourceMemory: *(resource.NewQuantity(500, resource.DecimalSI)),
-	})
 	queuedPodStore.Add(podWithTooBigResourceRequests)
 	scheduler.scheduleOne()
 
 	select {
 	case err := <-errChan:
 		expectErr := &FitError{
-			Pod: podWithTooBigResourceRequests,
-			FailedPredicates: FailedPredicateMap{node.Name: []algorithm.PredicateFailureReason{
-				predicates.NewInsufficientResourceError(api.ResourceCPU, 4000, 0, 2000),
-				predicates.NewInsufficientResourceError(api.ResourceMemory, 500, 0, 100),
-			}},
+			Pod:              podWithTooBigResourceRequests,
+			FailedPredicates: failedPredicatesMap,
 		}
+		if len(fmt.Sprint(expectErr)) > 150 {
+			t.Errorf("message is too spammy ! %v ", len(fmt.Sprint(expectErr)))
+		}
 		if !reflect.DeepEqual(expectErr, err) {
-			t.Errorf("err want=%+v, get=%+v", expectErr, err)
+			t.Errorf("\n err \nWANT=%+v,\nGOT=%+v", expectErr, err)
 		}
 	case <-time.After(wait.ForeverTestTimeout):
 		t.Fatalf("timeout after %v", wait.ForeverTestTimeout)