scheduler_perf: show name of one pending pod in error message

If pods get stuck, then giving the name of one makes it possible
to search for it in the log output. Without the name it's hard
to figure out which pods got stuck.
This commit is contained in:
Patrick Ohly 2023-09-01 08:30:13 +02:00
parent 6eca142082
commit c74d045c4b
2 changed files with 28 additions and 10 deletions

View File

@ -50,6 +50,7 @@ import (
"k8s.io/component-base/featuregate"
featuregatetesting "k8s.io/component-base/featuregate/testing"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/scheme"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
@ -1312,13 +1313,15 @@ func createPods(ctx context.Context, tb testing.TB, namespace string, cpo *creat
// namespace are scheduled. Times out after 10 minutes because even at the
// lowest observed QPS of ~10 pods/sec, a 5000-node test should complete.
func waitUntilPodsScheduledInNamespace(ctx context.Context, tb testing.TB, podInformer coreinformers.PodInformer, namespace string, wantCount int) error {
return wait.PollUntilContextTimeout(ctx, 1*time.Second, 10*time.Minute, true, func(ctx context.Context) (bool, error) {
var pendingPod *v1.Pod
err := wait.PollUntilContextTimeout(ctx, 1*time.Second, 10*time.Minute, true, func(ctx context.Context) (bool, error) {
select {
case <-ctx.Done():
return true, ctx.Err()
default:
}
scheduled, err := getScheduledPods(podInformer, namespace)
scheduled, unscheduled, err := getScheduledPods(podInformer, namespace)
if err != nil {
return false, err
}
@ -1327,8 +1330,18 @@ func waitUntilPodsScheduledInNamespace(ctx context.Context, tb testing.TB, podIn
return true, nil
}
tb.Logf("namespace: %s, pods: want %d, got %d", namespace, wantCount, len(scheduled))
if len(unscheduled) > 0 {
pendingPod = unscheduled[0]
} else {
pendingPod = nil
}
return false, nil
})
if err != nil && pendingPod != nil {
err = fmt.Errorf("at least pod %s is not scheduled: %v", klog.KObj(pendingPod), err)
}
return err
}
// waitUntilPodsScheduled blocks until the all pods in the given namespaces are

View File

@ -147,23 +147,28 @@ func mustSetupCluster(ctx context.Context, tb testing.TB, config *config.KubeSch
return informerFactory, client, dynClient
}
// Returns the list of scheduled pods in the specified namespaces.
// Returns the list of scheduled and unscheduled pods in the specified namespaces.
// Note that no namespaces specified matches all namespaces.
func getScheduledPods(podInformer coreinformers.PodInformer, namespaces ...string) ([]*v1.Pod, error) {
func getScheduledPods(podInformer coreinformers.PodInformer, namespaces ...string) ([]*v1.Pod, []*v1.Pod, error) {
pods, err := podInformer.Lister().List(labels.Everything())
if err != nil {
return nil, err
return nil, nil, err
}
s := sets.New(namespaces...)
scheduled := make([]*v1.Pod, 0, len(pods))
unscheduled := make([]*v1.Pod, 0, len(pods))
for i := range pods {
pod := pods[i]
if len(pod.Spec.NodeName) > 0 && (len(s) == 0 || s.Has(pod.Namespace)) {
scheduled = append(scheduled, pod)
if len(s) == 0 || s.Has(pod.Namespace) {
if len(pod.Spec.NodeName) > 0 {
scheduled = append(scheduled, pod)
} else {
unscheduled = append(unscheduled, pod)
}
}
}
return scheduled, nil
return scheduled, unscheduled, nil
}
// DataItem is the data point.
@ -355,7 +360,7 @@ func newThroughputCollector(tb testing.TB, podInformer coreinformers.PodInformer
}
func (tc *throughputCollector) run(ctx context.Context) {
podsScheduled, err := getScheduledPods(tc.podInformer, tc.namespaces...)
podsScheduled, _, err := getScheduledPods(tc.podInformer, tc.namespaces...)
if err != nil {
klog.Fatalf("%v", err)
}
@ -372,7 +377,7 @@ func (tc *throughputCollector) run(ctx context.Context) {
return
case <-ticker.C:
now := time.Now()
podsScheduled, err := getScheduledPods(tc.podInformer, tc.namespaces...)
podsScheduled, _, err := getScheduledPods(tc.podInformer, tc.namespaces...)
if err != nil {
klog.Fatalf("%v", err)
}