scheduler_perf: show name of one pending pod in error message

If pods get stuck, including the name of one of them in the error message
makes it possible to search for that pod in the log output. Without a name,
it's hard to figure out which pods got stuck.
Patrick Ohly 2023-09-01 08:30:13 +02:00
parent 6eca142082
commit c74d045c4b
2 changed files with 28 additions and 10 deletions
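
As an illustration of the resulting message: klog.KObj renders a pod reference in the namespace/name form that also appears in the scheduler's log output, so the name can be grepped for directly. A minimal standalone sketch, with made-up pod and namespace names:

package main

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/klog/v2"
)

func main() {
	// Hypothetical pod stuck in Pending; name and namespace are invented.
	pendingPod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod-123", Namespace: "test-ns"}}

	// klog.KObj formats the pod as "test-ns/pod-123".
	err := fmt.Errorf("at least pod %s is not scheduled: %v", klog.KObj(pendingPod), context.DeadlineExceeded)
	fmt.Println(err)
	// prints: at least pod test-ns/pod-123 is not scheduled: context deadline exceeded
}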

@@ -50,6 +50,7 @@ import (
 	"k8s.io/component-base/featuregate"
 	featuregatetesting "k8s.io/component-base/featuregate/testing"
 	"k8s.io/component-base/metrics/legacyregistry"
+	"k8s.io/klog/v2"
 	"k8s.io/kubernetes/pkg/scheduler/apis/config"
 	"k8s.io/kubernetes/pkg/scheduler/apis/config/scheme"
 	"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
@@ -1312,13 +1313,15 @@ func createPods(ctx context.Context, tb testing.TB, namespace string, cpo *creat
 // namespace are scheduled. Times out after 10 minutes because even at the
 // lowest observed QPS of ~10 pods/sec, a 5000-node test should complete.
 func waitUntilPodsScheduledInNamespace(ctx context.Context, tb testing.TB, podInformer coreinformers.PodInformer, namespace string, wantCount int) error {
-	return wait.PollUntilContextTimeout(ctx, 1*time.Second, 10*time.Minute, true, func(ctx context.Context) (bool, error) {
+	var pendingPod *v1.Pod
+	err := wait.PollUntilContextTimeout(ctx, 1*time.Second, 10*time.Minute, true, func(ctx context.Context) (bool, error) {
 		select {
 		case <-ctx.Done():
 			return true, ctx.Err()
 		default:
 		}
-		scheduled, err := getScheduledPods(podInformer, namespace)
+		scheduled, unscheduled, err := getScheduledPods(podInformer, namespace)
 		if err != nil {
 			return false, err
 		}
@@ -1327,8 +1330,18 @@ func waitUntilPodsScheduledInNamespace(ctx context.Context, tb testing.TB, podIn
 			return true, nil
 		}
 		tb.Logf("namespace: %s, pods: want %d, got %d", namespace, wantCount, len(scheduled))
+		if len(unscheduled) > 0 {
+			pendingPod = unscheduled[0]
+		} else {
+			pendingPod = nil
+		}
 		return false, nil
 	})
+	if err != nil && pendingPod != nil {
+		err = fmt.Errorf("at least pod %s is not scheduled: %v", klog.KObj(pendingPod), err)
+	}
+	return err
 }
 
 // waitUntilPodsScheduled blocks until the all pods in the given namespaces are

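The shape of the change above: the poll closure records one currently pending pod on each attempt, and only when the poll ultimately fails is that pod's name folded into the returned error. A self-contained sketch of the same pattern, with shortened intervals and faked data standing in for the real informer lookup:

package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	var pending string // last observed pending pod, updated on every poll attempt
	err := wait.PollUntilContextTimeout(context.Background(), 10*time.Millisecond, 100*time.Millisecond, true,
		func(ctx context.Context) (bool, error) {
			pending = "test-ns/pod-123" // stand-in for the informer-based lookup
			return false, nil           // condition never met here, so the poll times out
		})
	if err != nil && pending != "" {
		err = fmt.Errorf("at least pod %s is not scheduled: %v", pending, err)
	}
	fmt.Println(err)
}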

@@ -147,23 +147,28 @@ func mustSetupCluster(ctx context.Context, tb testing.TB, config *config.KubeSch
 	return informerFactory, client, dynClient
 }
 
-// Returns the list of scheduled pods in the specified namespaces.
+// Returns the list of scheduled and unscheduled pods in the specified namespaces.
 // Note that no namespaces specified matches all namespaces.
-func getScheduledPods(podInformer coreinformers.PodInformer, namespaces ...string) ([]*v1.Pod, error) {
+func getScheduledPods(podInformer coreinformers.PodInformer, namespaces ...string) ([]*v1.Pod, []*v1.Pod, error) {
 	pods, err := podInformer.Lister().List(labels.Everything())
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 	s := sets.New(namespaces...)
 	scheduled := make([]*v1.Pod, 0, len(pods))
+	unscheduled := make([]*v1.Pod, 0, len(pods))
 	for i := range pods {
 		pod := pods[i]
-		if len(pod.Spec.NodeName) > 0 && (len(s) == 0 || s.Has(pod.Namespace)) {
-			scheduled = append(scheduled, pod)
+		if len(s) == 0 || s.Has(pod.Namespace) {
+			if len(pod.Spec.NodeName) > 0 {
+				scheduled = append(scheduled, pod)
+			} else {
+				unscheduled = append(unscheduled, pod)
+			}
 		}
 	}
-	return scheduled, nil
+	return scheduled, unscheduled, nil
 }
 
 // DataItem is the data point.
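
The rewritten loop above classifies each pod in a single pass over the informer's cached list: binding sets Spec.NodeName, so an empty NodeName marks a pod as still unscheduled. A stripped-down sketch of just that partition step, with fabricated pods:

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// partition mimics the loop above (namespace filtering omitted):
// a pod counts as scheduled once the scheduler has bound it.
func partition(pods []*v1.Pod) (scheduled, unscheduled []*v1.Pod) {
	for _, pod := range pods {
		if len(pod.Spec.NodeName) > 0 {
			scheduled = append(scheduled, pod)
		} else {
			unscheduled = append(unscheduled, pod)
		}
	}
	return
}

func main() {
	pods := []*v1.Pod{
		{ObjectMeta: metav1.ObjectMeta{Name: "bound"}, Spec: v1.PodSpec{NodeName: "node-1"}},
		{ObjectMeta: metav1.ObjectMeta{Name: "pending"}},
	}
	s, u := partition(pods)
	fmt.Printf("%d scheduled, %d pending\n", len(s), len(u)) // 1 scheduled, 1 pending
}
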
@@ -355,7 +360,7 @@ func newThroughputCollector(tb testing.TB, podInformer coreinformers.PodInformer
 }
 
 func (tc *throughputCollector) run(ctx context.Context) {
-	podsScheduled, err := getScheduledPods(tc.podInformer, tc.namespaces...)
+	podsScheduled, _, err := getScheduledPods(tc.podInformer, tc.namespaces...)
 	if err != nil {
 		klog.Fatalf("%v", err)
 	}
@@ -372,7 +377,7 @@ func (tc *throughputCollector) run(ctx context.Context) {
 			return
 		case <-ticker.C:
 			now := time.Now()
-			podsScheduled, err := getScheduledPods(tc.podInformer, tc.namespaces...)
+			podsScheduled, _, err := getScheduledPods(tc.podInformer, tc.namespaces...)
 			if err != nil {
 				klog.Fatalf("%v", err)
 			}