From e43065d542da37f7b7be87457e6ce0538d286882 Mon Sep 17 00:00:00 2001
From: Patrick Ohly
Date: Tue, 15 Oct 2024 10:03:05 +0200
Subject: [PATCH] e2e daemon set: better polling in CheckDaemonStatus

As a quick fix for a flake, bceec5a3ffd6975105f95b08641e45644c0a6753
introduced polling with wait.Poll in all callers of CheckDaemonStatus.
This commit reverts all callers to what they were before
(CheckDaemonStatus + ExpectNoError) and implements polling according to
E2E best practices
(https://github.com/kubernetes/community/blob/master/contributors/devel/sig-testing/writing-good-e2e-tests.md#polling-and-timeouts):
- no logging while polling
- support for progress reporting while polling
- last but not least, produce an informative failure message in case of
  a timeout, including a dump of the daemon set as YAML
---
 test/e2e/apps/controller_revision.go     |  4 +--
 test/e2e/apps/daemon_set.go              | 28 ++++++++++-----------
 test/e2e/framework/daemonset/fixtures.go | 32 +++++++++++++-----------
 test/e2e/network/loadbalancer.go         |  4 +--
 test/e2e/upgrades/apps/daemonsets.go     |  4 +--
 5 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/test/e2e/apps/controller_revision.go b/test/e2e/apps/controller_revision.go
index 4feb4722292..f4194c92cb3 100644
--- a/test/e2e/apps/controller_revision.go
+++ b/test/e2e/apps/controller_revision.go
@@ -136,8 +136,8 @@ var _ = SIGDescribe("ControllerRevision", framework.WithSerial(), func() {
 		ginkgo.By("Check that daemon pods launch on every node of the cluster.")
 		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, checkRunningOnAllNodes(f, testDaemonset))
 		framework.ExpectNoError(err, "error waiting for daemon pod to start")
-		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, e2edaemonset.CheckDaemonStatus(ctx, f, dsName))
-		framework.ExpectNoError(err, "error waiting for daemonset to report all pods are scheduled and ready")
+		err = e2edaemonset.CheckDaemonStatus(ctx, f, dsName)
+		framework.ExpectNoError(err)
 
 		ginkgo.By(fmt.Sprintf("Confirm DaemonSet %q successfully created with %q label", dsName, dsLabelSelector))
 		dsList, err := csAppsV1.DaemonSets("").List(ctx, metav1.ListOptions{LabelSelector: dsLabelSelector})
diff --git a/test/e2e/apps/daemon_set.go b/test/e2e/apps/daemon_set.go
index b29654acc74..ad352fbb940 100644
--- a/test/e2e/apps/daemon_set.go
+++ b/test/e2e/apps/daemon_set.go
@@ -184,8 +184,8 @@ var _ = SIGDescribe("Daemon set", framework.WithSerial(), func() {
 		ginkgo.By("Check that daemon pods launch on every node of the cluster.")
 		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, checkRunningOnAllNodes(f, ds))
 		framework.ExpectNoError(err, "error waiting for daemon pod to start")
-		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, e2edaemonset.CheckDaemonStatus(ctx, f, dsName))
-		framework.ExpectNoError(err, "error waiting for daemonset to report all pods are scheduled and ready")
+		err = e2edaemonset.CheckDaemonStatus(ctx, f, dsName)
+		framework.ExpectNoError(err)
 
 		ginkgo.By("Stop a daemon pod, check that the daemon pod is revived.")
 		podList := listDaemonPods(ctx, c, ns, label)
@@ -224,8 +224,8 @@ var _ = SIGDescribe("Daemon set", framework.WithSerial(), func() {
 		gomega.Expect(daemonSetLabels).To(gomega.HaveLen(1))
 		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, e2edaemonset.CheckDaemonPodOnNodes(f, ds, []string{newNode.Name}))
 		framework.ExpectNoError(err, "error waiting for daemon pods to be running on new nodes")
-		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, e2edaemonset.CheckDaemonStatus(ctx, f, dsName))
-		framework.ExpectNoError(err, "error waiting for daemonset to report all pods are scheduled and ready")
+		err = e2edaemonset.CheckDaemonStatus(ctx, f, dsName)
+		framework.ExpectNoError(err)
 
 		ginkgo.By("Update the node label to green, and wait for daemons to be unscheduled")
 		nodeSelector[daemonsetColorLabel] = "green"
@@ -243,8 +243,8 @@ var _ = SIGDescribe("Daemon set", framework.WithSerial(), func() {
 		gomega.Expect(daemonSetLabels).To(gomega.HaveLen(1))
 		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, e2edaemonset.CheckDaemonPodOnNodes(f, ds, []string{greenNode.Name}))
 		framework.ExpectNoError(err, "error waiting for daemon pods to be running on new nodes")
-		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, e2edaemonset.CheckDaemonStatus(ctx, f, dsName))
-		framework.ExpectNoError(err, "error waiting for daemonset to report all pods are scheduled and ready")
+		err = e2edaemonset.CheckDaemonStatus(ctx, f, dsName)
+		framework.ExpectNoError(err)
 	})
 
 	// We defer adding this test to conformance pending the disposition of moving DaemonSet scheduling logic to the
@@ -287,8 +287,8 @@ var _ = SIGDescribe("Daemon set", framework.WithSerial(), func() {
 		gomega.Expect(daemonSetLabels).To(gomega.HaveLen(1))
 		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, e2edaemonset.CheckDaemonPodOnNodes(f, ds, []string{newNode.Name}))
 		framework.ExpectNoError(err, "error waiting for daemon pods to be running on new nodes")
-		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, e2edaemonset.CheckDaemonStatus(ctx, f, dsName))
-		framework.ExpectNoError(err, "error waiting for daemonset to report all pods are scheduled and ready")
+		err = e2edaemonset.CheckDaemonStatus(ctx, f, dsName)
+		framework.ExpectNoError(err)
 
 		ginkgo.By("Remove the node label and wait for daemons to be unscheduled")
 		_, err = setDaemonSetNodeLabels(ctx, c, node.Name, map[string]string{})
@@ -312,8 +312,8 @@ var _ = SIGDescribe("Daemon set", framework.WithSerial(), func() {
 		ginkgo.By("Check that daemon pods launch on every node of the cluster.")
 		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, checkRunningOnAllNodes(f, ds))
 		framework.ExpectNoError(err, "error waiting for daemon pod to start")
-		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, e2edaemonset.CheckDaemonStatus(ctx, f, dsName))
-		framework.ExpectNoError(err, "error waiting for daemonset to report all pods are scheduled and ready")
+		err = e2edaemonset.CheckDaemonStatus(ctx, f, dsName)
+		framework.ExpectNoError(err)
 
 		ginkgo.By("Set a daemon pod's phase to 'Failed', check that the daemon pod is revived.")
 		podList := listDaemonPods(ctx, c, ns, label)
@@ -863,8 +863,8 @@ var _ = SIGDescribe("Daemon set", framework.WithSerial(), func() {
 		ginkgo.By("Check that daemon pods launch on every node of the cluster.")
 		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, checkRunningOnAllNodes(f, testDaemonset))
 		framework.ExpectNoError(err, "error waiting for daemon pod to start")
-		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, e2edaemonset.CheckDaemonStatus(ctx, f, dsName))
-		framework.ExpectNoError(err, "error waiting for daemonset to report all pods are scheduled and ready")
+		err = e2edaemonset.CheckDaemonStatus(ctx, f, dsName)
+		framework.ExpectNoError(err)
 
 		ginkgo.By("listing all DaemonSets")
 		dsList, err := cs.AppsV1().DaemonSets("").List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
@@ -911,8 +911,8 @@ var _ = SIGDescribe("Daemon set", framework.WithSerial(), func() {
 		ginkgo.By("Check that daemon pods launch on every node of the cluster.")
 		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, checkRunningOnAllNodes(f, testDaemonset))
 		framework.ExpectNoError(err, "error waiting for daemon pod to start")
-		err = wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, e2edaemonset.CheckDaemonStatus(ctx, f, dsName))
-		framework.ExpectNoError(err, "error waiting for daemonset to report all pods are scheduled and ready")
+		err = e2edaemonset.CheckDaemonStatus(ctx, f, dsName)
+		framework.ExpectNoError(err)
 
 		ginkgo.By("Getting /status")
 		dsResource := schema.GroupVersionResource{Group: "apps", Version: "v1", Resource: "daemonsets"}
diff --git a/test/e2e/framework/daemonset/fixtures.go b/test/e2e/framework/daemonset/fixtures.go
index dc2ba9c5cb0..5c1a5b54fc7 100644
--- a/test/e2e/framework/daemonset/fixtures.go
+++ b/test/e2e/framework/daemonset/fixtures.go
@@ -18,6 +18,7 @@ package daemonset
 
 import (
 	"context"
+	"fmt"
 
 	appsv1 "k8s.io/api/apps/v1"
 	v1 "k8s.io/api/core/v1"
@@ -26,6 +27,7 @@ import (
 	"k8s.io/kubectl/pkg/util/podutils"
 	"k8s.io/kubernetes/pkg/controller/daemon"
 	"k8s.io/kubernetes/test/e2e/framework"
+	"k8s.io/kubernetes/test/utils/format"
 )
 
 func NewDaemonSet(dsName, image string, labels map[string]string, volumes []v1.Volume, mounts []v1.VolumeMount, ports []v1.ContainerPort, args ...string) *appsv1.DaemonSet {
@@ -138,18 +140,20 @@ func checkDaemonPodStateOnNodes(ctx context.Context, c clientset.Interface, ds *
 	return len(nodesToPodCount) == len(nodeNames), nil
 }
 
-// CheckDaemonStatus returns false if not all desired pods are scheduled or not all of them are ready.
-func CheckDaemonStatus(ctx context.Context, f *framework.Framework, dsName string) func(ctx context.Context) (bool, error) {
-	return func(ctx context.Context) (bool, error) {
-		ds, err := f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Get(ctx, dsName, metav1.GetOptions{})
-		if err != nil {
-			return false, err
-		}
-		desired, scheduled, ready := ds.Status.DesiredNumberScheduled, ds.Status.CurrentNumberScheduled, ds.Status.NumberReady
-		if desired == scheduled && scheduled == ready {
-			return true, nil
-		}
-		framework.Logf("error in daemon status. DesiredScheduled: %d, CurrentScheduled: %d, Ready: %d", desired, scheduled, ready)
-		return false, nil
-	}
+// CheckDaemonStatus ensures that eventually the daemon set has the desired
+// number of pods scheduled and ready. It returns a descriptive error if that
+// state is not reached in the amount of time it takes to start
+// pods. f.Timeouts.PodStart can be changed to influence that timeout.
+func CheckDaemonStatus(ctx context.Context, f *framework.Framework, dsName string) error {
+	return framework.Gomega().Eventually(ctx, framework.GetObject(f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Get, dsName, metav1.GetOptions{})).
+		WithTimeout(f.Timeouts.PodStart).
+		Should(framework.MakeMatcher(func(ds *appsv1.DaemonSet) (failure func() string, err error) {
+			desired, scheduled, ready := ds.Status.DesiredNumberScheduled, ds.Status.CurrentNumberScheduled, ds.Status.NumberReady
+			if desired == scheduled && scheduled == ready {
+				return nil, nil
+			}
+			return func() string {
+				return fmt.Sprintf("Expected daemon set to reach state where all desired pods are scheduled and ready. Got instead DesiredScheduled: %d, CurrentScheduled: %d, Ready: %d\n%s", desired, scheduled, ready, format.Object(ds, 1))
+			}, nil
+		}))
 }
diff --git a/test/e2e/network/loadbalancer.go b/test/e2e/network/loadbalancer.go
index 6158c022fd8..5612a4b89d4 100644
--- a/test/e2e/network/loadbalancer.go
+++ b/test/e2e/network/loadbalancer.go
@@ -1307,8 +1307,8 @@ func testRollingUpdateLBConnectivityDisruption(ctx context.Context, f *framework
 	creationTimeout := e2eservice.GetServiceLoadBalancerCreationTimeout(ctx, cs)
 	err = wait.PollUntilContextTimeout(ctx, framework.Poll, creationTimeout, true, e2edaemonset.CheckDaemonPodOnNodes(f, ds, nodeNames))
 	framework.ExpectNoError(err, "error waiting for daemon pods to start")
-	err = wait.PollUntilContextTimeout(ctx, framework.Poll, creationTimeout, true, e2edaemonset.CheckDaemonStatus(ctx, f, name))
-	framework.ExpectNoError(err, "error waiting for daemonset to report all pods are scheduled and ready")
+	err = e2edaemonset.CheckDaemonStatus(ctx, f, name)
+	framework.ExpectNoError(err)
 
 	ginkgo.By(fmt.Sprintf("Creating a service %s with type=LoadBalancer externalTrafficPolicy=%s in namespace %s", name, externalTrafficPolicy, ns))
 	jig := e2eservice.NewTestJig(cs, ns, name)
diff --git a/test/e2e/upgrades/apps/daemonsets.go b/test/e2e/upgrades/apps/daemonsets.go
index e5e32bec23b..22739cff354 100644
--- a/test/e2e/upgrades/apps/daemonsets.go
+++ b/test/e2e/upgrades/apps/daemonsets.go
@@ -95,7 +95,7 @@ func (t *DaemonSetUpgradeTest) validateRunningDaemonSet(ctx context.Context, f *
 
 	// DaemonSet resource itself should be good
 	ginkgo.By("confirming the DaemonSet resource is in a good state")
-	err = wait.PollUntilContextTimeout(ctx, framework.Poll, framework.PodStartTimeout, true, e2edaemonset.CheckDaemonStatus(ctx, f, t.daemonSet.Name))
-	framework.ExpectNoError(err, "error waiting for daemonset to report all pods are scheduled and ready")
+	err = e2edaemonset.CheckDaemonStatus(ctx, f, t.daemonSet.Name)
+	framework.ExpectNoError(err)
 }
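
Usage sketch (illustrative, not part of the patch): after this change a test waits for a healthy DaemonSet roughly as below, mirroring the callers updated above. The wrapper name waitForHealthyDaemonSet and the package clause are hypothetical; the imports and the CheckDaemonStatus signature are the ones introduced by this patch.

package example // hypothetical package for the sketch

import (
	"context"

	"k8s.io/kubernetes/test/e2e/framework"
	e2edaemonset "k8s.io/kubernetes/test/e2e/framework/daemonset"
)

// waitForHealthyDaemonSet shows the calling convention after this patch:
// CheckDaemonStatus polls internally (gomega Eventually bounded by
// f.Timeouts.PodStart), so callers no longer wrap it in
// wait.PollUntilContextTimeout; on timeout the returned error already
// describes the DaemonSet status and includes a dump of the object.
func waitForHealthyDaemonSet(ctx context.Context, f *framework.Framework, dsName string) {
	err := e2edaemonset.CheckDaemonStatus(ctx, f, dsName)
	framework.ExpectNoError(err)
}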