From 52b1c777304770bf34671f447b08579e34be0445 Mon Sep 17 00:00:00 2001 From: Davanum Srinivas Date: Sun, 1 Sep 2024 20:55:25 -0400 Subject: [PATCH 1/2] Fix etcd failures in ci-kubernetes-e2e-cos-gce-disruptive-canary Signed-off-by: Davanum Srinivas --- test/e2e/apimachinery/etcd_failure.go | 28 ++++++--- test/e2e/apps/daemon_restart.go | 65 ++++++--------------- test/e2e/framework/util.go | 24 ++++++++ test/e2e_kubeadm/controlplane_nodes_test.go | 21 +------ 4 files changed, 67 insertions(+), 71 deletions(-) diff --git a/test/e2e/apimachinery/etcd_failure.go b/test/e2e/apimachinery/etcd_failure.go index b3d6b5feee7..3c212e06409 100644 --- a/test/e2e/apimachinery/etcd_failure.go +++ b/test/e2e/apimachinery/etcd_failure.go @@ -34,6 +34,7 @@ import ( admissionapi "k8s.io/pod-security-admission/api" "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" ) var _ = SIGDescribe("Etcd failure", framework.WithDisruptive(), func() { @@ -47,7 +48,7 @@ var _ = SIGDescribe("Etcd failure", framework.WithDisruptive(), func() { // - master access // ... so the provider check should be identical to the intersection of // providers that provide those capabilities. - e2eskipper.SkipUnlessProviderIs("gce") + e2eskipper.SkipUnlessProviderIs("gce", "aws") e2eskipper.SkipUnlessSSHKeyPresent() err := e2erc.RunRC(ctx, testutils.RCConfig{ @@ -80,7 +81,7 @@ var _ = SIGDescribe("Etcd failure", framework.WithDisruptive(), func() { }) func etcdFailTest(ctx context.Context, f *framework.Framework, failCommand, fixCommand string) { - doEtcdFailure(ctx, failCommand, fixCommand) + doEtcdFailure(ctx, f, failCommand, fixCommand) checkExistingRCRecovers(ctx, f) @@ -94,17 +95,30 @@ func etcdFailTest(ctx context.Context, f *framework.Framework, failCommand, fixC // master and go on to assert that etcd and kubernetes components recover. 
 const etcdFailureDuration = 20 * time.Second
 
-func doEtcdFailure(ctx context.Context, failCommand, fixCommand string) {
+func doEtcdFailure(ctx context.Context, f *framework.Framework, failCommand, fixCommand string) {
 	ginkgo.By("failing etcd")
 
-	masterExec(ctx, failCommand)
+	masterExec(ctx, f, failCommand)
 	time.Sleep(etcdFailureDuration)
-	masterExec(ctx, fixCommand)
+	masterExec(ctx, f, fixCommand)
 }
 
-func masterExec(ctx context.Context, cmd string) {
-	host := framework.APIAddress() + ":22"
+func masterExec(ctx context.Context, f *framework.Framework, cmd string) {
+	nodes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
+
+	// checks if there is at least one control-plane node
+	gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(),
+		"at least one node with label %s should exist.", framework.ControlPlaneLabel)
+
+	ips := framework.GetNodeExternalIPs(&nodes.Items[0])
+	gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external IP should exist.")
+
+	host := ips[0] + ":22"
 	result, err := e2essh.SSH(ctx, cmd, host, framework.TestContext.Provider)
 	framework.ExpectNoError(err, "failed to SSH to host %s on provider %s and run command: %q", host, framework.TestContext.Provider, cmd)
 	if result.Code != 0 {
 		e2essh.LogResult(result)
diff --git a/test/e2e/apps/daemon_restart.go b/test/e2e/apps/daemon_restart.go
index 51c6e78d963..d674fa0e36a 100644
--- a/test/e2e/apps/daemon_restart.go
+++ b/test/e2e/apps/daemon_restart.go
@@ -19,6 +19,7 @@ package apps
 import (
 	"context"
 	"fmt"
+	"github.com/onsi/gomega"
 	"strconv"
 	"time"
 
@@ -40,7 +41,6 @@ import (
 	e2erc "k8s.io/kubernetes/test/e2e/framework/rc"
 	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
 	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
-	testfwk "k8s.io/kubernetes/test/integration/framework"
 	testutils "k8s.io/kubernetes/test/utils"
 	imageutils "k8s.io/kubernetes/test/utils/image"
 	admissionapi "k8s.io/pod-security-admission/api"
@@ -278,12 +278,18 @@ var _ = SIGDescribe("DaemonRestart", framework.WithDisruptive(), func() {
 		// Requires master ssh access.
 		e2eskipper.SkipUnlessProviderIs("gce", "aws")
-		nodes, err := getControlPlaneNodes(ctx, f.ClientSet)
-		framework.ExpectNoError(err)
+		nodes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
+
+		// checks if there is at least one control-plane node
+		gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist.", framework.ControlPlaneLabel)
+
 		for i := range nodes.Items {
+			ips := framework.GetNodeExternalIPs(&nodes.Items[i])
+			gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external IP should exist.")
+
 			restarter := NewRestartConfig(
-				getFirstIPforNode(&nodes.Items[i]), "kube-controller", ports.KubeControllerManagerPort, restartPollInterval, restartTimeout, true)
+				ips[0], "kube-controller", ports.KubeControllerManagerPort, restartPollInterval, restartTimeout, true)
 			restarter.restart(ctx)
 
 			// The intent is to ensure the replication controller manager has observed and reported status of
@@ -313,11 +319,17 @@ var _ = SIGDescribe("DaemonRestart", framework.WithDisruptive(), func() {
 	ginkgo.It("Scheduler should continue assigning pods to nodes across restart", func(ctx context.Context) {
 		// Requires master ssh access.
 		e2eskipper.SkipUnlessProviderIs("gce", "aws")
-		nodes, err := getControlPlaneNodes(ctx, f.ClientSet)
-		framework.ExpectNoError(err)
+		nodes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
+
+		// checks if there is at least one control-plane node
+		gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist.", framework.ControlPlaneLabel)
+
 		for i := range nodes.Items {
+			ips := framework.GetNodeExternalIPs(&nodes.Items[i])
+			gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external IP should exist.")
+
 			restarter := NewRestartConfig(
-				getFirstIPforNode(&nodes.Items[i]), "kube-scheduler", kubeschedulerconfig.DefaultKubeSchedulerPort, restartPollInterval, restartTimeout, true)
+				ips[0], "kube-scheduler", kubeschedulerconfig.DefaultKubeSchedulerPort, restartPollInterval, restartTimeout, true)
 
 			// Create pods while the scheduler is down and make sure the scheduler picks them up by
 			// scaling the rc to the same size.
@@ -367,42 +379,3 @@ var _ = SIGDescribe("DaemonRestart", framework.WithDisruptive(), func() {
 		}
 	})
 })
-
-func getFirstIPforNode(node *v1.Node) string {
-	var ips []string
-	ips = append(ips, getAddresses(node, v1.NodeExternalIP)...)
-	if len(ips) == 0 {
-		// If ExternalIP isn't set, assume the test programs can reach the InternalIP
-		ips = append(ips, getAddresses(node, v1.NodeInternalIP)...)
-	}
-	if len(ips) == 0 {
-		framework.Failf("did not find any ip(s) for node: %v", node)
-	}
-	return ips[0]
-}
-
-func getAddresses(node *v1.Node, addressType v1.NodeAddressType) (ips []string) {
-	for j := range node.Status.Addresses {
-		nodeAddress := &node.Status.Addresses[j]
-		if nodeAddress.Type == addressType && nodeAddress.Address != "" {
-			ips = append(ips, nodeAddress.Address)
-		}
-	}
-	return
-}
-
-func getControlPlaneNodes(ctx context.Context, c clientset.Interface) (nodes *v1.NodeList, err error) {
-	nodes, err = c.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
-	if err != nil {
-		return nil, err
-	}
-	testfwk.Filter(nodes, func(node v1.Node) bool {
-		_, isMaster := node.Labels["node-role.kubernetes.io/master"]
-		_, isControlPlane := node.Labels["node-role.kubernetes.io/control-plane"]
-		return isMaster || isControlPlane
-	})
-	if len(nodes.Items) == 0 {
-		return nil, fmt.Errorf("there are currently no ready, schedulable control plane nodes in the cluster")
-	}
-	return nodes, nil
-}
diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go
index a3eb8a39ac9..37d48fafd79 100644
--- a/test/e2e/framework/util.go
+++ b/test/e2e/framework/util.go
@@ -40,6 +40,7 @@ import (
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/fields"
+	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	"k8s.io/apimachinery/pkg/util/sets"
@@ -128,6 +129,9 @@ const (
 	// SnapshotDeleteTimeout is how long for snapshot to delete snapshotContent.
 	SnapshotDeleteTimeout = 5 * time.Minute
+
+	// ControlPlaneLabel is valid for kubeadm based clusters like kops ONLY
+	ControlPlaneLabel = "node-role.kubernetes.io/control-plane"
 )
 
 var (
@@ -662,6 +666,17 @@ func RunCmdEnv(env []string, command string, args ...string) (string, string, er
 	return stdout, stderr, nil
 }
 
+// GetNodeExternalIPs returns the list of external IP addresses, if any, for the given node
+func GetNodeExternalIPs(node *v1.Node) (ips []string) {
+	for j := range node.Status.Addresses {
+		nodeAddress := &node.Status.Addresses[j]
+		if nodeAddress.Type == v1.NodeExternalIP && nodeAddress.Address != "" {
+			ips = append(ips, nodeAddress.Address)
+		}
+	}
+	return
+}
+
 // getControlPlaneAddresses returns the externalIP, internalIP and hostname fields of control plane nodes.
 // If any of these is unavailable, empty slices are returned.
 func getControlPlaneAddresses(ctx context.Context, c clientset.Interface) ([]string, []string, []string) {
@@ -694,6 +709,15 @@ func getControlPlaneAddresses(ctx context.Context, c clientset.Interface) ([]str
 	return externalIPs, internalIPs, hostnames
 }
 
+// GetControlPlaneNodes returns a list of control plane nodes
+func GetControlPlaneNodes(ctx context.Context, c clientset.Interface) *v1.NodeList {
+	selector := labels.Set{ControlPlaneLabel: ""}.AsSelector()
+	cpNodes, err := c.CoreV1().Nodes().
+		List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
+	ExpectNoError(err, "error reading control-plane nodes")
+	return cpNodes
+}
+
 // GetControlPlaneAddresses returns all IP addresses on which the kubelet can reach the control plane.
 // It may return internal and external IPs, even if we expect for
 // e.g. internal IPs to be used (issue #56787), so that we can be
diff --git a/test/e2e_kubeadm/controlplane_nodes_test.go b/test/e2e_kubeadm/controlplane_nodes_test.go
index 28f1cd0dca0..987dc05bed5 100644
--- a/test/e2e_kubeadm/controlplane_nodes_test.go
+++ b/test/e2e_kubeadm/controlplane_nodes_test.go
@@ -20,9 +20,6 @@ import (
 	"context"
 
 	corev1 "k8s.io/api/core/v1"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/labels"
-	clientset "k8s.io/client-go/kubernetes"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	admissionapi "k8s.io/pod-security-admission/api"
@@ -31,10 +28,6 @@ import (
 	"github.com/onsi/gomega"
 )
 
-const (
-	controlPlaneLabel = "node-role.kubernetes.io/control-plane"
-)
-
 // Define container for all the test specification aimed at verifying
 // that kubeadm configures the control-plane node as expected
 var _ = Describe("control-plane node", func() {
@@ -51,22 +44,14 @@ var _ = Describe("control-plane node", func() {
 	// in case you can skip this test with SKIP=multi-node
 	ginkgo.It("should be labelled and tainted [multi-node]", func(ctx context.Context) {
 		// get all control-plane nodes (and this implicitly checks that node are properly labeled)
-		controlPlanes := getControlPlaneNodes(ctx, f.ClientSet)
+		controlPlanes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
 
 		// checks if there is at least one control-plane node
-		gomega.Expect(controlPlanes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist. if you are running test on a single-node cluster, you can skip this test with SKIP=multi-node", controlPlaneLabel)
+		gomega.Expect(controlPlanes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist. If you are running the test on a single-node cluster, you can skip this test with SKIP=multi-node", framework.ControlPlaneLabel)
 
 		// checks that the control-plane nodes have the expected taints
 		for _, cp := range controlPlanes.Items {
-			e2enode.ExpectNodeHasTaint(ctx, f.ClientSet, cp.GetName(), &corev1.Taint{Key: controlPlaneLabel, Effect: corev1.TaintEffectNoSchedule})
+			e2enode.ExpectNodeHasTaint(ctx, f.ClientSet, cp.GetName(), &corev1.Taint{Key: framework.ControlPlaneLabel, Effect: corev1.TaintEffectNoSchedule})
 		}
 	})
 })
-
-func getControlPlaneNodes(ctx context.Context, c clientset.Interface) *corev1.NodeList {
-	selector := labels.Set{controlPlaneLabel: ""}.AsSelector()
-	cpNodes, err := c.CoreV1().Nodes().
-		List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
-	framework.ExpectNoError(err, "error reading control-plane nodes")
-	return cpNodes
-}

From 3fa898bbc64e20f6a9999a747db47d6957920db1 Mon Sep 17 00:00:00 2001
From: Davanum Srinivas
Date: Mon, 2 Sep 2024 09:25:59 -0400
Subject: [PATCH 2/2] Incorporate feedback from review

Signed-off-by: Davanum Srinivas
---
 test/e2e/framework/util.go | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go
index 37d48fafd79..cc2e77518ba 100644
--- a/test/e2e/framework/util.go
+++ b/test/e2e/framework/util.go
@@ -40,7 +40,6 @@ import (
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/fields"
-	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	"k8s.io/apimachinery/pkg/util/sets"
@@ -130,7 +129,7 @@ const (
 	// SnapshotDeleteTimeout is how long for snapshot to delete snapshotContent.
 	SnapshotDeleteTimeout = 5 * time.Minute
 
-	// ControlPlaneLabel is valid for kubeadm based clusters like kops ONLY
+	// ControlPlaneLabel is ONLY valid for kubeadm-based clusters, such as those created by kops
 	ControlPlaneLabel = "node-role.kubernetes.io/control-plane"
 )
 
@@ -711,11 +710,29 @@ func getControlPlaneAddresses(ctx context.Context, c clientset.Interface) ([]str
 
 // GetControlPlaneNodes returns a list of control plane nodes
 func GetControlPlaneNodes(ctx context.Context, c clientset.Interface) *v1.NodeList {
-	selector := labels.Set{ControlPlaneLabel: ""}.AsSelector()
-	cpNodes, err := c.CoreV1().Nodes().
-		List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
-	ExpectNoError(err, "error reading control-plane nodes")
-	return cpNodes
+	allNodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	ExpectNoError(err, "error reading all nodes")
+
+	var cpNodes v1.NodeList
+
+	for _, node := range allNodes.Items {
+		// Check for the control plane label
+		if _, hasLabel := node.Labels[ControlPlaneLabel]; hasLabel {
+			cpNodes.Items = append(cpNodes.Items, node)
+			continue
+		}
+
+		// Otherwise, check for the control-plane NoSchedule taint
+		for _, taint := range node.Spec.Taints {
+			// NOTE the taint key is the same as the control plane label
+			if taint.Key == ControlPlaneLabel && taint.Effect == v1.TaintEffectNoSchedule {
+				cpNodes.Items = append(cpNodes.Items, node)
+				break
+			}
+		}
+	}
+
+	return &cpNodes
 }
 
 // GetControlPlaneAddresses returns all IP addresses on which the kubelet can reach the control plane.
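---

Reviewer note, outside the patches: masterExec and both DaemonRestart call sites resolve an SSH endpoint the same way, so the pattern is worth seeing once in isolation. The sketch below is a minimal example using only the helpers introduced above in test/e2e/framework/util.go (GetControlPlaneNodes, GetNodeExternalIPs, ControlPlaneLabel); the package name and the wrapper sshHostForFirstControlPlane are illustrative, not part of the patch.

    package example
    
    import (
    	"context"
    
    	"github.com/onsi/gomega"
    
    	"k8s.io/kubernetes/test/e2e/framework"
    )
    
    // sshHostForFirstControlPlane returns the "ip:22" SSH endpoint of the first
    // control-plane node, failing the spec if no such node or external IP exists.
    // (Hypothetical helper; mirrors the call sites changed in these patches.)
    func sshHostForFirstControlPlane(ctx context.Context, f *framework.Framework) string {
    	nodes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
    	gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(),
    		"at least one node with label %s should exist.", framework.ControlPlaneLabel)
    
    	ips := framework.GetNodeExternalIPs(&nodes.Items[0])
    	gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external IP should exist.")
    
    	return ips[0] + ":22"
    }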