Merge pull request #127055 from dims/fix-etcd-failures-in-ci-kubernetes-e2e-cos-gce-disruptive-canary

Fix etcd failures in ci-kubernetes-e2e-cos-gce-disruptive-canary
Commit 33c47290e2 by Kubernetes Prow Robot on 2024-09-02 18:07:16 +01:00, committed by GitHub
4 changed files with 84 additions and 71 deletions


@@ -34,6 +34,7 @@ import (
 	admissionapi "k8s.io/pod-security-admission/api"
 	"github.com/onsi/ginkgo/v2"
+	"github.com/onsi/gomega"
 )
 var _ = SIGDescribe("Etcd failure", framework.WithDisruptive(), func() {
@@ -47,7 +48,7 @@ var _ = SIGDescribe("Etcd failure", framework.WithDisruptive(), func() {
 		// - master access
 		// ... so the provider check should be identical to the intersection of
 		// providers that provide those capabilities.
-		e2eskipper.SkipUnlessProviderIs("gce")
+		e2eskipper.SkipUnlessProviderIs("gce", "aws")
 		e2eskipper.SkipUnlessSSHKeyPresent()
 		err := e2erc.RunRC(ctx, testutils.RCConfig{
@@ -80,7 +81,7 @@ var _ = SIGDescribe("Etcd failure", framework.WithDisruptive(), func() {
 })
 func etcdFailTest(ctx context.Context, f *framework.Framework, failCommand, fixCommand string) {
-	doEtcdFailure(ctx, failCommand, fixCommand)
+	doEtcdFailure(ctx, f, failCommand, fixCommand)
 	checkExistingRCRecovers(ctx, f)
@@ -94,17 +95,30 @@ func etcdFailTest(ctx context.Context, f *framework.Framework, failCommand, fixC
 // master and go on to assert that etcd and kubernetes components recover.
 const etcdFailureDuration = 20 * time.Second
-func doEtcdFailure(ctx context.Context, failCommand, fixCommand string) {
+func doEtcdFailure(ctx context.Context, f *framework.Framework, failCommand, fixCommand string) {
 	ginkgo.By("failing etcd")
-	masterExec(ctx, failCommand)
+	masterExec(ctx, f, failCommand)
 	time.Sleep(etcdFailureDuration)
-	masterExec(ctx, fixCommand)
+	masterExec(ctx, f, fixCommand)
 }
-func masterExec(ctx context.Context, cmd string) {
-	host := framework.APIAddress() + ":22"
+func masterExec(ctx context.Context, f *framework.Framework, cmd string) {
+	nodes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
+	// checks if there is at least one control-plane node
+	gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(),
+		"at least one node with label %s should exist.", framework.ControlPlaneLabel)
+	ips := framework.GetNodeExternalIPs(&nodes.Items[0])
+	gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external ip should exist.")
+	host := ips[0] + ":22"
-	result, err := e2essh.SSH(ctx, cmd, host, framework.TestContext.Provider)
-	framework.ExpectNoError(err)
-	e2essh.LogResult(result)
+	result, err := e2essh.SSH(ctx, cmd, host, framework.TestContext.Provider)
+	framework.ExpectNoError(err, "failed to SSH to host %s on provider %s and run command: %q", host, framework.TestContext.Provider, cmd)
+	if result.Code != 0 {
+		e2essh.LogResult(result)
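For readability, the patched masterExec assembled from the added lines above is sketched below. The hunk is cut off after the LogResult call in this view, so the closing failure handling (the Failf call and braces) is an assumption rather than part of the excerpt:

func masterExec(ctx context.Context, f *framework.Framework, cmd string) {
	// Pick a control-plane node and SSH to its first external IP instead of
	// relying on framework.APIAddress().
	nodes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
	gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(),
		"at least one node with label %s should exist.", framework.ControlPlaneLabel)

	ips := framework.GetNodeExternalIPs(&nodes.Items[0])
	gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external ip should exist.")

	host := ips[0] + ":22"
	result, err := e2essh.SSH(ctx, cmd, host, framework.TestContext.Provider)
	framework.ExpectNoError(err, "failed to SSH to host %s on provider %s and run command: %q", host, framework.TestContext.Provider, cmd)
	if result.Code != 0 {
		e2essh.LogResult(result)
		framework.Failf("command %q returned non-zero exit code %d", cmd, result.Code) // assumed; truncated in the diff above
	}
}

The switch from framework.APIAddress() to a control-plane node's external IP presumably lets the SSH step work on clusters where the API endpoint is a load balancer or DNS name rather than the node itself.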


@@ -19,6 +19,7 @@ package apps
 import (
 	"context"
 	"fmt"
+	"github.com/onsi/gomega"
 	"strconv"
 	"time"
@@ -40,7 +41,6 @@ import (
 	e2erc "k8s.io/kubernetes/test/e2e/framework/rc"
 	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
 	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
-	testfwk "k8s.io/kubernetes/test/integration/framework"
 	testutils "k8s.io/kubernetes/test/utils"
 	imageutils "k8s.io/kubernetes/test/utils/image"
 	admissionapi "k8s.io/pod-security-admission/api"
@@ -278,12 +278,18 @@ var _ = SIGDescribe("DaemonRestart", framework.WithDisruptive(), func() {
 		// Requires master ssh access.
 		e2eskipper.SkipUnlessProviderIs("gce", "aws")
-		nodes, err := getControlPlaneNodes(ctx, f.ClientSet)
-		framework.ExpectNoError(err)
+		nodes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
+		// checks if there is at least one control-plane node
+		gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist.", framework.ControlPlaneLabel)
 		for i := range nodes.Items {
+			ips := framework.GetNodeExternalIPs(&nodes.Items[i])
+			gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external ip should exist.")
 			restarter := NewRestartConfig(
-				getFirstIPforNode(&nodes.Items[i]), "kube-controller", ports.KubeControllerManagerPort, restartPollInterval, restartTimeout, true)
+				ips[0], "kube-controller", ports.KubeControllerManagerPort, restartPollInterval, restartTimeout, true)
 			restarter.restart(ctx)
 			// The intent is to ensure the replication controller manager has observed and reported status of
@@ -313,11 +319,17 @@ var _ = SIGDescribe("DaemonRestart", framework.WithDisruptive(), func() {
 	ginkgo.It("Scheduler should continue assigning pods to nodes across restart", func(ctx context.Context) {
 		// Requires master ssh access.
 		e2eskipper.SkipUnlessProviderIs("gce", "aws")
-		nodes, err := getControlPlaneNodes(ctx, f.ClientSet)
-		framework.ExpectNoError(err)
+		nodes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
+		// checks if there is at least one control-plane node
+		gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist.", framework.ControlPlaneLabel)
 		for i := range nodes.Items {
+			ips := framework.GetNodeExternalIPs(&nodes.Items[i])
+			gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external ip should exist.")
 			restarter := NewRestartConfig(
-				getFirstIPforNode(&nodes.Items[i]), "kube-scheduler", kubeschedulerconfig.DefaultKubeSchedulerPort, restartPollInterval, restartTimeout, true)
+				ips[0], "kube-scheduler", kubeschedulerconfig.DefaultKubeSchedulerPort, restartPollInterval, restartTimeout, true)
 			// Create pods while the scheduler is down and make sure the scheduler picks them up by
 			// scaling the rc to the same size.
@@ -367,42 +379,3 @@ var _ = SIGDescribe("DaemonRestart", framework.WithDisruptive(), func() {
 		}
 	})
 })
-func getFirstIPforNode(node *v1.Node) string {
-	var ips []string
-	ips = append(ips, getAddresses(node, v1.NodeExternalIP)...)
-	if len(ips) == 0 {
-		// If ExternalIP isn't set, assume the test programs can reach the InternalIP
-		ips = append(ips, getAddresses(node, v1.NodeInternalIP)...)
-	}
-	if len(ips) == 0 {
-		framework.Failf("did not find any ip(s) for node: %v", node)
-	}
-	return ips[0]
-}
-func getAddresses(node *v1.Node, addressType v1.NodeAddressType) (ips []string) {
-	for j := range node.Status.Addresses {
-		nodeAddress := &node.Status.Addresses[j]
-		if nodeAddress.Type == addressType && nodeAddress.Address != "" {
-			ips = append(ips, nodeAddress.Address)
-		}
-	}
-	return
-}
-func getControlPlaneNodes(ctx context.Context, c clientset.Interface) (nodes *v1.NodeList, err error) {
-	nodes, err = c.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
-	if err != nil {
-		return nil, err
-	}
-	testfwk.Filter(nodes, func(node v1.Node) bool {
-		_, isMaster := node.Labels["node-role.kubernetes.io/master"]
-		_, isControlPlane := node.Labels["node-role.kubernetes.io/control-plane"]
-		return isMaster || isControlPlane
-	})
-	if len(nodes.Items) == 0 {
-		return nil, fmt.Errorf("there are currently no ready, schedulable control plane nodes in the cluster")
-	}
-	return nodes, nil
-}
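With the file-local helpers above removed, both restart tests assemble their target host from the shared framework helpers instead. Pieced together from the added lines, the controller-manager variant reads roughly as follows (a sketch of the post-patch test body, not a standalone program):

nodes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
// checks if there is at least one control-plane node
gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist.", framework.ControlPlaneLabel)
for i := range nodes.Items {
	// Restart kube-controller-manager on every control-plane node via its first external IP.
	ips := framework.GetNodeExternalIPs(&nodes.Items[i])
	gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external ip should exist.")
	restarter := NewRestartConfig(
		ips[0], "kube-controller", ports.KubeControllerManagerPort, restartPollInterval, restartTimeout, true)
	restarter.restart(ctx)
	// ... the existing pod-status checks of the test continue here
}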


@@ -128,6 +128,9 @@ const (
 	// SnapshotDeleteTimeout is how long for snapshot to delete snapshotContent.
 	SnapshotDeleteTimeout = 5 * time.Minute
+	// ControlPlaneLabel is valid label for kubeadm based clusters like kops ONLY
+	ControlPlaneLabel = "node-role.kubernetes.io/control-plane"
 )
 var (
@@ -662,6 +665,17 @@ func RunCmdEnv(env []string, command string, args ...string) (string, string, er
 	return stdout, stderr, nil
 }
+// GetNodeExternalIPs returns a list of external ip address(es) if any for a node
+func GetNodeExternalIPs(node *v1.Node) (ips []string) {
+	for j := range node.Status.Addresses {
+		nodeAddress := &node.Status.Addresses[j]
+		if nodeAddress.Type == v1.NodeExternalIP && nodeAddress.Address != "" {
+			ips = append(ips, nodeAddress.Address)
+		}
+	}
+	return
+}
 // getControlPlaneAddresses returns the externalIP, internalIP and hostname fields of control plane nodes.
 // If any of these is unavailable, empty slices are returned.
 func getControlPlaneAddresses(ctx context.Context, c clientset.Interface) ([]string, []string, []string) {
@@ -694,6 +708,33 @@ func getControlPlaneAddresses(ctx context.Context, c clientset.Interface) ([]str
 	return externalIPs, internalIPs, hostnames
 }
+// GetControlPlaneNodes returns a list of control plane nodes
+func GetControlPlaneNodes(ctx context.Context, c clientset.Interface) *v1.NodeList {
+	allNodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	ExpectNoError(err, "error reading all nodes")
+	var cpNodes v1.NodeList
+	for _, node := range allNodes.Items {
+		// Check for the control plane label
+		if _, hasLabel := node.Labels[ControlPlaneLabel]; hasLabel {
+			cpNodes.Items = append(cpNodes.Items, node)
+			continue
+		}
+		// Check for the specific taint
+		for _, taint := range node.Spec.Taints {
+			// NOTE the taint key is the same as the control plane label
+			if taint.Key == ControlPlaneLabel && taint.Effect == v1.TaintEffectNoSchedule {
+				cpNodes.Items = append(cpNodes.Items, node)
+				continue
+			}
+		}
+	}
+	return &cpNodes
+}
 // GetControlPlaneAddresses returns all IP addresses on which the kubelet can reach the control plane.
 // It may return internal and external IPs, even if we expect for
 // e.g. internal IPs to be used (issue #56787), so that we can be
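The two new exported helpers are intended to be used together: GetControlPlaneNodes selects nodes by the control-plane label or the matching NoSchedule taint, and GetNodeExternalIPs picks the addresses to SSH to. A minimal illustrative caller is sketched below; the wrapper name sshHostForFirstControlPlane is hypothetical and not part of this patch:

import (
	"context"

	"github.com/onsi/gomega"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/test/e2e/framework"
)

// sshHostForFirstControlPlane is a hypothetical convenience wrapper around the
// helpers added in this patch; it returns "<external-ip>:22" for the first
// control-plane node found in the cluster.
func sshHostForFirstControlPlane(ctx context.Context, c clientset.Interface) string {
	nodes := framework.GetControlPlaneNodes(ctx, c)
	gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(),
		"at least one node with label %s should exist.", framework.ControlPlaneLabel)

	ips := framework.GetNodeExternalIPs(&nodes.Items[0])
	gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external ip should exist.")
	return ips[0] + ":22"
}

Because GetControlPlaneNodes matches either the label or the corresponding taint, callers do not need to know which convention their provisioner follows.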


@@ -20,9 +20,6 @@ import (
 	"context"
 	corev1 "k8s.io/api/core/v1"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/labels"
-	clientset "k8s.io/client-go/kubernetes"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	admissionapi "k8s.io/pod-security-admission/api"
@@ -31,10 +28,6 @@ import (
 	"github.com/onsi/gomega"
 )
-const (
-	controlPlaneLabel = "node-role.kubernetes.io/control-plane"
-)
 // Define container for all the test specification aimed at verifying
 // that kubeadm configures the control-plane node as expected
 var _ = Describe("control-plane node", func() {
@@ -51,22 +44,14 @@ var _ = Describe("control-plane node", func() {
 	// in case you can skip this test with SKIP=multi-node
 	ginkgo.It("should be labelled and tainted [multi-node]", func(ctx context.Context) {
 		// get all control-plane nodes (and this implicitly checks that node are properly labeled)
-		controlPlanes := getControlPlaneNodes(ctx, f.ClientSet)
+		controlPlanes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
 		// checks if there is at least one control-plane node
-		gomega.Expect(controlPlanes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist. if you are running test on a single-node cluster, you can skip this test with SKIP=multi-node", controlPlaneLabel)
+		gomega.Expect(controlPlanes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist. if you are running test on a single-node cluster, you can skip this test with SKIP=multi-node", framework.ControlPlaneLabel)
 		// checks that the control-plane nodes have the expected taints
 		for _, cp := range controlPlanes.Items {
-			e2enode.ExpectNodeHasTaint(ctx, f.ClientSet, cp.GetName(), &corev1.Taint{Key: controlPlaneLabel, Effect: corev1.TaintEffectNoSchedule})
+			e2enode.ExpectNodeHasTaint(ctx, f.ClientSet, cp.GetName(), &corev1.Taint{Key: framework.ControlPlaneLabel, Effect: corev1.TaintEffectNoSchedule})
 		}
 	})
 })
-func getControlPlaneNodes(ctx context.Context, c clientset.Interface) *corev1.NodeList {
-	selector := labels.Set{controlPlaneLabel: ""}.AsSelector()
-	cpNodes, err := c.CoreV1().Nodes().
-		List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
-	framework.ExpectNoError(err, "error reading control-plane nodes")
-	return cpNodes
-}