Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-08-06 18:54:06 +00:00)

Merge pull request #127055 from dims/fix-etcd-failures-in-ci-kubernetes-e2e-cos-gce-disruptive-canary

Fix etcd failures in ci-kubernetes-e2e-cos-gce-disruptive-canary

Commit: 33c47290e2
@@ -34,6 +34,7 @@ import (
 	admissionapi "k8s.io/pod-security-admission/api"

 	"github.com/onsi/ginkgo/v2"
+	"github.com/onsi/gomega"
 )

 var _ = SIGDescribe("Etcd failure", framework.WithDisruptive(), func() {
@@ -47,7 +48,7 @@ var _ = SIGDescribe("Etcd failure", framework.WithDisruptive(), func() {
 		// - master access
 		// ... so the provider check should be identical to the intersection of
 		// providers that provide those capabilities.
-		e2eskipper.SkipUnlessProviderIs("gce")
+		e2eskipper.SkipUnlessProviderIs("gce", "aws")
 		e2eskipper.SkipUnlessSSHKeyPresent()

 		err := e2erc.RunRC(ctx, testutils.RCConfig{
@@ -80,7 +81,7 @@ var _ = SIGDescribe("Etcd failure", framework.WithDisruptive(), func() {
 })

 func etcdFailTest(ctx context.Context, f *framework.Framework, failCommand, fixCommand string) {
-	doEtcdFailure(ctx, failCommand, fixCommand)
+	doEtcdFailure(ctx, f, failCommand, fixCommand)

 	checkExistingRCRecovers(ctx, f)

@@ -94,17 +95,30 @@ func etcdFailTest(ctx context.Context, f *framework.Framework, failCommand, fixC
 // master and go on to assert that etcd and kubernetes components recover.
 const etcdFailureDuration = 20 * time.Second

-func doEtcdFailure(ctx context.Context, failCommand, fixCommand string) {
+func doEtcdFailure(ctx context.Context, f *framework.Framework, failCommand, fixCommand string) {
 	ginkgo.By("failing etcd")

-	masterExec(ctx, failCommand)
+	masterExec(ctx, f, failCommand)
 	time.Sleep(etcdFailureDuration)
-	masterExec(ctx, fixCommand)
+	masterExec(ctx, f, fixCommand)
 }

-func masterExec(ctx context.Context, cmd string) {
-	host := framework.APIAddress() + ":22"
+func masterExec(ctx context.Context, f *framework.Framework, cmd string) {
+	nodes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
+	// checks if there is at least one control-plane node
+	gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(),
+		"at least one node with label %s should exist.", framework.ControlPlaneLabel)
+
+	ips := framework.GetNodeExternalIPs(&nodes.Items[0])
+	gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external ip should exist.")
+
+	host := ips[0] + ":22"
-	result, err := e2essh.SSH(ctx, cmd, host, framework.TestContext.Provider)
-	framework.ExpectNoError(err)
-	e2essh.LogResult(result)
+	result, err := e2essh.SSH(ctx, cmd, host, framework.TestContext.Provider)
+	framework.ExpectNoError(err, "failed to SSH to host %s on provider %s and run command: %q", host, framework.TestContext.Provider, cmd)
 	if result.Code != 0 {
 		e2essh.LogResult(result)
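For readers following the change: the hunk above swaps the hard-coded framework.APIAddress() SSH target for the first control-plane node's first external IP. Below is a minimal sketch of that resolution, assuming the framework.GetControlPlaneNodes and framework.GetNodeExternalIPs helpers added later in this diff; controlPlaneSSHHost itself is a hypothetical helper, not part of the commit, and it returns an error instead of failing the test.

```go
package example

import (
	"context"
	"fmt"

	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/test/e2e/framework"
)

// controlPlaneSSHHost resolves "<first external IP>:22", the target the
// reworked masterExec dials over SSH. Sketch only, not part of the commit.
func controlPlaneSSHHost(ctx context.Context, c clientset.Interface) (string, error) {
	// List nodes that carry the control-plane label or the NoSchedule taint.
	nodes := framework.GetControlPlaneNodes(ctx, c)
	if len(nodes.Items) == 0 {
		return "", fmt.Errorf("no node with label %s found", framework.ControlPlaneLabel)
	}

	// Keep only ExternalIP addresses of the first control-plane node.
	ips := framework.GetNodeExternalIPs(&nodes.Items[0])
	if len(ips) == 0 {
		return "", fmt.Errorf("node %s has no external IP", nodes.Items[0].Name)
	}
	return ips[0] + ":22", nil
}
```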
@@ -19,6 +19,7 @@ package apps
 import (
 	"context"
 	"fmt"
+	"github.com/onsi/gomega"
 	"strconv"
 	"time"

@@ -40,7 +41,6 @@ import (
 	e2erc "k8s.io/kubernetes/test/e2e/framework/rc"
 	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
 	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
-	testfwk "k8s.io/kubernetes/test/integration/framework"
 	testutils "k8s.io/kubernetes/test/utils"
 	imageutils "k8s.io/kubernetes/test/utils/image"
 	admissionapi "k8s.io/pod-security-admission/api"
@@ -278,12 +278,18 @@ var _ = SIGDescribe("DaemonRestart", framework.WithDisruptive(), func() {

 		// Requires master ssh access.
 		e2eskipper.SkipUnlessProviderIs("gce", "aws")
-		nodes, err := getControlPlaneNodes(ctx, f.ClientSet)
-		framework.ExpectNoError(err)
+		nodes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
+
+		// checks if there is at least one control-plane node
+		gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist.", framework.ControlPlaneLabel)
+
 		for i := range nodes.Items {

+			ips := framework.GetNodeExternalIPs(&nodes.Items[i])
+			gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external ip should exist.")
+
 			restarter := NewRestartConfig(
-				getFirstIPforNode(&nodes.Items[i]), "kube-controller", ports.KubeControllerManagerPort, restartPollInterval, restartTimeout, true)
+				ips[0], "kube-controller", ports.KubeControllerManagerPort, restartPollInterval, restartTimeout, true)
 			restarter.restart(ctx)

 			// The intent is to ensure the replication controller manager has observed and reported status of
@@ -313,11 +319,17 @@ var _ = SIGDescribe("DaemonRestart", framework.WithDisruptive(), func() {
 	ginkgo.It("Scheduler should continue assigning pods to nodes across restart", func(ctx context.Context) {
 		// Requires master ssh access.
 		e2eskipper.SkipUnlessProviderIs("gce", "aws")
-		nodes, err := getControlPlaneNodes(ctx, f.ClientSet)
-		framework.ExpectNoError(err)
+		nodes := framework.GetControlPlaneNodes(ctx, f.ClientSet)
+
+		// checks if there is at least one control-plane node
+		gomega.Expect(nodes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist.", framework.ControlPlaneLabel)
+
 		for i := range nodes.Items {
+			ips := framework.GetNodeExternalIPs(&nodes.Items[i])
+			gomega.Expect(ips).NotTo(gomega.BeEmpty(), "at least one external ip should exist.")
+
 			restarter := NewRestartConfig(
-				getFirstIPforNode(&nodes.Items[i]), "kube-scheduler", kubeschedulerconfig.DefaultKubeSchedulerPort, restartPollInterval, restartTimeout, true)
+				ips[0], "kube-scheduler", kubeschedulerconfig.DefaultKubeSchedulerPort, restartPollInterval, restartTimeout, true)

 			// Create pods while the scheduler is down and make sure the scheduler picks them up by
 			// scaling the rc to the same size.
@@ -367,42 +379,3 @@ var _ = SIGDescribe("DaemonRestart", framework.WithDisruptive(), func() {
 		}
 	})
 })
-
-func getFirstIPforNode(node *v1.Node) string {
-	var ips []string
-	ips = append(ips, getAddresses(node, v1.NodeExternalIP)...)
-	if len(ips) == 0 {
-		// If ExternalIP isn't set, assume the test programs can reach the InternalIP
-		ips = append(ips, getAddresses(node, v1.NodeInternalIP)...)
-	}
-	if len(ips) == 0 {
-		framework.Failf("did not find any ip(s) for node: %v", node)
-	}
-	return ips[0]
-}
-
-func getAddresses(node *v1.Node, addressType v1.NodeAddressType) (ips []string) {
-	for j := range node.Status.Addresses {
-		nodeAddress := &node.Status.Addresses[j]
-		if nodeAddress.Type == addressType && nodeAddress.Address != "" {
-			ips = append(ips, nodeAddress.Address)
-		}
-	}
-	return
-}
-
-func getControlPlaneNodes(ctx context.Context, c clientset.Interface) (nodes *v1.NodeList, err error) {
-	nodes, err = c.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
-	if err != nil {
-		return nil, err
-	}
-	testfwk.Filter(nodes, func(node v1.Node) bool {
-		_, isMaster := node.Labels["node-role.kubernetes.io/master"]
-		_, isControlPlane := node.Labels["node-role.kubernetes.io/control-plane"]
-		return isMaster || isControlPlane
-	})
-	if len(nodes.Items) == 0 {
-		return nil, fmt.Errorf("there are currently no ready, schedulable control plane nodes in the cluster")
-	}
-	return nodes, nil
-}
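The two DaemonRestart hunks above follow the same pattern: resolve every control-plane node's first external IP and feed it to NewRestartConfig, instead of going through the removed getFirstIPforNode helper. Below is a sketch of that loop shape, assuming the framework helpers introduced by this commit; forEachControlPlaneIP and its restart callback are illustrative stand-ins for restarter.restart(ctx), not part of the commit.

```go
package example

import (
	"context"
	"fmt"

	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/test/e2e/framework"
)

// forEachControlPlaneIP mirrors the per-node loop used in the DaemonRestart
// tests: every control-plane node must expose an external IP, and the first
// one is handed to the restart callback. Sketch only, not part of the commit.
func forEachControlPlaneIP(ctx context.Context, c clientset.Interface,
	restart func(ctx context.Context, hostIP string) error) error {
	nodes := framework.GetControlPlaneNodes(ctx, c)
	if len(nodes.Items) == 0 {
		return fmt.Errorf("no node with label %s found", framework.ControlPlaneLabel)
	}
	for i := range nodes.Items {
		ips := framework.GetNodeExternalIPs(&nodes.Items[i])
		if len(ips) == 0 {
			return fmt.Errorf("node %s has no external IP", nodes.Items[i].Name)
		}
		if err := restart(ctx, ips[0]); err != nil {
			return err
		}
	}
	return nil
}
```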
@@ -128,6 +128,9 @@ const (

 	// SnapshotDeleteTimeout is how long for snapshot to delete snapshotContent.
 	SnapshotDeleteTimeout = 5 * time.Minute
+
+	// ControlPlaneLabel is valid label for kubeadm based clusters like kops ONLY
+	ControlPlaneLabel = "node-role.kubernetes.io/control-plane"
 )

 var (
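The new ControlPlaneLabel constant can also be used as a server-side label selector; the sketch below mirrors the kubeadm helper that is removed further down in this diff, and listLabeledControlPlaneNodes is illustrative only. Note that framework.GetControlPlaneNodes (added below) deliberately lists all nodes instead, so it also catches nodes that carry only the NoSchedule taint.

```go
package example

import (
	"context"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	clientset "k8s.io/client-go/kubernetes"
)

// ControlPlaneLabel mirrors the constant added to the framework package above.
const ControlPlaneLabel = "node-role.kubernetes.io/control-plane"

// listLabeledControlPlaneNodes returns only nodes that carry the label,
// letting the API server do the filtering.
func listLabeledControlPlaneNodes(ctx context.Context, c clientset.Interface) (*v1.NodeList, error) {
	selector := labels.Set{ControlPlaneLabel: ""}.AsSelector()
	return c.CoreV1().Nodes().List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
}
```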
@@ -662,6 +665,17 @@ func RunCmdEnv(env []string, command string, args ...string) (string, string, er
 	return stdout, stderr, nil
 }

+// GetNodeExternalIPs returns a list of external ip address(es) if any for a node
+func GetNodeExternalIPs(node *v1.Node) (ips []string) {
+	for j := range node.Status.Addresses {
+		nodeAddress := &node.Status.Addresses[j]
+		if nodeAddress.Type == v1.NodeExternalIP && nodeAddress.Address != "" {
+			ips = append(ips, nodeAddress.Address)
+		}
+	}
+	return
+}
+
 // getControlPlaneAddresses returns the externalIP, internalIP and hostname fields of control plane nodes.
 // If any of these is unavailable, empty slices are returned.
 func getControlPlaneAddresses(ctx context.Context, c clientset.Interface) ([]string, []string, []string) {
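A small usage sketch for the new GetNodeExternalIPs helper, assuming the e2e framework package from this commit is importable; the node and its documentation-range addresses are made up for illustration.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/test/e2e/framework"
)

func main() {
	node := v1.Node{
		Status: v1.NodeStatus{
			Addresses: []v1.NodeAddress{
				{Type: v1.NodeInternalIP, Address: "10.0.0.5"},
				{Type: v1.NodeExternalIP, Address: "203.0.113.10"},
				{Type: v1.NodeHostName, Address: "cp-0"},
			},
		},
	}

	// Only non-empty ExternalIP entries are returned.
	ips := framework.GetNodeExternalIPs(&node)
	fmt.Println(ips) // expected output: [203.0.113.10]
}
```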
@@ -694,6 +708,33 @@ func getControlPlaneAddresses(ctx context.Context, c clientset.Interface) ([]str
 	return externalIPs, internalIPs, hostnames
 }

+// GetControlPlaneNodes returns a list of control plane nodes
+func GetControlPlaneNodes(ctx context.Context, c clientset.Interface) *v1.NodeList {
+	allNodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	ExpectNoError(err, "error reading all nodes")
+
+	var cpNodes v1.NodeList
+
+	for _, node := range allNodes.Items {
+		// Check for the control plane label
+		if _, hasLabel := node.Labels[ControlPlaneLabel]; hasLabel {
+			cpNodes.Items = append(cpNodes.Items, node)
+			continue
+		}
+
+		// Check for the specific taint
+		for _, taint := range node.Spec.Taints {
+			// NOTE the taint key is the same as the control plane label
+			if taint.Key == ControlPlaneLabel && taint.Effect == v1.TaintEffectNoSchedule {
+				cpNodes.Items = append(cpNodes.Items, node)
+				continue
+			}
+		}
+	}
+
+	return &cpNodes
+}
+
 // GetControlPlaneAddresses returns all IP addresses on which the kubelet can reach the control plane.
 // It may return internal and external IPs, even if we expect for
 // e.g. internal IPs to be used (issue #56787), so that we can be
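A usage sketch for the new GetControlPlaneNodes helper, using client-go's fake clientset to show that a node is selected either by the control-plane label or by the matching NoSchedule taint; the node names are made up, and importing the e2e framework package outside the kubernetes repo is assumed to be possible here.

```go
package main

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes/fake"
	"k8s.io/kubernetes/test/e2e/framework"
)

func main() {
	ctx := context.Background()

	// Selected via the label.
	labeled := &v1.Node{ObjectMeta: metav1.ObjectMeta{
		Name:   "cp-labeled",
		Labels: map[string]string{framework.ControlPlaneLabel: ""},
	}}
	// Selected via the NoSchedule taint only (no label).
	tainted := &v1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: "cp-tainted"},
		Spec: v1.NodeSpec{Taints: []v1.Taint{{
			Key:    framework.ControlPlaneLabel,
			Effect: v1.TaintEffectNoSchedule,
		}}},
	}
	// Not selected.
	worker := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "worker-0"}}

	client := fake.NewSimpleClientset(labeled, tainted, worker)
	for _, n := range framework.GetControlPlaneNodes(ctx, client).Items {
		fmt.Println(n.Name) // expected: cp-labeled and cp-tainted
	}
}
```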
@@ -20,9 +20,6 @@ import (
 	"context"

 	corev1 "k8s.io/api/core/v1"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/labels"
-	clientset "k8s.io/client-go/kubernetes"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	admissionapi "k8s.io/pod-security-admission/api"
@@ -31,10 +28,6 @@ import (
 	"github.com/onsi/gomega"
 )

-const (
-	controlPlaneLabel = "node-role.kubernetes.io/control-plane"
-)
-
 // Define container for all the test specification aimed at verifying
 // that kubeadm configures the control-plane node as expected
 var _ = Describe("control-plane node", func() {
@@ -51,22 +44,14 @@ var _ = Describe("control-plane node", func() {
 	// in case you can skip this test with SKIP=multi-node
 	ginkgo.It("should be labelled and tainted [multi-node]", func(ctx context.Context) {
 		// get all control-plane nodes (and this implicitly checks that node are properly labeled)
-		controlPlanes := getControlPlaneNodes(ctx, f.ClientSet)
+		controlPlanes := framework.GetControlPlaneNodes(ctx, f.ClientSet)

 		// checks if there is at least one control-plane node
-		gomega.Expect(controlPlanes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist. if you are running test on a single-node cluster, you can skip this test with SKIP=multi-node", controlPlaneLabel)
+		gomega.Expect(controlPlanes.Items).NotTo(gomega.BeEmpty(), "at least one node with label %s should exist. if you are running test on a single-node cluster, you can skip this test with SKIP=multi-node", framework.ControlPlaneLabel)

 		// checks that the control-plane nodes have the expected taints
 		for _, cp := range controlPlanes.Items {
-			e2enode.ExpectNodeHasTaint(ctx, f.ClientSet, cp.GetName(), &corev1.Taint{Key: controlPlaneLabel, Effect: corev1.TaintEffectNoSchedule})
+			e2enode.ExpectNodeHasTaint(ctx, f.ClientSet, cp.GetName(), &corev1.Taint{Key: framework.ControlPlaneLabel, Effect: corev1.TaintEffectNoSchedule})
 		}
 	})
 })
-
-func getControlPlaneNodes(ctx context.Context, c clientset.Interface) *corev1.NodeList {
-	selector := labels.Set{controlPlaneLabel: ""}.AsSelector()
-	cpNodes, err := c.CoreV1().Nodes().
-		List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
-	framework.ExpectNoError(err, "error reading control-plane nodes")
-	return cpNodes
-}