Fix restart nodes tests for Regional Clusters
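In a regional cluster the nodes are spread across several zones, and `gcloud compute instances reset` must be given the zone each instance lives in, so a single reset call that hard-codes framework.TestContext.CloudConfig.Zone cannot reach all of them. common.RestartNodes now takes the full []v1.Node objects, buckets the instances by their failure-domain zone label (falling back to the configured zone), and issues one reset per zone. framework.CheckNodesReady is reworked to return the ready []v1.Node with the argument order (client, count, timeout), WaitForReadyNodes becomes a thin wrapper around it, and the duplicated restartNodes helper in the restart test is dropped in favour of the shared common.RestartNodes.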
@@ -377,9 +377,9 @@ var _ = SIGDescribe("Network Partition [Disruptive] [Slow]", func() {
 
 			nn, err := framework.NumberOfRegisteredNodes(f.ClientSet)
 			framework.ExpectNoError(err)
-			nodeNames, err := framework.CheckNodesReady(f.ClientSet, framework.NodeReadyInitialTimeout, nn)
+			nodes, err := framework.CheckNodesReady(f.ClientSet, nn, framework.NodeReadyInitialTimeout)
 			framework.ExpectNoError(err)
-			common.RestartNodes(f.ClientSet, nodeNames)
+			common.RestartNodes(f.ClientSet, nodes)
 
 			By("waiting for pods to be running again")
 			pst.WaitForRunningAndReady(*ps.Spec.Replicas, ps)
@@ -41,6 +41,7 @@ go_library(
         "//pkg/client/clientset_generated/internalclientset:go_default_library",
         "//pkg/client/conditions:go_default_library",
         "//pkg/kubelet:go_default_library",
+        "//pkg/kubelet/apis:go_default_library",
         "//pkg/kubelet/sysctl:go_default_library",
         "//pkg/security/apparmor:go_default_library",
         "//pkg/util/version:go_default_library",
@@ -26,6 +26,7 @@ import (
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apimachinery/pkg/util/wait"
 	clientset "k8s.io/client-go/kubernetes"
+	kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
 	"k8s.io/kubernetes/test/e2e/framework"
 	imageutils "k8s.io/kubernetes/test/utils/image"
 
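The new //pkg/kubelet/apis BUILD dependency and the kubeletapis import above bring in kubeletapis.LabelZoneFailureDomain, the zone label that the reworked RestartNodes in the next hunk consults when grouping nodes by zone.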
@@ -98,39 +99,45 @@ func NewRCByName(c clientset.Interface, ns, name string, replicas int32, gracePe
 		name, replicas, framework.ServeHostnameImage, 9376, v1.ProtocolTCP, map[string]string{}, gracePeriod))
 }
 
-func RestartNodes(c clientset.Interface, nodeNames []string) error {
-	// List old boot IDs.
-	oldBootIDs := make(map[string]string)
-	for _, name := range nodeNames {
-		node, err := c.CoreV1().Nodes().Get(name, metav1.GetOptions{})
-		if err != nil {
-			return fmt.Errorf("error getting node info before reboot: %s", err)
+func RestartNodes(c clientset.Interface, nodes []v1.Node) error {
+	// Build mapping from zone to nodes in that zone.
+	nodeNamesByZone := make(map[string][]string)
+	for i := range nodes {
+		node := &nodes[i]
+		zone := framework.TestContext.CloudConfig.Zone
+		if z, ok := node.Labels[kubeletapis.LabelZoneFailureDomain]; ok {
+			zone = z
 		}
-		oldBootIDs[name] = node.Status.NodeInfo.BootID
+		nodeNamesByZone[zone] = append(nodeNamesByZone[zone], node.Name)
 	}
 
 	// Reboot the nodes.
-	args := []string{
-		"compute",
-		fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID),
-		"instances",
-		"reset",
-	}
-	args = append(args, nodeNames...)
-	args = append(args, fmt.Sprintf("--zone=%s", framework.TestContext.CloudConfig.Zone))
-	stdout, stderr, err := framework.RunCmd("gcloud", args...)
-	if err != nil {
-		return fmt.Errorf("error restarting nodes: %s\nstdout: %s\nstderr: %s", err, stdout, stderr)
+	for zone, nodeNames := range nodeNamesByZone {
+		args := []string{
+			"compute",
+			fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID),
+			"instances",
+			"reset",
+		}
+		args = append(args, nodeNames...)
+		args = append(args, fmt.Sprintf("--zone=%s", zone))
+		stdout, stderr, err := framework.RunCmd("gcloud", args...)
+		if err != nil {
+			return fmt.Errorf("error restarting nodes: %s\nstdout: %s\nstderr: %s", err, stdout, stderr)
+		}
 	}
 
 	// Wait for their boot IDs to change.
-	for _, name := range nodeNames {
+	for i := range nodes {
+		node := &nodes[i]
 		if err := wait.Poll(30*time.Second, 5*time.Minute, func() (bool, error) {
-			node, err := c.CoreV1().Nodes().Get(name, metav1.GetOptions{})
+			newNode, err := c.CoreV1().Nodes().Get(node.Name, metav1.GetOptions{})
 			if err != nil {
 				return false, fmt.Errorf("error getting node info after reboot: %s", err)
 			}
-			return node.Status.NodeInfo.BootID != oldBootIDs[name], nil
+			return node.Status.NodeInfo.BootID != newNode.Status.NodeInfo.BootID, nil
 		}); err != nil {
-			return fmt.Errorf("error waiting for node %s boot ID to change: %s", name, err)
+			return fmt.Errorf("error waiting for node %s boot ID to change: %s", node.Name, err)
 		}
 	}
 	return nil
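The zone bucketing above is the heart of the fix. Below is a minimal standalone sketch of just that step; it is not part of the commit, it hard-codes the beta zone label string that kubeletapis.LabelZoneFailureDomain resolved to in this release (an assumption), and the hypothetical groupByZone helper takes an explicit default zone in place of framework.TestContext.CloudConfig.Zone.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Assumed value of kubeletapis.LabelZoneFailureDomain at the time of this commit.
const zoneLabel = "failure-domain.beta.kubernetes.io/zone"

// groupByZone buckets node names by the zone recorded in their labels,
// falling back to defaultZone when the label is missing.
func groupByZone(nodes []v1.Node, defaultZone string) map[string][]string {
	byZone := make(map[string][]string)
	for i := range nodes {
		zone := defaultZone
		if z, ok := nodes[i].Labels[zoneLabel]; ok {
			zone = z
		}
		byZone[zone] = append(byZone[zone], nodes[i].Name)
	}
	return byZone
}

func main() {
	nodes := []v1.Node{
		{ObjectMeta: metav1.ObjectMeta{Name: "node-a", Labels: map[string]string{zoneLabel: "us-central1-a"}}},
		{ObjectMeta: metav1.ObjectMeta{Name: "node-b", Labels: map[string]string{zoneLabel: "us-central1-b"}}},
		{ObjectMeta: metav1.ObjectMeta{Name: "node-c"}}, // unlabeled, falls back to the default zone
	}
	// Each zone then gets its own "gcloud compute instances reset ... --zone=<zone>" invocation.
	fmt.Println(groupByZone(nodes, "us-central1-f"))
}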
@@ -24,11 +24,7 @@ import (
 	"strings"
 	"time"
 
-	"k8s.io/api/core/v1"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/fields"
 	"k8s.io/apimachinery/pkg/util/wait"
-	clientset "k8s.io/client-go/kubernetes"
 )
 
 func EtcdUpgrade(target_storage, target_version string) error {
@@ -215,7 +211,7 @@ func NodeUpgrade(f *Framework, v string, img string) error {
 	// TODO(ihmccreery) We shouldn't have to wait for nodes to be ready in
 	// GKE; the operation shouldn't return until they all are.
 	Logf("Waiting up to %v for all nodes to be ready after the upgrade", RestartNodeReadyAgainTimeout)
-	if _, err := CheckNodesReady(f.ClientSet, RestartNodeReadyAgainTimeout, TestContext.CloudConfig.NumNodes); err != nil {
+	if _, err := CheckNodesReady(f.ClientSet, TestContext.CloudConfig.NumNodes, RestartNodeReadyAgainTimeout); err != nil {
 		return err
 	}
 	return nil
@@ -229,7 +225,7 @@ func NodeUpgradeGCEWithKubeProxyDaemonSet(f *Framework, v string, img string, en
 	}
 	// Wait for it to complete and validate nodes are healthy.
 	Logf("Waiting up to %v for all nodes to be ready after the upgrade", RestartNodeReadyAgainTimeout)
-	if _, err := CheckNodesReady(f.ClientSet, RestartNodeReadyAgainTimeout, TestContext.CloudConfig.NumNodes); err != nil {
+	if _, err := CheckNodesReady(f.ClientSet, TestContext.CloudConfig.NumNodes, RestartNodeReadyAgainTimeout); err != nil {
 		return err
 	}
 	return nil
@@ -274,63 +270,6 @@ func nodeUpgradeGKE(v string, img string) error {
 	return nil
 }
 
-// CheckNodesReady waits up to nt for expect nodes accessed by c to be ready,
-// returning an error if this doesn't happen in time. It returns the names of
-// nodes it finds.
-func CheckNodesReady(c clientset.Interface, nt time.Duration, expect int) ([]string, error) {
-	// First, keep getting all of the nodes until we get the number we expect.
-	var nodeList *v1.NodeList
-	var errLast error
-	start := time.Now()
-	found := wait.Poll(Poll, nt, func() (bool, error) {
-		// A rolling-update (GCE/GKE implementation of restart) can complete before the apiserver
-		// knows about all of the nodes. Thus, we retry the list nodes call
-		// until we get the expected number of nodes.
-		nodeList, errLast = c.CoreV1().Nodes().List(metav1.ListOptions{
-			FieldSelector: fields.Set{"spec.unschedulable": "false"}.AsSelector().String()})
-		if errLast != nil {
-			return false, nil
-		}
-		if len(nodeList.Items) != expect {
-			errLast = fmt.Errorf("expected to find %d nodes but found only %d (%v elapsed)",
-				expect, len(nodeList.Items), time.Since(start))
-			Logf("%v", errLast)
-			return false, nil
-		}
-		return true, nil
-	}) == nil
-	nodeNames := make([]string, len(nodeList.Items))
-	for i, n := range nodeList.Items {
-		nodeNames[i] = n.ObjectMeta.Name
-	}
-	if !found {
-		return nodeNames, fmt.Errorf("couldn't find %d nodes within %v; last error: %v",
-			expect, nt, errLast)
-	}
-	Logf("Successfully found %d nodes", expect)
-
-	// Next, ensure in parallel that all the nodes are ready. We subtract the
-	// time we spent waiting above.
-	timeout := nt - time.Since(start)
-	result := make(chan bool, len(nodeList.Items))
-	for _, n := range nodeNames {
-		n := n
-		go func() { result <- WaitForNodeToBeReady(c, n, timeout) }()
-	}
-	failed := false
-
-	for i := range nodeList.Items {
-		_ = i
-		if !<-result {
-			failed = true
-		}
-	}
-	if failed {
-		return nodeNames, fmt.Errorf("at least one node failed to be ready")
-	}
-	return nodeNames, nil
-}
-
 // MigTemplate (GCE-only) returns the name of the MIG template that the
 // nodes of the cluster use.
 func MigTemplate() (string, error) {
@@ -4123,9 +4123,10 @@ func NumberOfReadyNodes(c clientset.Interface) (int, error) {
 	return len(nodes.Items), nil
 }
 
-// WaitForReadyNodes waits until the cluster has desired size and there is no not-ready nodes in it.
-// By cluster size we mean number of Nodes excluding Master Node.
-func WaitForReadyNodes(c clientset.Interface, size int, timeout time.Duration) error {
+// CheckNodesReady waits up to timeout for cluster to has desired size and
+// there is no not-ready nodes in it. By cluster size we mean number of Nodes
+// excluding Master Node.
+func CheckNodesReady(c clientset.Interface, size int, timeout time.Duration) ([]v1.Node, error) {
 	for start := time.Now(); time.Since(start) < timeout; time.Sleep(20 * time.Second) {
 		nodes, err := waitListSchedulableNodes(c)
 		if err != nil {
@@ -4142,11 +4143,19 @@ func WaitForReadyNodes(c clientset.Interface, size int, timeout time.Duration) e
 
 		if numNodes == size && numReady == size {
 			Logf("Cluster has reached the desired number of ready nodes %d", size)
-			return nil
+			return nodes.Items, nil
 		}
 		Logf("Waiting for ready nodes %d, current ready %d, not ready nodes %d", size, numReady, numNodes-numReady)
 	}
-	return fmt.Errorf("timeout waiting %v for number of ready nodes to be %d", timeout, size)
+	return nil, fmt.Errorf("timeout waiting %v for number of ready nodes to be %d", timeout, size)
+}
+
+// WaitForReadyNodes waits up to timeout for cluster to has desired size and
+// there is no not-ready nodes in it. By cluster size we mean number of Nodes
+// excluding Master Node.
+func WaitForReadyNodes(c clientset.Interface, size int, timeout time.Duration) error {
+	_, err := CheckNodesReady(c, size, timeout)
+	return err
 }
 
 func GenerateMasterRegexp(prefix string) string {
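After the refactor above there are two entry points: CheckNodesReady for callers that need the ready v1.Node objects, and WaitForReadyNodes for callers that only need an error-or-nil answer. A small caller-side sketch follows; it is not part of the commit, and the node count and timeout are placeholder values.

package restartsketch

import (
	"time"

	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/test/e2e/framework"
)

// waitAndCollect contrasts the two entry points; it needs a client for a live cluster.
func waitAndCollect(c clientset.Interface, numNodes int) error {
	// CheckNodesReady returns the ready nodes, so the result can be handed
	// straight to common.RestartNodes(c, nodes).
	nodes, err := framework.CheckNodesReady(c, numNodes, 10*time.Minute)
	if err != nil {
		return err
	}
	_ = nodes

	// WaitForReadyNodes keeps the old error-only contract; per the diff it now
	// just wraps CheckNodesReady and discards the node list.
	return framework.WaitForReadyNodes(c, numNodes, 10*time.Minute)
}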
@@ -26,6 +26,7 @@ import (
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/util/wait"
 	kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
+	"k8s.io/kubernetes/test/e2e/common"
 	"k8s.io/kubernetes/test/e2e/framework"
 	testutils "k8s.io/kubernetes/test/utils"
 
@@ -55,10 +56,18 @@ func filterIrrelevantPods(pods []*v1.Pod) []*v1.Pod {
 	return results
 }
 
+func nodeNames(nodes []v1.Node) []string {
+	result := make([]string, 0, len(nodes))
+	for i := range nodes {
+		result = append(result, nodes[i].Name)
+	}
+	return result
+}
+
 var _ = SIGDescribe("Restart [Disruptive]", func() {
 	f := framework.NewDefaultFramework("restart")
 	var ps *testutils.PodStore
-	var originalNodeNames []string
+	var originalNodes []v1.Node
 	var originalPodNames []string
 	var numNodes int
 	var systemNamespace string
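The nodeNames helper added above appears to exist so that the log lines below can keep printing just the node names now that the test tracks []v1.Node rather than []string; logging the full Node objects would be far too verbose.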
@@ -74,9 +83,9 @@ var _ = SIGDescribe("Restart [Disruptive]", func() {
 		systemNamespace = metav1.NamespaceSystem
 
 		By("ensuring all nodes are ready")
-		originalNodeNames, err = framework.CheckNodesReady(f.ClientSet, framework.NodeReadyInitialTimeout, numNodes)
+		originalNodes, err = framework.CheckNodesReady(f.ClientSet, numNodes, framework.NodeReadyInitialTimeout)
 		Expect(err).NotTo(HaveOccurred())
-		framework.Logf("Got the following nodes before restart: %v", originalNodeNames)
+		framework.Logf("Got the following nodes before restart: %v", nodeNames(originalNodes))
 
 		By("ensuring all pods are running and ready")
 		allPods := ps.List()
@@ -100,20 +109,20 @@ var _ = SIGDescribe("Restart [Disruptive]", func() {
 
 	It("should restart all nodes and ensure all nodes and pods recover", func() {
 		By("restarting all of the nodes")
-		err := restartNodes(f, originalNodeNames)
+		err := common.RestartNodes(f.ClientSet, originalNodes)
 		Expect(err).NotTo(HaveOccurred())
 
 		By("ensuring all nodes are ready after the restart")
-		nodeNamesAfter, err := framework.CheckNodesReady(f.ClientSet, framework.RestartNodeReadyAgainTimeout, numNodes)
+		nodesAfter, err := framework.CheckNodesReady(f.ClientSet, numNodes, framework.RestartNodeReadyAgainTimeout)
 		Expect(err).NotTo(HaveOccurred())
-		framework.Logf("Got the following nodes after restart: %v", nodeNamesAfter)
+		framework.Logf("Got the following nodes after restart: %v", nodeNames(nodesAfter))
 
 		// Make sure that we have the same number of nodes. We're not checking
 		// that the names match because that's implementation specific.
 		By("ensuring the same number of nodes exist after the restart")
-		if len(originalNodeNames) != len(nodeNamesAfter) {
+		if len(originalNodes) != len(nodesAfter) {
 			framework.Failf("Had %d nodes before nodes were restarted, but now only have %d",
-				len(originalNodeNames), len(nodeNamesAfter))
+				len(originalNodes), len(nodesAfter))
 		}
 
 		// Make sure that we have the same number of pods. We're not checking
@@ -159,41 +168,3 @@ func waitForNPods(ps *testutils.PodStore, expect int, timeout time.Duration) ([]
 	}
 	return podNames, nil
 }
-
-func restartNodes(f *framework.Framework, nodeNames []string) error {
-	// List old boot IDs.
-	oldBootIDs := make(map[string]string)
-	for _, name := range nodeNames {
-		node, err := f.ClientSet.CoreV1().Nodes().Get(name, metav1.GetOptions{})
-		if err != nil {
-			return fmt.Errorf("error getting node info before reboot: %s", err)
-		}
-		oldBootIDs[name] = node.Status.NodeInfo.BootID
-	}
-	// Reboot the nodes.
-	args := []string{
-		"compute",
-		fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID),
-		"instances",
-		"reset",
-	}
-	args = append(args, nodeNames...)
-	args = append(args, fmt.Sprintf("--zone=%s", framework.TestContext.CloudConfig.Zone))
-	stdout, stderr, err := framework.RunCmd("gcloud", args...)
-	if err != nil {
-		return fmt.Errorf("error restarting nodes: %s\nstdout: %s\nstderr: %s", err, stdout, stderr)
-	}
-	// Wait for their boot IDs to change.
-	for _, name := range nodeNames {
-		if err := wait.Poll(30*time.Second, 5*time.Minute, func() (bool, error) {
-			node, err := f.ClientSet.CoreV1().Nodes().Get(name, metav1.GetOptions{})
-			if err != nil {
-				return false, fmt.Errorf("error getting node info after reboot: %s", err)
-			}
-			return node.Status.NodeInfo.BootID != oldBootIDs[name], nil
-		}); err != nil {
-			return fmt.Errorf("error waiting for node %s boot ID to change: %s", name, err)
-		}
-	}
-	return nil
-}