Mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-07-31 07:20:13 +00:00
Commit 9df413e0e1
test/e2e/reboot.go (new file, 281 lines)
@@ -0,0 +1,281 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
	"fmt"
	"time"

	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

const (
	// How long to pause between polling node or pod status.
	poll = 5 * time.Second

	// How long nodes have to be "ready" before the reboot. They should already
	// be "ready" before the test starts, so this is small.
	nodeReadyInitialTimeout = 20 * time.Second

	// How long pods have to be "ready" before the reboot. They should already
	// be "ready" before the test starts, so this is small.
	podReadyBeforeTimeout = 20 * time.Second

	// How long a node is allowed to go from "Ready" to "NotReady" after a
	// reboot is issued before the test is considered failed.
	rebootNotReadyTimeout = 2 * time.Minute

	// How long a node is allowed to go from "NotReady" to "Ready" after a
	// reboot is issued and it is found to be "NotReady" before the test is
	// considered failed.
	rebootReadyAgainTimeout = 5 * time.Minute

	// How long pods have to be "ready" after the reboot.
	podReadyAgainTimeout = 5 * time.Minute
)

var _ = Describe("Reboot", func() {
	var c *client.Client

	BeforeEach(func() {
		var err error
		c, err = loadClient()
		Expect(err).NotTo(HaveOccurred())
	})

	It("should reboot each node and ensure they function upon restart", func() {
		// This test requires SSH, so the provider check should be identical to
		// the SSH test's (the limiting factor is the implementation of
		// util.go's getSigner(...)).
		provider := testContext.Provider
		if !providerIs("gce") {
			By(fmt.Sprintf("Skipping reboot test, which is not implemented for %s", provider))
			return
		}

		// Get all nodes, and kick off the test on each.
		nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything())
		if err != nil {
			Failf("Error getting nodes: %v", err)
		}
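		// Buffered to one slot per node so each rebootNode goroutine can send
		// its verdict without blocking.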
		result := make(chan bool, len(nodelist.Items))
		for _, n := range nodelist.Items {
			go rebootNode(c, provider, n.ObjectMeta.Name, result)
		}

		// Wait for all to finish and check the final result.
		failed := false
		// TODO(mbforbes): Change to `for range` syntax and remove logging once
		// we support only Go >= 1.4.
		for _, n := range nodelist.Items {
			if !<-result {
				Logf("Node %s failed reboot test.", n.ObjectMeta.Name)
				failed = true
			}
		}
		if failed {
			Failf("Test failed; at least one node failed to reboot in the time given.")
		}
	})
})

// rebootNode takes the node with the given name on provider through the
// following steps, using client c:
//  - ensures the node is ready
//  - ensures all pods on the node are running and ready
//  - reboots the node
//  - ensures the node reaches some non-ready state
//  - ensures the node becomes ready again
//  - ensures all pods on the node become running and ready again
//
// It returns true through result only if all of the steps pass; at the first
// failed step, it will return false through result and not run the rest.
func rebootNode(c *client.Client, provider, name string, result chan bool) {
	// Get the node initially.
	Logf("Getting %s", name)
	node, err := c.Nodes().Get(name)
	if err != nil {
Logf("Gouldn't get node %s", name)
|
||||
result <- false
|
||||
return
|
||||
}

	// Node sanity check: ensure it is "ready".
	if !waitForNodeToBeReady(c, name, nodeReadyInitialTimeout) {
		result <- false
		return
	}

	// Get all the pods on the node.
	podList, err := c.Pods(api.NamespaceDefault).List(
		labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name))
	if err != nil {
		Logf("Error getting pods for node %s: %v", name, err)
		result <- false
		return
	}
	podNames := make([]string, len(podList.Items))
	for i, p := range podList.Items {
		podNames[i] = p.ObjectMeta.Name
	}
	Logf("Node %s has %d pods: %v", name, len(podNames), podNames)

	// For each pod, we do a sanity check to ensure it's running / healthy
	// now, as that's what we'll be checking later.
	if !checkPodsRunning(c, podNames, podReadyBeforeTimeout) {
		result <- false
		return
	}

	// Reboot the node.
	Logf("Getting external IP address for %s", name)
	host := ""
	for _, a := range node.Status.Addresses {
		if a.Type == api.NodeExternalIP {
			host = a.Address + ":22"
			break
		}
	}
	if host == "" {
		Logf("Couldn't find external IP address for node %s", name)
		result <- false
		return
	}
	Logf("Calling reboot on %s", name)
	rebootCmd := "sudo reboot"
	if _, _, code, err := SSH(rebootCmd, host, provider); code != 0 || err != nil {
		Logf("Expected 0 exit code and nil error when running %s on %s, got %d and %v",
			rebootCmd, name, code, err)
		result <- false
		return
	}

	// Wait for some kind of "not ready" status.
	if !waitForNodeToBeNotReady(c, name, rebootNotReadyTimeout) {
		result <- false
		return
	}

	// Wait for some kind of "ready" status.
	if !waitForNodeToBeReady(c, name, rebootReadyAgainTimeout) {
		result <- false
		return
	}

	// Ensure all of the pods that we found on this node before the reboot are
	// running / healthy.
	if !checkPodsRunning(c, podNames, podReadyAgainTimeout) {
		result <- false
		return
	}

	Logf("Reboot successful on node %s", name)
	result <- true
}

// podRunningReady is the checker function passed to waitForPodCondition(...)
// (found in util.go). It ensures that the pod's phase is running and that the
// ready condition is true.
func podRunningReady(p *api.Pod) (bool, error) {
	// Check the phase is running.
	if p.Status.Phase != api.PodRunning {
		return false, fmt.Errorf("want pod %s on %s to be %v but was %v",
			p.ObjectMeta.Name, p.Spec.Host, api.PodRunning, p.Status.Phase)
	}
	// Check the ready condition is true.
	for _, cond := range p.Status.Conditions {
		if cond.Type == api.PodReady && cond.Status == api.ConditionTrue {
			return true, nil
		}
	}
	return false, fmt.Errorf("pod %s on %s didn't have condition %v, %v; conditions: %v",
		p.ObjectMeta.Name, p.Spec.Host, api.PodReady, api.ConditionTrue, p.Status.Conditions)
}

// checkPodsRunning returns whether all pods whose names are listed in podNames
// are running and ready within timeout.
func checkPodsRunning(c *client.Client, podNames []string, timeout time.Duration) bool {
	desc := "running and ready"
	Logf("Waiting up to %v for the following pods to be %s: %s", timeout, desc, podNames)
	result := make(chan bool, len(podNames))
	for _, podName := range podNames {
		// Don't you just love Go? Shadow podName with a per-iteration copy so
		// the goroutine below doesn't capture the reused range variable.
		podName := podName
		// Launch off pod readiness checkers.
		go func() {
			err := waitForPodCondition(c, api.NamespaceDefault, podName, desc,
				poll, timeout, podRunningReady)
			result <- err == nil
		}()
	}
	// Wait for them all to finish.
	success := true
	// TODO(mbforbes): Change to `for range` syntax and remove logging once we
	// support only Go >= 1.4.
	for _, podName := range podNames {
		if !<-result {
			Logf("Pod %s failed to be %s.", podName, desc)
			success = false
		}
	}
	Logf("Wanted all pods to be %s. Result: %t. Pods: %v", desc, success, podNames)
	return success
}

// waitForNodeToBeReady returns whether node name is ready within timeout.
func waitForNodeToBeReady(c *client.Client, name string, timeout time.Duration) bool {
	return waitForNodeToBe(c, name, true, timeout)
}

// waitForNodeToBeNotReady returns whether node name is not ready (i.e. the
// readiness condition is anything but ready, e.g. false or unknown) within
// timeout.
func waitForNodeToBeNotReady(c *client.Client, name string, timeout time.Duration) bool {
	return waitForNodeToBe(c, name, false, timeout)
}

// waitForNodeToBe returns whether node name's readiness state matches wantReady
// within timeout. If wantReady is true, it will ensure the node is ready; if
// it's false, it ensures the node is in any state other than ready (e.g. not
// ready or unknown).
func waitForNodeToBe(c *client.Client, name string, wantReady bool, timeout time.Duration) bool {
	Logf("Waiting up to %v for node %s readiness to be %t", timeout, name, wantReady)
	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
		node, err := c.Nodes().Get(name)
		if err != nil {
			Logf("Couldn't get node %s", name)
			continue
		}

		// Check the node readiness condition (logging all).
		for i, cond := range node.Status.Conditions {
			Logf("Node %s condition %d/%d: type: %v, status: %v",
				name, i+1, len(node.Status.Conditions), cond.Type, cond.Status)
			// Ensure that the condition type is readiness and the status
			// matches as desired.
			if cond.Type == api.NodeReady && (cond.Status == api.ConditionTrue) == wantReady {
				Logf("Successfully found node %s readiness to be %t", name, wantReady)
				return true
			}
		}
	}
	Logf("Node %s didn't reach desired readiness (%t) within %v", name, wantReady, timeout)
	return false
}
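Both the Describe block and checkPodsRunning above use the same fan-out/fan-in idiom: one goroutine per item reports pass/fail on a buffered channel, and the launcher drains exactly as many results as it started. A minimal standalone sketch of that idiom (hypothetical names; not part of this commit):

package main

import "fmt"

// checkItem stands in for rebootNode or a pod readiness checker: it does its
// work and reports a verdict on the shared result channel.
func checkItem(name string, result chan bool) {
	// Real work would go here; this sketch always succeeds.
	result <- true
}

func main() {
	items := []string{"node-1", "node-2", "node-3"}
	// Buffer to len(items) so no sender blocks, even if the receiver stops early.
	result := make(chan bool, len(items))
	for _, it := range items {
		go checkItem(it, result)
	}
	failed := false
	for range items { // receive one verdict per goroutine launched
		if !<-result {
			failed = true
		}
	}
	fmt.Println("all passed:", !failed)
}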
@@ -20,10 +20,6 @@ import (
 	"fmt"
 	"strings"
 
-	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
-	"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
-	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
-
 	. "github.com/onsi/ginkgo"
 	. "github.com/onsi/gomega"
 )
@ -46,28 +42,9 @@ var _ = Describe("SSH", func() {
|
||||
|
||||
// Get all nodes' external IPs.
|
||||
By("Getting all nodes' SSH-able IP addresses")
|
||||
nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything())
|
||||
hosts, err := NodeSSHHosts(c)
|
||||
if err != nil {
|
||||
Failf("Error getting nodes: %v", err)
|
||||
}
|
||||
hosts := make([]string, 0, len(nodelist.Items))
|
||||
for _, n := range nodelist.Items {
|
||||
for _, addr := range n.Status.Addresses {
|
||||
// Use the first external IP address we find on the node, and
|
||||
// use at most one per node.
|
||||
// NOTE: Until #7412 is fixed this will repeatedly ssh into the
|
||||
// master node and not check any of the minions.
|
||||
if addr.Type == api.NodeExternalIP {
|
||||
hosts = append(hosts, addr.Address+":22")
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fail if any node didn't have an external IP.
|
||||
if len(hosts) != len(nodelist.Items) {
|
||||
Failf("Only found %d external IPs on nodes, but found %d nodes. Nodelist: %v",
|
||||
len(hosts), len(nodelist.Items), nodelist)
|
||||
Failf("Error getting node hostnames: %v", err)
|
||||
}
|
||||
|
||||
testCases := []struct {
|
||||
|
@@ -52,6 +52,9 @@ const (
 
 	// String used to mark pod deletion
 	nonExist = "NonExist"
+
+	// How often to poll pods.
+	podPoll = 5 * time.Second
 )
 
 type TestContextType struct {
@@ -88,12 +91,12 @@ func providerIs(providers ...string) bool {
 
 type podCondition func(pod *api.Pod) (bool, error)
 
-func waitForPodCondition(c *client.Client, ns, podName, desc string, condition podCondition) error {
-	By(fmt.Sprintf("waiting up to %v for pod %s status to be %s", podStartTimeout, podName, desc))
-	for start := time.Now(); time.Since(start) < podStartTimeout; time.Sleep(5 * time.Second) {
+func waitForPodCondition(c *client.Client, ns, podName, desc string, poll, timeout time.Duration, condition podCondition) error {
+	Logf("Waiting up to %v for pod %s status to be %s", timeout, podName, desc)
+	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
 		pod, err := c.Pods(ns).Get(podName)
 		if err != nil {
-			Logf("Get pod %v in ns %v failed, ignoring for 5s: %v", podName, ns, err)
+			Logf("Get pod %s in ns %s failed, ignoring for %v: %v", podName, ns, poll, err)
 			continue
 		}
 		done, err := condition(pod)
@@ -102,7 +105,7 @@ func waitForPodCondition(c *client.Client, ns, podName, desc string, condition p
 		}
 		Logf("Waiting for pod %s in namespace %s status to be %q (found %q) (%v)", podName, ns, desc, pod.Status.Phase, time.Since(start))
 	}
-	return fmt.Errorf("gave up waiting for pod %s to be %s after %.2f seconds", podName, desc, podStartTimeout.Seconds())
+	return fmt.Errorf("gave up waiting for pod %s to be %s after %v", podName, desc, timeout)
 }
 
 // createNS should be used by every test, note that we append a common prefix to the provided test name.
@@ -119,7 +122,7 @@ func createTestingNS(baseName string, c *client.Client) (*api.Namespace, error)
 }
 
 func waitForPodRunningInNamespace(c *client.Client, podName string, namespace string) error {
-	return waitForPodCondition(c, namespace, podName, "running", func(pod *api.Pod) (bool, error) {
+	return waitForPodCondition(c, namespace, podName, "running", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
 		return (pod.Status.Phase == api.PodRunning), nil
 	})
 }
@@ -130,7 +133,7 @@ func waitForPodRunning(c *client.Client, podName string) error {
 
 // waitForPodNotPending returns an error if it took too long for the pod to go out of pending state.
 func waitForPodNotPending(c *client.Client, ns, podName string) error {
-	return waitForPodCondition(c, ns, podName, "!pending", func(pod *api.Pod) (bool, error) {
+	return waitForPodCondition(c, ns, podName, "!pending", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
 		if pod.Status.Phase != api.PodPending {
 			Logf("Saw pod %s in namespace %s out of pending state (found %q)", podName, ns, pod.Status.Phase)
 			return true, nil
@@ -141,7 +144,7 @@ func waitForPodNotPending(c *client.Client, ns, podName string) error {
 
 // waitForPodSuccessInNamespace returns nil if the pod reached state success, or an error if it reached failure or ran too long.
 func waitForPodSuccessInNamespace(c *client.Client, podName string, contName string, namespace string) error {
-	return waitForPodCondition(c, namespace, podName, "success or failure", func(pod *api.Pod) (bool, error) {
+	return waitForPodCondition(c, namespace, podName, "success or failure", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
 		// Cannot use pod.Status.Phase == api.PodSucceeded/api.PodFailed due to #2632
 		ci, ok := api.GetContainerStatus(pod.Status.ContainerStatuses, contName)
 		if !ok {
@@ -741,6 +744,37 @@ func BadEvents(events []*api.Event) int {
 	return badEvents
 }
 
+// NodeSSHHosts returns SSH-able host names for all nodes. It returns an error
+// if it can't find an external IP for every node, though it still returns all
+// hosts that it found in that case.
+func NodeSSHHosts(c *client.Client) ([]string, error) {
+	var hosts []string
+	nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything())
+	if err != nil {
+		return hosts, fmt.Errorf("error getting nodes: %v", err)
+	}
+	for _, n := range nodelist.Items {
+		for _, addr := range n.Status.Addresses {
+			// Use the first external IP address we find on the node, and
+			// use at most one per node.
+			// TODO(mbforbes): Use the "preferred" address for the node, once
+			// such a thing is defined (#2462).
+			if addr.Type == api.NodeExternalIP {
+				hosts = append(hosts, addr.Address+":22")
+				break
+			}
+		}
+	}
+
+	// Error if any node didn't have an external IP.
+	if len(hosts) != len(nodelist.Items) {
+		return hosts, fmt.Errorf(
+			"only found %d external IPs on nodes, but found %d nodes. Nodelist: %v",
+			len(hosts), len(nodelist.Items), nodelist)
+	}
+	return hosts, nil
+}
+
 // SSH synchronously SSHs to a node running on provider and runs cmd. If there
 // is no error performing the SSH, the stdout, stderr, and exit code are
 // returned.