/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
	"fmt"
	"sync"
	"time"

	"k8s.io/kubernetes/pkg/api"
	client "k8s.io/kubernetes/pkg/client/unversioned"
	"k8s.io/kubernetes/pkg/fields"
	"k8s.io/kubernetes/pkg/labels"
	"k8s.io/kubernetes/pkg/util/sets"
	"k8s.io/kubernetes/test/e2e/framework"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

const (
	// How long a node is allowed to go from "Ready" to "NotReady" after a
	// reboot is issued before the test is considered failed.
	rebootNodeNotReadyTimeout = 2 * time.Minute

	// How long a node is allowed to go from "NotReady" to "Ready" after a
	// reboot is issued and it is found to be "NotReady" before the test is
	// considered failed.
	rebootNodeReadyAgainTimeout = 5 * time.Minute

	// How long pods have to be "ready" after the reboot.
	rebootPodReadyAgainTimeout = 5 * time.Minute
)

var _ = framework.KubeDescribe("Reboot [Disruptive] [Feature:Reboot]", func() {
	var f *framework.Framework

	BeforeEach(func() {
		// These tests require SSH access to the nodes, so the provider check should match
		// the set of providers supported by framework.GetSigner(...) in util.go, which is
		// the limiting factor here.

		// Cluster must support node reboot.
		framework.SkipUnlessProviderIs(framework.ProvidersWithSSH...)
	})

	AfterEach(func() {
		if CurrentGinkgoTestDescription().Failed {
			// Most of the reboot tests just make sure that addon/system pods are running, so dump
			// events for the kube-system namespace on failures.
			namespaceName := api.NamespaceSystem
			By(fmt.Sprintf("Collecting events from namespace %q.", namespaceName))
			events, err := f.Client.Events(namespaceName).List(api.ListOptions{})
			Expect(err).NotTo(HaveOccurred())

			for _, e := range events.Items {
				framework.Logf("event for %v: %v %v: %v", e.InvolvedObject.Name, e.Source, e.Reason, e.Message)
			}
		}
		// In GKE, our current tunneling setup has the potential to hold on to a broken tunnel (from a
		// rebooted/deleted node) for up to 5 minutes before all tunnels are dropped and recreated.  Most tests
		// make use of some proxy feature to verify functionality. So, if a reboot test runs right before a test
		// that tries to get logs, for example, we may get unlucky and try to use a closed tunnel to a node that
		// was recently rebooted. There's no good way to framework.Poll for proxies being closed, so we sleep.
		//
		// TODO(cjcullen) reduce this sleep (#19314)
		if framework.ProviderIs("gke") {
			By("waiting 5 minutes for all dead tunnels to be dropped")
			time.Sleep(5 * time.Minute)
		}
	})

	f = framework.NewDefaultFramework("reboot")

	It("each node by ordering clean reboot and ensure they function upon restart", func() {
		// clean shutdown and restart
		// We sleep 10 seconds to give some time for the ssh command to cleanly finish before the node is rebooted.
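		// The remote command is wrapped in nohup, backgrounded with '&', and has its output
		// redirected to /dev/null so the SSH session returns immediately instead of being
		// killed mid-command when the node goes down.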
		testReboot(f.Client, "nohup sh -c 'sleep 10 && sudo reboot' >/dev/null 2>&1 &")
	})

	It("each node by ordering unclean reboot and ensure they function upon restart", func() {
		// unclean shutdown and restart
		// We sleep 10 seconds to give some time for the ssh command to cleanly finish before the node is shut down.
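		// Writing 'b' to /proc/sysrq-trigger reboots the machine immediately, without
		// syncing or unmounting filesystems, which is what makes this restart "unclean".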
		testReboot(f.Client, "nohup sh -c 'sleep 10 && echo b | sudo tee /proc/sysrq-trigger' >/dev/null 2>&1 &")
	})

	It("each node by triggering kernel panic and ensure they function upon restart", func() {
		// kernel panic
		// We sleep 10 seconds to give some time for the ssh command to cleanly finish before the kernel panic is triggered.
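		// Writing 'c' to /proc/sysrq-trigger forces an immediate kernel crash (panic); the
		// node is expected to reboot and come back up on its own.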
		testReboot(f.Client, "nohup sh -c 'sleep 10 && echo c | sudo tee /proc/sysrq-trigger' >/dev/null 2>&1 &")
	})

	It("each node by switching off the network interface and ensure they function upon switch on", func() {
		// switch the network interface off for a while to simulate a network outage
		// We sleep 10 seconds to give some time for the ssh command to cleanly finish before the network is taken down.
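		// With eth0 down the kubelet cannot reach the apiserver, so the node is reported as
		// NotReady; bringing the interface back up after 120 seconds lets it recover.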
		testReboot(f.Client, "nohup sh -c 'sleep 10 && sudo ifdown eth0 && sleep 120 && sudo ifup eth0' >/dev/null 2>&1 &")
	})

	It("each node by dropping all inbound packets for a while and ensure they function afterwards", func() {
		// tell the firewall to drop all inbound packets for a while
		// We sleep 10 seconds to give some time for the ssh command to cleanly finish before we start dropping inbound packets.
		// We still accept packets sent from localhost to prevent monit from restarting kubelet.
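		// The localhost ACCEPT rule is inserted at position 1, ahead of the blanket DROP at
		// position 2, so loopback traffic keeps flowing; after 120 seconds both rules are
		// deleted again (iptables -D matches them by rule specification).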
		testReboot(f.Client, "nohup sh -c 'sleep 10 && sudo iptables -I INPUT 1 -s 127.0.0.1 -j ACCEPT && sudo iptables -I INPUT 2 -j DROP && "+
			" sleep 120 && sudo iptables -D INPUT -j DROP && sudo iptables -D INPUT -s 127.0.0.1 -j ACCEPT' >/dev/null 2>&1 &")
	})

	It("each node by dropping all outbound packets for a while and ensure they function afterwards", func() {
		// tell the firewall to drop all outbound packets for a while
		// We sleep 10 seconds to give some time for the ssh command to cleanly finish before we start dropping outbound packets.
		// We still accept packets sent to localhost to prevent monit from restarting kubelet.
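		// Same pattern as the inbound case: keep loopback traffic allowed while all other
		// outbound packets are dropped for 120 seconds, then delete both rules.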
		testReboot(f.Client, "nohup sh -c 'sleep 10 && sudo iptables -I OUTPUT 1 -s 127.0.0.1 -j ACCEPT && sudo iptables -I OUTPUT 2 -j DROP && "+
			" sleep 120 && sudo iptables -D OUTPUT -j DROP && sudo iptables -D OUTPUT -s 127.0.0.1 -j ACCEPT' >/dev/null 2>&1 &")
	})
})

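// testReboot reboots every ready, schedulable node in the cluster in parallel using
// rebootCmd and fails the test if any node does not pass the reboot check.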
func testReboot(c *client.Client, rebootCmd string) {
	// Get all nodes, and kick off the test on each.
	nodelist := framework.GetReadySchedulableNodesOrDie(c)
	result := make([]bool, len(nodelist.Items))
	wg := sync.WaitGroup{}
	wg.Add(len(nodelist.Items))

	for ix := range nodelist.Items {
		go func(ix int) {
			defer wg.Done()
			n := nodelist.Items[ix]
			result[ix] = rebootNode(c, framework.TestContext.Provider, n.ObjectMeta.Name, rebootCmd)
		}(ix)
	}

	// Wait for all to finish and check the final result. Each goroutine writes only its
	// own slot of result, so reading the slice after wg.Wait() is race-free.
	wg.Wait()

	failed := false
	for ix := range nodelist.Items {
		n := nodelist.Items[ix]
		if !result[ix] {
			failed = true
			framework.Logf("Node %s failed reboot test.", n.ObjectMeta.Name)
		}
	}
	if failed {
		framework.Failf("Test failed; at least one node failed to reboot in the time given.")
	}
}

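// printStatusAndLogsForNotReadyPods logs the status and container logs of every pod in
// podNames (in namespace ns) that is not currently running and ready.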
func printStatusAndLogsForNotReadyPods(c *client.Client, ns string, podNames []string, pods []*api.Pod) {
	printFn := func(id, log string, err error, previous bool) {
		prefix := "Retrieving log for container"
		if previous {
			prefix = "Retrieving log for the last terminated container"
		}
		if err != nil {
			framework.Logf("%s %s, err: %v:\n%s\n", prefix, id, err, log)
		} else {
			framework.Logf("%s %s:\n%s\n", prefix, id, log)
		}
	}
	podNameSet := sets.NewString(podNames...)
	for _, p := range pods {
		if p.Namespace != ns {
			continue
		}
		if !podNameSet.Has(p.Name) {
			continue
		}
		if ok, _ := framework.PodRunningReady(p); ok {
			continue
		}
		framework.Logf("Status for not ready pod %s/%s: %+v", p.Namespace, p.Name, p.Status)
		// The pod is not running and ready, so print the log of each of its containers.
		for _, container := range p.Status.ContainerStatuses {
			cIdentifier := fmt.Sprintf("%s/%s/%s", p.Namespace, p.Name, container.Name)
			log, err := framework.GetPodLogs(c, p.Namespace, p.Name, container.Name)
			printFn(cIdentifier, log, err, false)
			// If the container has restarted, print the retrieved log again, labeled as the last
			// terminated container's (the previous container's log is not fetched separately here).
			if container.RestartCount > 0 {
				printFn(cIdentifier, log, err, true)
			}
		}
	}
}

// rebootNode takes the node with the given name on the given provider through the
// following steps using c:
//  - ensures the node is ready
//  - ensures all pods on the node are running and ready
//  - reboots the node (by executing rebootCmd over ssh)
//  - ensures the node reaches some non-ready state
//  - ensures the node becomes ready again
//  - ensures all pods on the node become running and ready again
//
// It returns true only if all of the steps pass; at the first failed step, it
// returns false and does not run the rest.
func rebootNode(c *client.Client, provider, name, rebootCmd string) bool {
	// Setup
	ns := api.NamespaceSystem
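	// The pod store caches the kube-system pods bound to this node (selected via the pod
	// host field), so the same set can be listed before and after the reboot.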
	ps := framework.NewPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector(api.PodHostField, name))
	defer ps.Stop()

	// Get the node initially.
	framework.Logf("Getting %s", name)
	node, err := c.Nodes().Get(name)
	if err != nil {
		framework.Logf("Couldn't get node %s", name)
		return false
	}

	// Node sanity check: ensure it is "ready".
	if !framework.WaitForNodeToBeReady(c, name, framework.NodeReadyInitialTimeout) {
		return false
	}

	// Get all the pods on the node that don't have a liveness probe set.
	// A liveness probe may cause a pod to be restarted during the node reboot, in which case it may not be running.
	pods := ps.List()
	podNames := []string{}
	for _, p := range pods {
		probe := false
		for _, c := range p.Spec.Containers {
			if c.LivenessProbe != nil {
				probe = true
				break
			}
		}
		if !probe {
			podNames = append(podNames, p.ObjectMeta.Name)
		}
	}
	framework.Logf("Node %s has %d assigned pods with no liveness probes: %v", name, len(podNames), podNames)

	// For each pod, we do a sanity check to ensure it's running / healthy
	// or succeeded now, as that's what we'll be checking later.
	if !framework.CheckPodsRunningReadyOrSucceeded(c, ns, podNames, framework.PodReadyBeforeTimeout) {
		printStatusAndLogsForNotReadyPods(c, ns, podNames, pods)
		return false
	}

	// Reboot the node.
	if err = framework.IssueSSHCommand(rebootCmd, provider, node); err != nil {
		framework.Logf("Error while issuing ssh command: %v", err)
		return false
	}

	// Wait for some kind of "not ready" status.
	if !framework.WaitForNodeToBeNotReady(c, name, rebootNodeNotReadyTimeout) {
		return false
	}

	// Wait for some kind of "ready" status.
	if !framework.WaitForNodeToBeReady(c, name, rebootNodeReadyAgainTimeout) {
		return false
	}

	// Ensure all of the pods that we found on this node before the reboot are
	// running / healthy, or succeeded.
	if !framework.CheckPodsRunningReadyOrSucceeded(c, ns, podNames, rebootPodReadyAgainTimeout) {
		newPods := ps.List()
		printStatusAndLogsForNotReadyPods(c, ns, podNames, newPods)
		return false
	}

	framework.Logf("Reboot successful on node %s", name)
	return true
}