diff --git a/hack/jenkins/e2e.sh b/hack/jenkins/e2e.sh
index c8917972f2a..225d9bb08bc 100755
--- a/hack/jenkins/e2e.sh
+++ b/hack/jenkins/e2e.sh
@@ -197,7 +197,6 @@ GCE_PARALLEL_SKIP_TESTS=(
     "Resource\susage\sof\ssystem\scontainers"
     "SchedulerPredicates"
     "resource\susage\stracking"
-    "NodeOutOfDisk"
     "${DISRUPTIVE_TESTS[@]}"
     )
 
diff --git a/test/e2e/es_cluster_logging.go b/test/e2e/es_cluster_logging.go
index 41a3b890bab..d2a6bca38c9 100644
--- a/test/e2e/es_cluster_logging.go
+++ b/test/e2e/es_cluster_logging.go
@@ -185,7 +185,7 @@ func ClusterLevelLoggingWithElasticsearch(f *Framework) {
 	// Previous tests may have cause failures of some nodes. Let's skip
 	// 'Not Ready' nodes, just in case (there is no need to fail the test).
 	filterNodes(nodes, func(node api.Node) bool {
-		return isNodeConditionSetAsExpected(&node, api.NodeReady, true)
+		return isNodeReadySetAsExpected(&node, true)
 	})
 	if len(nodes.Items) < 2 {
 		Failf("Less than two nodes were found Ready: %d", len(nodes.Items))
diff --git a/test/e2e/networking.go b/test/e2e/networking.go
index 00ef24588b5..84a4fcacd19 100644
--- a/test/e2e/networking.go
+++ b/test/e2e/networking.go
@@ -145,7 +145,7 @@ var _ = Describe("Networking", func() {
 		// previous tests may have cause failures of some nodes. Let's skip
 		// 'Not Ready' nodes, just in case (there is no need to fail the test).
 		filterNodes(nodes, func(node api.Node) bool {
-			return isNodeConditionSetAsExpected(&node, api.NodeReady, true)
+			return isNodeReadySetAsExpected(&node, true)
 		})
 
 		if len(nodes.Items) == 0 {
diff --git a/test/e2e/nodeoutofdisk.go b/test/e2e/nodeoutofdisk.go
deleted file mode 100644
index 2ff607a1a5e..00000000000
--- a/test/e2e/nodeoutofdisk.go
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
-Copyright 2015 The Kubernetes Authors All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package e2e
-
-import (
-	"encoding/json"
-	"fmt"
-	"time"
-
-	cadvisorapi "github.com/google/cadvisor/info/v1"
-	"k8s.io/kubernetes/pkg/api"
-	"k8s.io/kubernetes/pkg/api/resource"
-	client "k8s.io/kubernetes/pkg/client/unversioned"
-	"k8s.io/kubernetes/pkg/fields"
-	"k8s.io/kubernetes/pkg/labels"
-	"k8s.io/kubernetes/pkg/util/wait"
-
-	. "github.com/onsi/ginkgo"
-	. "github.com/onsi/gomega"
-)
-
-const (
-	mb = 1024 * 1024
-	gb = 1024 * mb
-
-	// TODO(madhusudancs): find a way to query kubelet's disk space manager to obtain this value. 256MB
-	// is the default that is set today. This test might break if the default value changes. This value
-	// can be configured by setting the "low-diskspace-threshold-mb" flag while starting a kubelet.
-	// However, kubelets are started as part of the cluster start up, once, before any e2e test is run,
-	// and remain unchanged until all the tests are run and the cluster is brought down. Changing the
-	// flag value affects all the e2e tests. So we are hard-coding this value for now.
-	lowDiskSpaceThreshold uint64 = 256 * mb
-
-	nodeOODTimeOut = 1 * time.Minute
-
-	numNodeOODPods = 3
-)
-
-// Plan:
-// 1. Fill disk space on all nodes except one. One node is left out so that we can schedule pods
-// on that node. Arbitrarily choose that node to be node with index 0.
-// 2. Get the CPU capacity on unfilled node.
-// 3. Divide the CPU capacity into one less than the number of pods we want to schedule. We want
-// to schedule 3 pods, so divide CPU capacity by 2.
-// 4. Request the divided capacity for each pod.
-// 5. Observe that 2 of the pods schedule onto the node whose disk is not full, and the remaining
-// pod stays pending and does not schedule onto the nodes whose disks are full nor the node
-// with the other two pods, since there is not enough free CPU capacity there.
-// 6. Recover disk space from one of the nodes whose disk space was previously filled. Arbritrarily
-// choose that node to be node with index 1.
-// 7. Observe that the pod in pending status schedules on that node.
-//
-var _ = Describe("NodeOutOfDisk", func() {
-	var c *client.Client
-	var unfilledNodeName, recoveredNodeName string
-	framework := Framework{BaseName: "node-outofdisk"}
-
-	BeforeEach(func() {
-		framework.beforeEach()
-		c = framework.Client
-
-		nodelist, err := listNodes(c, labels.Everything(), fields.Everything())
-		expectNoError(err, "Error retrieving nodes")
-		Expect(len(nodelist.Items)).To(BeNumerically(">", 1))
-
-		unfilledNodeName = nodelist.Items[0].Name
-		for _, node := range nodelist.Items[1:] {
-			fillDiskSpace(c, &node)
-		}
-	})
-
-	AfterEach(func() {
-		defer framework.afterEach()
-
-		nodelist, err := listNodes(c, labels.Everything(), fields.Everything())
-		expectNoError(err, "Error retrieving nodes")
-		Expect(len(nodelist.Items)).ToNot(BeZero())
-		for _, node := range nodelist.Items {
-			if unfilledNodeName == node.Name || recoveredNodeName == node.Name {
-				continue
-			}
-			recoverDiskSpace(c, &node)
-		}
-	})
-
-	It("runs out of disk space", func() {
-		unfilledNode, err := c.Nodes().Get(unfilledNodeName)
-		expectNoError(err)
-
-		By(fmt.Sprintf("Get CPU capacity on node %s", unfilledNode.Name))
-
-		milliCPU := unfilledNode.Status.Capacity.Cpu().MilliValue()
-		// Per pod CPU should be just enough to fit only (numNodeOODPods - 1) pods on the
-		// given node. We compute this value by dividing the available CPU capacity on the given
-		// node by (numNodeOODPods - 1) and subtracting ϵ from it.
-		podCPU := (milliCPU / (numNodeOODPods - 1)) - (milliCPU / 5)
-
-		ns := framework.Namespace.Name
-		podClient := c.Pods(ns)
-
-		By("Creating pods and waiting for all but one pods to be scheduled")
-
-		for i := 0; i < numNodeOODPods-1; i++ {
-			name := fmt.Sprintf("pod-node-outofdisk-%d", i)
-			createOutOfDiskPod(c, ns, name, podCPU)
-
-			expectNoError(framework.WaitForPodRunning(name))
-			pod, err := podClient.Get(name)
-			expectNoError(err)
-			Expect(pod.Spec.NodeName).To(Equal(unfilledNodeName))
-		}
-
-		pendingPodName := fmt.Sprintf("pod-node-outofdisk-%d", numNodeOODPods-1)
-		createOutOfDiskPod(c, ns, pendingPodName, podCPU)
-
-		By(fmt.Sprintf("Finding a failed scheduler event for pod %s", pendingPodName))
-		wait.Poll(2*time.Second, 5*time.Minute, func() (bool, error) {
-			schedEvents, err := c.Events(ns).List(
-				labels.Everything(),
-				fields.Set{
-					"involvedObject.kind":      "Pod",
-					"involvedObject.name":      pendingPodName,
-					"involvedObject.namespace": ns,
-					"source":                   "scheduler",
-					"reason":                   "FailedScheduling",
-				}.AsSelector())
-			expectNoError(err)
-
-			if len(schedEvents.Items) > 0 {
-				return true, nil
-			} else {
-				return false, nil
-			}
-		})
-
-		nodelist, err := listNodes(c, labels.Everything(), fields.Everything())
-		expectNoError(err, "Error retrieving nodes")
-		Expect(len(nodelist.Items)).To(BeNumerically(">", 1))
-
-		nodeToRecover := nodelist.Items[1]
-		Expect(nodeToRecover.Name).ToNot(Equal(unfilledNodeName))
-
-		By(fmt.Sprintf("Recovering disk space on node %s", nodeToRecover.Name))
-		recoverDiskSpace(c, &nodeToRecover)
-		recoveredNodeName = nodeToRecover.Name
-
-		By(fmt.Sprintf("Verifying that pod %s schedules on node %s", pendingPodName, recoveredNodeName))
-		expectNoError(framework.WaitForPodRunning(pendingPodName))
-		pendingPod, err := podClient.Get(pendingPodName)
-		expectNoError(err)
-		Expect(pendingPod.Spec.NodeName).To(Equal(recoveredNodeName))
-	})
-})
-
-// createOutOfDiskPod creates a pod in the given namespace with the requested amount of CPU.
-func createOutOfDiskPod(c *client.Client, ns, name string, milliCPU int64) {
-	podClient := c.Pods(ns)
-
-	pod := &api.Pod{
-		ObjectMeta: api.ObjectMeta{
-			Name: name,
-		},
-		Spec: api.PodSpec{
-			Containers: []api.Container{
-				{
-					Name:  "pause",
-					Image: "beta.gcr.io/google_containers/pause:2.0",
-					Resources: api.ResourceRequirements{
-						Requests: api.ResourceList{
-							// Request enough CPU to fit only two pods on a given node.
-							api.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI),
-						},
-					},
-				},
-			},
-		},
-	}
-
-	_, err := podClient.Create(pod)
-	expectNoError(err)
-}
-
-// availSize returns the available disk space on a given node by querying node stats which
-// is in turn obtained internally from cadvisor.
-func availSize(c *client.Client, node *api.Node) (uint64, error) {
-	statsResource := fmt.Sprintf("api/v1/proxy/nodes/%s/stats/", node.Name)
-	Logf("Querying stats for node %s using url %s", node.Name, statsResource)
-	res, err := c.Get().AbsPath(statsResource).Timeout(timeout).Do().Raw()
-	if err != nil {
-		return 0, fmt.Errorf("error querying cAdvisor API: %v", err)
-	}
-	ci := cadvisorapi.ContainerInfo{}
-	err = json.Unmarshal(res, &ci)
-	if err != nil {
-		return 0, fmt.Errorf("couldn't unmarshal container info: %v", err)
-	}
-	return ci.Stats[len(ci.Stats)-1].Filesystem[0].Available, nil
-}
-
-// fillDiskSpace fills the available disk space on a given node by creating a large file. The disk
-// space on the node is filled in such a way that the available space after filling the disk is just
-// below the lowDiskSpaceThreshold mark.
-func fillDiskSpace(c *client.Client, node *api.Node) {
-	avail, err := availSize(c, node)
-	expectNoError(err, "Node %s: couldn't obtain available disk size %v", node.Name, err)
-
-	fillSize := (avail - lowDiskSpaceThreshold + (100 * mb))
-
-	Logf("Node %s: disk space available %d bytes", node.Name, avail)
-	By(fmt.Sprintf("Node %s: creating a file of size %d bytes to fill the available disk space", node.Name, fillSize))
-
-	cmd := fmt.Sprintf("fallocate -l %d test.img", fillSize)
-	expectNoError(issueSSHCommand(cmd, testContext.Provider, node))
-
-	ood := waitForNodeToBe(c, node.Name, api.NodeOutOfDisk, true, nodeOODTimeOut)
-	Expect(ood).To(BeTrue(), "Node %s did not run out of disk within %v", node.Name, nodeOODTimeOut)
-
-	avail, err = availSize(c, node)
-	Logf("Node %s: disk space available %d bytes", node.Name, avail)
-	Expect(avail < lowDiskSpaceThreshold).To(BeTrue())
-}
-
-// recoverDiskSpace recovers disk space, filled by creating a large file, on a given node.
-func recoverDiskSpace(c *client.Client, node *api.Node) {
-	By(fmt.Sprintf("Recovering disk space on node %s", node.Name))
-	cmd := "rm -f test.img"
-	expectNoError(issueSSHCommand(cmd, testContext.Provider, node))
-
-	ood := waitForNodeToBe(c, node.Name, api.NodeOutOfDisk, false, nodeOODTimeOut)
-	Expect(ood).To(BeTrue(), "Node %s's out of disk condition status did not change to false within %v", node.Name, nodeOODTimeOut)
-}
diff --git a/test/e2e/reboot.go b/test/e2e/reboot.go
index cec051a0a06..ee44fe6992d 100644
--- a/test/e2e/reboot.go
+++ b/test/e2e/reboot.go
@@ -149,6 +149,25 @@ func testReboot(c *client.Client, rebootCmd string) {
 	}
 }
 
+func issueSSHCommand(node *api.Node, provider, cmd string) error {
+	Logf("Getting external IP address for %s", node.Name)
+	host := ""
+	for _, a := range node.Status.Addresses {
+		if a.Type == api.NodeExternalIP {
+			host = a.Address + ":22"
+			break
+		}
+	}
+	if host == "" {
+		return fmt.Errorf("couldn't find external IP address for node %s", node.Name)
+	}
+	Logf("Calling %s on %s", cmd, node.Name)
+	if _, _, code, err := SSH(cmd, host, provider); code != 0 || err != nil {
+		return fmt.Errorf("when running %s on %s, got %d and %v", cmd, node.Name, code, err)
+	}
+	return nil
+}
+
 // rebootNode takes node name on provider through the following steps using c:
 // - ensures the node is ready
 // - ensures all pods on the node are running and ready
@@ -203,7 +222,7 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string) bool {
 	}
 
 	// Reboot the node.
-	if err = issueSSHCommand(rebootCmd, provider, node); err != nil {
+	if err = issueSSHCommand(node, provider, rebootCmd); err != nil {
 		Logf("Error while issuing ssh command: %v", err)
 		return false
 	}
diff --git a/test/e2e/resize_nodes.go b/test/e2e/resize_nodes.go
index aa264203175..06cc68ec254 100644
--- a/test/e2e/resize_nodes.go
+++ b/test/e2e/resize_nodes.go
@@ -354,7 +354,7 @@ func performTemporaryNetworkFailure(c *client.Client, ns, rcName string, replica
 	}()
 
 	Logf("Waiting %v to ensure node %s is ready before beginning test...", resizeNodeReadyTimeout, node.Name)
-	if !waitForNodeToBe(c, node.Name, api.NodeReady, true, resizeNodeReadyTimeout) {
+	if !waitForNodeToBe(c, node.Name, true, resizeNodeReadyTimeout) {
 		Failf("Node %s did not become ready within %v", node.Name, resizeNodeReadyTimeout)
 	}
 
@@ -370,7 +370,7 @@ func performTemporaryNetworkFailure(c *client.Client, ns, rcName string, replica
 	}
 
 	Logf("Waiting %v for node %s to be not ready after simulated network failure", resizeNodeNotReadyTimeout, node.Name)
-	if !waitForNodeToBe(c, node.Name, api.NodeReady, false, resizeNodeNotReadyTimeout) {
+	if !waitForNodeToBe(c, node.Name, false, resizeNodeNotReadyTimeout) {
 		Failf("Node %s did not become not-ready within %v", node.Name, resizeNodeNotReadyTimeout)
 	}
 
diff --git a/test/e2e/util.go b/test/e2e/util.go
index 3f780c88543..49a7ccdaf94 100644
--- a/test/e2e/util.go
+++ b/test/e2e/util.go
@@ -1956,25 +1956,6 @@ func sshCore(cmd, host, provider string, verbose bool) (string, string, int, err
 	return stdout, stderr, code, err
 }
 
-func issueSSHCommand(cmd, provider string, node *api.Node) error {
-	Logf("Getting external IP address for %s", node.Name)
-	host := ""
-	for _, a := range node.Status.Addresses {
-		if a.Type == api.NodeExternalIP {
-			host = a.Address + ":22"
-			break
-		}
-	}
-	if host == "" {
-		return fmt.Errorf("couldn't find external IP address for node %s", node.Name)
-	}
-	Logf("Calling %s on %s", cmd, node.Name)
-	if _, _, code, err := SSH(cmd, host, provider); code != 0 || err != nil {
-		return fmt.Errorf("when running %s on %s, got %d and %v", cmd, node.Name, code, err)
-	}
-	return nil
-}
-
 // NewHostExecPodSpec returns the pod spec of hostexec pod
 func NewHostExecPodSpec(ns, name string) *api.Pod {
 	pod := &api.Pod{
@@ -2077,37 +2058,38 @@ func checkPodsRunningReady(c *client.Client, ns string, podNames []string, timeo
 
 // waitForNodeToBeReady returns whether node name is ready within timeout.
 func waitForNodeToBeReady(c *client.Client, name string, timeout time.Duration) bool {
-	return waitForNodeToBe(c, name, api.NodeReady, true, timeout)
+	return waitForNodeToBe(c, name, true, timeout)
 }
 
 // waitForNodeToBeNotReady returns whether node name is not ready (i.e. the
 // readiness condition is anything but ready, e.g false or unknown) within
 // timeout.
 func waitForNodeToBeNotReady(c *client.Client, name string, timeout time.Duration) bool {
-	return waitForNodeToBe(c, name, api.NodeReady, false, timeout)
+	return waitForNodeToBe(c, name, false, timeout)
 }
 
-func isNodeConditionSetAsExpected(node *api.Node, conditionType api.NodeConditionType, wantTrue bool) bool {
+func isNodeReadySetAsExpected(node *api.Node, wantReady bool) bool {
 	// Check the node readiness condition (logging all).
 	for i, cond := range node.Status.Conditions {
 		Logf("Node %s condition %d/%d: type: %v, status: %v, reason: %q, message: %q, last transition time: %v",
 			node.Name, i+1, len(node.Status.Conditions), cond.Type, cond.Status,
 			cond.Reason, cond.Message, cond.LastTransitionTime)
-		// Ensure that the condition type and the status matches as desired.
-		if cond.Type == conditionType && (cond.Status == api.ConditionTrue) == wantTrue {
-			Logf("Successfully found condition %s of node %s to be %t", conditionType, node.Name, wantTrue)
+		// Ensure that the condition type is readiness and the status
+		// matches as desired.
+		if cond.Type == api.NodeReady && (cond.Status == api.ConditionTrue) == wantReady {
+			Logf("Successfully found node %s readiness to be %t", node.Name, wantReady)
 			return true
 		}
 	}
 	return false
 }
 
-// waitForNodeToBe returns whether node "name's" condition state matches wantTrue
-// within timeout. If wantTrue is true, it will ensure the node condition status
-// is ConditionTrue; if it's false, it ensures the node condition is in any state
-// other than ConditionTrue (e.g. not true or unknown).
-func waitForNodeToBe(c *client.Client, name string, conditionType api.NodeConditionType, wantTrue bool, timeout time.Duration) bool {
-	Logf("Waiting up to %v for node %s condition %s to be %t", timeout, name, conditionType, wantTrue)
+// waitForNodeToBe returns whether node name's readiness state matches wantReady
+// within timeout. If wantReady is true, it will ensure the node is ready; if
+// it's false, it ensures the node is in any state other than ready (e.g. not
+// ready or unknown).
+func waitForNodeToBe(c *client.Client, name string, wantReady bool, timeout time.Duration) bool {
+	Logf("Waiting up to %v for node %s readiness to be %t", timeout, name, wantReady)
 	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
 		node, err := c.Nodes().Get(name)
 		if err != nil {
@@ -2115,11 +2097,11 @@ func waitForNodeToBe(c *client.Client, name string, conditionType api.NodeCondit
 			continue
 		}
 
-		if isNodeConditionSetAsExpected(node, conditionType, wantTrue) {
+		if isNodeReadySetAsExpected(node, wantReady) {
 			return true
 		}
 	}
-	Logf("Node %s didn't reach desired %s condition status (%t) within %v", name, conditionType, wantTrue, timeout)
+	Logf("Node %s didn't reach desired readiness (%t) within %v", name, wantReady, timeout)
 	return false
 }
 
@@ -2135,7 +2117,7 @@ func allNodesReady(c *client.Client, timeout time.Duration) error {
 			return false, err
 		}
 		for _, node := range nodes.Items {
-			if !isNodeConditionSetAsExpected(&node, api.NodeReady, true) {
+			if !isNodeReadySetAsExpected(&node, true) {
 				notReady = append(notReady, node)
 			}
 		}
@@ -2236,7 +2218,7 @@ func waitForClusterSize(c *client.Client, size int, timeout time.Duration) error
 
 		// Filter out not-ready nodes.
 		filterNodes(nodes, func(node api.Node) bool {
-			return isNodeConditionSetAsExpected(&node, api.NodeReady, true)
+			return isNodeReadySetAsExpected(&node, true)
 		})
 		numReady := len(nodes.Items)