Mirror of https://github.com/k3s-io/kubernetes.git
Merge pull request #16398 from madhusudancs/out-of-disk-e2e

Auto commit by PR queue bot

Commit 5b2e339b24

@@ -197,6 +197,7 @@ GCE_PARALLEL_SKIP_TESTS=(
     "Resource\susage\sof\ssystem\scontainers"
     "SchedulerPredicates"
     "resource\susage\stracking"
+    "NodeOutOfDisk"
     "${DISRUPTIVE_TESTS[@]}"
     )

@@ -185,7 +185,7 @@ func ClusterLevelLoggingWithElasticsearch(f *Framework) {
 	// Previous tests may have cause failures of some nodes. Let's skip
 	// 'Not Ready' nodes, just in case (there is no need to fail the test).
 	filterNodes(nodes, func(node api.Node) bool {
-		return isNodeReadySetAsExpected(&node, true)
+		return isNodeConditionSetAsExpected(&node, api.NodeReady, true)
 	})
 	if len(nodes.Items) < 2 {
 		Failf("Less than two nodes were found Ready: %d", len(nodes.Items))

@@ -145,7 +145,7 @@ var _ = Describe("Networking", func() {
 	// previous tests may have cause failures of some nodes. Let's skip
 	// 'Not Ready' nodes, just in case (there is no need to fail the test).
 	filterNodes(nodes, func(node api.Node) bool {
-		return isNodeReadySetAsExpected(&node, true)
+		return isNodeConditionSetAsExpected(&node, api.NodeReady, true)
 	})

 	if len(nodes.Items) == 0 {

test/e2e/nodeoutofdisk.go (new file, 245 lines)
@@ -0,0 +1,245 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
	"encoding/json"
	"fmt"
	"time"

	cadvisorapi "github.com/google/cadvisor/info/v1"
	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/api/resource"
	client "k8s.io/kubernetes/pkg/client/unversioned"
	"k8s.io/kubernetes/pkg/fields"
	"k8s.io/kubernetes/pkg/labels"
	"k8s.io/kubernetes/pkg/util/wait"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

const (
	mb = 1024 * 1024
	gb = 1024 * mb

	// TODO(madhusudancs): find a way to query kubelet's disk space manager to obtain this value. 256MB
	// is the default that is set today. This value can be configured by setting the
	// "low-diskspace-threshold-mb" flag while starting a kubelet. However, kubelets are started as
	// part of the cluster start up, once, before any e2e test is run, and remain unchanged until
	// all the tests are run and the cluster is brought down. Changing the flag value affects all
	// the e2e tests. So we are hard-coding this value for now.
	lowDiskSpaceThreshold uint64 = 256 * mb

	nodeOODTimeOut = 1 * time.Minute

	numNodeOODPods = 3
)
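
// Editorial aside (an assumption, not part of the committed file): the 256*mb figure mirrors
// the kubelet's default for the "low-diskspace-threshold-mb" flag named in the TODO above; a
// cluster brought up with a different threshold would need this constant adjusted by hand,
// since the e2e suite never restarts kubelets mid-run.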

// Plan:
// 1. Fill disk space on all nodes except one. One node is left out so that we can schedule pods
//    on that node. Arbitrarily choose that node to be node with index 0.
// 2. Get the CPU capacity on unfilled node.
// 3. Divide the CPU capacity into one less than the number of pods we want to schedule. We want
//    to schedule 3 pods, so divide CPU capacity by 2.
// 4. Request the divided capacity for each pod.
// 5. Observe that 2 of the pods schedule onto the node whose disk is not full, and the remaining
//    pod stays pending and does not schedule onto the nodes whose disks are full nor the node
//    with the other two pods, since there is not enough free CPU capacity there.
// 6. Recover disk space from one of the nodes whose disk space was previously filled. Arbitrarily
//    choose that node to be node with index 1.
// 7. Observe that the pod in pending status schedules on that node.
//
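// A worked instance of steps 3-5 (illustrative numbers, not from the commit): on a node with
// 2000 millicores of capacity, podCPU below comes out to 2000/2 - 2000/5 = 600m; the first two
// pods then consume 1200m on the unfilled node and, together with the system pods already
// running there, leave less than 600m free, so the third pod stays pending until disk space is
// recovered on another node.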

var _ = Describe("NodeOutOfDisk", func() {
	var c *client.Client
	var unfilledNodeName, recoveredNodeName string
	framework := Framework{BaseName: "node-outofdisk"}

	BeforeEach(func() {
		framework.beforeEach()
		c = framework.Client

		nodelist, err := listNodes(c, labels.Everything(), fields.Everything())
		expectNoError(err, "Error retrieving nodes")
		Expect(len(nodelist.Items)).To(BeNumerically(">", 1))

		unfilledNodeName = nodelist.Items[0].Name
		for _, node := range nodelist.Items[1:] {
			fillDiskSpace(c, &node)
		}
	})

	AfterEach(func() {
		defer framework.afterEach()

		nodelist, err := listNodes(c, labels.Everything(), fields.Everything())
		expectNoError(err, "Error retrieving nodes")
		Expect(len(nodelist.Items)).ToNot(BeZero())
		for _, node := range nodelist.Items {
			if unfilledNodeName == node.Name || recoveredNodeName == node.Name {
				continue
			}
			recoverDiskSpace(c, &node)
		}
	})

	It("runs out of disk space", func() {
		unfilledNode, err := c.Nodes().Get(unfilledNodeName)
		expectNoError(err)

		By(fmt.Sprintf("Get CPU capacity on node %s", unfilledNode.Name))

		milliCPU := unfilledNode.Status.Capacity.Cpu().MilliValue()
		// Per pod CPU should be just enough to fit only (numNodeOODPods - 1) pods on the
		// given node. We compute this value by dividing the available CPU capacity on the given
		// node by (numNodeOODPods - 1) and subtracting ϵ from it.
		podCPU := (milliCPU / (numNodeOODPods - 1)) - (milliCPU / 5)

		ns := framework.Namespace.Name
		podClient := c.Pods(ns)

		By("Creating pods and waiting for all but one pods to be scheduled")

		for i := 0; i < numNodeOODPods-1; i++ {
			name := fmt.Sprintf("pod-node-outofdisk-%d", i)
			createOutOfDiskPod(c, ns, name, podCPU)

			expectNoError(framework.WaitForPodRunning(name))
			pod, err := podClient.Get(name)
			expectNoError(err)
			Expect(pod.Spec.NodeName).To(Equal(unfilledNodeName))
		}

		pendingPodName := fmt.Sprintf("pod-node-outofdisk-%d", numNodeOODPods-1)
		createOutOfDiskPod(c, ns, pendingPodName, podCPU)

		By(fmt.Sprintf("Finding a failed scheduler event for pod %s", pendingPodName))
		wait.Poll(2*time.Second, 5*time.Minute, func() (bool, error) {
			schedEvents, err := c.Events(ns).List(
				labels.Everything(),
				fields.Set{
					"involvedObject.kind":      "Pod",
					"involvedObject.name":      pendingPodName,
					"involvedObject.namespace": ns,
					"source":                   "scheduler",
					"reason":                   "FailedScheduling",
				}.AsSelector())
			expectNoError(err)

			if len(schedEvents.Items) > 0 {
				return true, nil
			} else {
				return false, nil
			}
		})

		nodelist, err := listNodes(c, labels.Everything(), fields.Everything())
		expectNoError(err, "Error retrieving nodes")
		Expect(len(nodelist.Items)).To(BeNumerically(">", 1))

		nodeToRecover := nodelist.Items[1]
		Expect(nodeToRecover.Name).ToNot(Equal(unfilledNodeName))

		By(fmt.Sprintf("Recovering disk space on node %s", nodeToRecover.Name))
		recoverDiskSpace(c, &nodeToRecover)
		recoveredNodeName = nodeToRecover.Name

		By(fmt.Sprintf("Verifying that pod %s schedules on node %s", pendingPodName, recoveredNodeName))
		expectNoError(framework.WaitForPodRunning(pendingPodName))
		pendingPod, err := podClient.Get(pendingPodName)
		expectNoError(err)
		Expect(pendingPod.Spec.NodeName).To(Equal(recoveredNodeName))
	})
})

// createOutOfDiskPod creates a pod in the given namespace with the requested amount of CPU.
func createOutOfDiskPod(c *client.Client, ns, name string, milliCPU int64) {
	podClient := c.Pods(ns)

	pod := &api.Pod{
		ObjectMeta: api.ObjectMeta{
			Name: name,
		},
		Spec: api.PodSpec{
			Containers: []api.Container{
				{
					Name:  "pause",
					Image: "beta.gcr.io/google_containers/pause:2.0",
					Resources: api.ResourceRequirements{
						Requests: api.ResourceList{
							// Request enough CPU to fit only two pods on a given node.
							api.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI),
						},
					},
				},
			},
		},
	}

	_, err := podClient.Create(pod)
	expectNoError(err)
}
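
// Editorial note (not from the commit): the only resource these pause pods request is CPU, via
// resource.NewMilliQuantity(milliCPU, resource.DecimalSI); with the illustrative 600m figure
// from the plan above that quantity renders as "600m", so scheduling on the unfilled node is
// constrained purely by CPU plus the nodes' out-of-disk condition.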

// availSize returns the available disk space on a given node by querying node stats which
// is in turn obtained internally from cadvisor.
func availSize(c *client.Client, node *api.Node) (uint64, error) {
	statsResource := fmt.Sprintf("api/v1/proxy/nodes/%s/stats/", node.Name)
	Logf("Querying stats for node %s using url %s", node.Name, statsResource)
	res, err := c.Get().AbsPath(statsResource).Timeout(timeout).Do().Raw()
	if err != nil {
		return 0, fmt.Errorf("error querying cAdvisor API: %v", err)
	}
	ci := cadvisorapi.ContainerInfo{}
	err = json.Unmarshal(res, &ci)
	if err != nil {
		return 0, fmt.Errorf("couldn't unmarshal container info: %v", err)
	}
	return ci.Stats[len(ci.Stats)-1].Filesystem[0].Available, nil
}
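
// Reading the request above (editorial note, hypothetical node name): for a node called
// "e2e-test-node-0" the GET hits the apiserver path "api/v1/proxy/nodes/e2e-test-node-0/stats/",
// which proxies to that node's kubelet stats endpoint; the body decodes into a cadvisor
// ContainerInfo, and the function reports Filesystem[0].Available from the last (most recent)
// entry in ci.Stats.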

// fillDiskSpace fills the available disk space on a given node by creating a large file. The disk
// space on the node is filled in such a way that the available space after filling the disk is just
// below the lowDiskSpaceThreshold mark.
func fillDiskSpace(c *client.Client, node *api.Node) {
	avail, err := availSize(c, node)
	expectNoError(err, "Node %s: couldn't obtain available disk size %v", node.Name, err)

	fillSize := (avail - lowDiskSpaceThreshold + (100 * mb))

	Logf("Node %s: disk space available %d bytes", node.Name, avail)
	By(fmt.Sprintf("Node %s: creating a file of size %d bytes to fill the available disk space", node.Name, fillSize))

	cmd := fmt.Sprintf("fallocate -l %d test.img", fillSize)
	expectNoError(issueSSHCommand(cmd, testContext.Provider, node))

	ood := waitForNodeToBe(c, node.Name, api.NodeOutOfDisk, true, nodeOODTimeOut)
	Expect(ood).To(BeTrue(), "Node %s did not run out of disk within %v", node.Name, nodeOODTimeOut)

	avail, err = availSize(c, node)
	Logf("Node %s: disk space available %d bytes", node.Name, avail)
	Expect(avail < lowDiskSpaceThreshold).To(BeTrue())
}
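
// A quick numeric check of the fill computation (illustrative figures only): if availSize
// reports 10*gb free, then fillSize = 10*gb - 256*mb + 100*mb, so after the fallocate the node
// is left with roughly 156MB, below lowDiskSpaceThreshold; that is the state the
// waitForNodeToBe call above waits for before the final assertion.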

// recoverDiskSpace recovers disk space, filled by creating a large file, on a given node.
func recoverDiskSpace(c *client.Client, node *api.Node) {
	By(fmt.Sprintf("Recovering disk space on node %s", node.Name))
	cmd := "rm -f test.img"
	expectNoError(issueSSHCommand(cmd, testContext.Provider, node))

	ood := waitForNodeToBe(c, node.Name, api.NodeOutOfDisk, false, nodeOODTimeOut)
	Expect(ood).To(BeTrue(), "Node %s's out of disk condition status did not change to false within %v", node.Name, nodeOODTimeOut)
}

@@ -149,25 +149,6 @@ func testReboot(c *client.Client, rebootCmd string) {
 	}
 }

-func issueSSHCommand(node *api.Node, provider, cmd string) error {
-	Logf("Getting external IP address for %s", node.Name)
-	host := ""
-	for _, a := range node.Status.Addresses {
-		if a.Type == api.NodeExternalIP {
-			host = a.Address + ":22"
-			break
-		}
-	}
-	if host == "" {
-		return fmt.Errorf("couldn't find external IP address for node %s", node.Name)
-	}
-	Logf("Calling %s on %s", cmd, node.Name)
-	if _, _, code, err := SSH(cmd, host, provider); code != 0 || err != nil {
-		return fmt.Errorf("when running %s on %s, got %d and %v", cmd, node.Name, code, err)
-	}
-	return nil
-}
-
 // rebootNode takes node name on provider through the following steps using c:
 // - ensures the node is ready
 // - ensures all pods on the node are running and ready

@@ -222,7 +203,7 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string) bool {
 	}

 	// Reboot the node.
-	if err = issueSSHCommand(node, provider, rebootCmd); err != nil {
+	if err = issueSSHCommand(rebootCmd, provider, node); err != nil {
 		Logf("Error while issuing ssh command: %v", err)
 		return false
 	}

@@ -354,7 +354,7 @@ func performTemporaryNetworkFailure(c *client.Client, ns, rcName string, replica
 	}()

 	Logf("Waiting %v to ensure node %s is ready before beginning test...", resizeNodeReadyTimeout, node.Name)
-	if !waitForNodeToBe(c, node.Name, true, resizeNodeReadyTimeout) {
+	if !waitForNodeToBe(c, node.Name, api.NodeReady, true, resizeNodeReadyTimeout) {
 		Failf("Node %s did not become ready within %v", node.Name, resizeNodeReadyTimeout)
 	}

@@ -370,7 +370,7 @@ func performTemporaryNetworkFailure(c *client.Client, ns, rcName string, replica
 	}

 	Logf("Waiting %v for node %s to be not ready after simulated network failure", resizeNodeNotReadyTimeout, node.Name)
-	if !waitForNodeToBe(c, node.Name, false, resizeNodeNotReadyTimeout) {
+	if !waitForNodeToBe(c, node.Name, api.NodeReady, false, resizeNodeNotReadyTimeout) {
 		Failf("Node %s did not become not-ready within %v", node.Name, resizeNodeNotReadyTimeout)
 	}

@@ -1956,6 +1956,25 @@ func sshCore(cmd, host, provider string, verbose bool) (string, string, int, err
 	return stdout, stderr, code, err
 }

+func issueSSHCommand(cmd, provider string, node *api.Node) error {
+	Logf("Getting external IP address for %s", node.Name)
+	host := ""
+	for _, a := range node.Status.Addresses {
+		if a.Type == api.NodeExternalIP {
+			host = a.Address + ":22"
+			break
+		}
+	}
+	if host == "" {
+		return fmt.Errorf("couldn't find external IP address for node %s", node.Name)
+	}
+	Logf("Calling %s on %s", cmd, node.Name)
+	if _, _, code, err := SSH(cmd, host, provider); code != 0 || err != nil {
+		return fmt.Errorf("when running %s on %s, got %d and %v", cmd, node.Name, code, err)
+	}
+	return nil
+}
+
 // NewHostExecPodSpec returns the pod spec of hostexec pod
 func NewHostExecPodSpec(ns, name string) *api.Pod {
 	pod := &api.Pod{

@@ -2058,38 +2077,37 @@ func checkPodsRunningReady(c *client.Client, ns string, podNames []string, timeo

 // waitForNodeToBeReady returns whether node name is ready within timeout.
 func waitForNodeToBeReady(c *client.Client, name string, timeout time.Duration) bool {
-	return waitForNodeToBe(c, name, true, timeout)
+	return waitForNodeToBe(c, name, api.NodeReady, true, timeout)
 }

 // waitForNodeToBeNotReady returns whether node name is not ready (i.e. the
 // readiness condition is anything but ready, e.g false or unknown) within
 // timeout.
 func waitForNodeToBeNotReady(c *client.Client, name string, timeout time.Duration) bool {
-	return waitForNodeToBe(c, name, false, timeout)
+	return waitForNodeToBe(c, name, api.NodeReady, false, timeout)
 }

-func isNodeReadySetAsExpected(node *api.Node, wantReady bool) bool {
+func isNodeConditionSetAsExpected(node *api.Node, conditionType api.NodeConditionType, wantTrue bool) bool {
 	// Check the node readiness condition (logging all).
 	for i, cond := range node.Status.Conditions {
 		Logf("Node %s condition %d/%d: type: %v, status: %v, reason: %q, message: %q, last transition time: %v",
 			node.Name, i+1, len(node.Status.Conditions), cond.Type, cond.Status,
 			cond.Reason, cond.Message, cond.LastTransitionTime)
-		// Ensure that the condition type is readiness and the status
-		// matches as desired.
-		if cond.Type == api.NodeReady && (cond.Status == api.ConditionTrue) == wantReady {
-			Logf("Successfully found node %s readiness to be %t", node.Name, wantReady)
+		// Ensure that the condition type and the status matches as desired.
+		if cond.Type == conditionType && (cond.Status == api.ConditionTrue) == wantTrue {
+			Logf("Successfully found condition %s of node %s to be %t", conditionType, node.Name, wantTrue)
 			return true
 		}
 	}
 	return false
 }

-// waitForNodeToBe returns whether node name's readiness state matches wantReady
-// within timeout. If wantReady is true, it will ensure the node is ready; if
-// it's false, it ensures the node is in any state other than ready (e.g. not
-// ready or unknown).
-func waitForNodeToBe(c *client.Client, name string, wantReady bool, timeout time.Duration) bool {
-	Logf("Waiting up to %v for node %s readiness to be %t", timeout, name, wantReady)
+// waitForNodeToBe returns whether node "name's" condition state matches wantTrue
+// within timeout. If wantTrue is true, it will ensure the node condition status
+// is ConditionTrue; if it's false, it ensures the node condition is in any state
+// other than ConditionTrue (e.g. not true or unknown).
+func waitForNodeToBe(c *client.Client, name string, conditionType api.NodeConditionType, wantTrue bool, timeout time.Duration) bool {
+	Logf("Waiting up to %v for node %s condition %s to be %t", timeout, name, conditionType, wantTrue)
 	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
 		node, err := c.Nodes().Get(name)
 		if err != nil {

@@ -2097,11 +2115,11 @@ func waitForNodeToBe(c *client.Client, name string, wantReady bool, timeout time
 			continue
 		}

-		if isNodeReadySetAsExpected(node, wantReady) {
+		if isNodeConditionSetAsExpected(node, conditionType, wantTrue) {
 			return true
 		}
 	}
-	Logf("Node %s didn't reach desired readiness (%t) within %v", name, wantReady, timeout)
+	Logf("Node %s didn't reach desired %s condition status (%t) within %v", name, conditionType, wantTrue, timeout)
 	return false
 }

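(Editorial aside, not part of the diff: after this refactor a call site moves from the old
two-state form to one that names the condition explicitly, for example from

	waitForNodeToBe(c, node.Name, true, resizeNodeReadyTimeout)

to

	waitForNodeToBe(c, node.Name, api.NodeReady, true, resizeNodeReadyTimeout)

and the same helper can then watch api.NodeOutOfDisk, as the new nodeoutofdisk.go test does.)
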
@@ -2117,7 +2135,7 @@ func allNodesReady(c *client.Client, timeout time.Duration) error {
 			return false, err
 		}
 		for _, node := range nodes.Items {
-			if !isNodeReadySetAsExpected(&node, true) {
+			if !isNodeConditionSetAsExpected(&node, api.NodeReady, true) {
 				notReady = append(notReady, node)
 			}
 		}

@@ -2218,7 +2236,7 @@ func waitForClusterSize(c *client.Client, size int, timeout time.Duration) error

 		// Filter out not-ready nodes.
 		filterNodes(nodes, func(node api.Node) bool {
-			return isNodeReadySetAsExpected(&node, true)
+			return isNodeConditionSetAsExpected(&node, api.NodeReady, true)
 		})
 		numReady := len(nodes.Items)
