diff --git a/hack/.golint_failures b/hack/.golint_failures
index 6ad0f0e9ca0..62e1024c5c4 100644
--- a/hack/.golint_failures
+++ b/hack/.golint_failures
@@ -815,6 +815,7 @@ test/e2e/lifecycle
 test/e2e/lifecycle/bootstrap
 test/e2e/metrics
 test/e2e/network
+test/e2e/node
 test/e2e/scalability
 test/e2e/scheduling
 test/e2e/storage
diff --git a/test/e2e/BUILD b/test/e2e/BUILD
index e590b88f415..7db8785ba43 100644
--- a/test/e2e/BUILD
+++ b/test/e2e/BUILD
@@ -49,8 +49,6 @@ go_test(
 go_library(
     name = "go_default_library",
     srcs = [
-        "apparmor.go",
-        "audit.go",
         "certificates.go",
         "dashboard.go",
         "e2e.go",
@@ -61,8 +59,6 @@ go_library(
         "gke_local_ssd.go",
         "gke_node_pools.go",
         "ingress.go",
-        "kubelet.go",
-        "kubelet_perf.go",
         "limit_range.go",
         "network_partition.go",
         "no-snat.go",
@@ -71,7 +67,6 @@ go_library(
         "pods.go",
         "pre_stop.go",
         "resource_quota.go",
-        "security_context.go",
         "service_accounts.go",
         "service_latency.go",
         "serviceloadbalancers.go",
@@ -87,7 +82,6 @@ go_library(
         "//pkg/cloudprovider/providers/gce:go_default_library",
         "//pkg/controller/node:go_default_library",
         "//pkg/kubelet/apis:go_default_library",
-        "//pkg/kubelet/apis/stats/v1alpha1:go_default_library",
         "//pkg/quota/evaluator/core:go_default_library",
         "//pkg/util/logs:go_default_library",
         "//pkg/util/version:go_default_library",
@@ -169,6 +163,7 @@ filegroup(
         "//test/e2e/manifest:all-srcs",
         "//test/e2e/metrics:all-srcs",
         "//test/e2e/network:all-srcs",
+        "//test/e2e/node:all-srcs",
         "//test/e2e/perftype:all-srcs",
         "//test/e2e/scalability:all-srcs",
         "//test/e2e/scheduling:all-srcs",
diff --git a/test/e2e/node/BUILD b/test/e2e/node/BUILD
new file mode 100644
index 00000000000..33c41869c89
--- /dev/null
+++ b/test/e2e/node/BUILD
@@ -0,0 +1,53 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])
+
+load(
+    "@io_bazel_rules_go//go:def.bzl",
+    "go_library",
+)
+
+go_library(
+    name = "go_default_library",
+    srcs = [
+        "apparmor.go",
+        "audit.go",
+        "kubelet.go",
+        "kubelet_perf.go",
+        "nodeoutofdisk.go",
+        "security_context.go",
+        "sig.go",
+    ],
+    tags = ["automanaged"],
+    deps = [
+        "//pkg/api/testapi:go_default_library",
+        "//pkg/kubelet/apis/stats/v1alpha1:go_default_library",
+        "//test/e2e/common:go_default_library",
+        "//test/e2e/framework:go_default_library",
+        "//test/utils:go_default_library",
+        "//vendor/github.com/google/cadvisor/info/v1:go_default_library",
+        "//vendor/github.com/onsi/ginkgo:go_default_library",
+        "//vendor/github.com/onsi/gomega:go_default_library",
+        "//vendor/k8s.io/api/core/v1:go_default_library",
+        "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
+        "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
+        "//vendor/k8s.io/apimachinery/pkg/fields:go_default_library",
+        "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
+        "//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
+        "//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
+        "//vendor/k8s.io/client-go/kubernetes:go_default_library",
+    ],
+)
+
+filegroup(
+    name = "package-srcs",
+    srcs = glob(["**"]),
+    tags = ["automanaged"],
+    visibility = ["//visibility:private"],
+)
+
+filegroup(
+    name = "all-srcs",
+    srcs = [":package-srcs"],
+    tags = ["automanaged"],
+)
diff --git a/test/e2e/node/OWNERS b/test/e2e/node/OWNERS
new file mode 100644
index 00000000000..82e510eaaab
--- /dev/null
+++ b/test/e2e/node/OWNERS
@@ -0,0 +1,9 @@
+approvers:
+- Random-Liu
+- dchen1107
+- derekwaynecarr
+- tallclair
+- vishh
+- yujuhong
+reviewers:
+- sig-node-reviewers
diff --git a/test/e2e/apparmor.go b/test/e2e/node/apparmor.go
similarity index 94%
rename from test/e2e/apparmor.go
rename to test/e2e/node/apparmor.go
index 5bff1f974f9..1876f047334 100644
--- a/test/e2e/apparmor.go
+++ b/test/e2e/node/apparmor.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package e2e
+package node
 
 import (
     "k8s.io/kubernetes/test/e2e/common"
@@ -23,7 +23,7 @@ import (
     . "github.com/onsi/ginkgo"
 )
 
-var _ = framework.KubeDescribe("AppArmor", func() {
+var _ = SIGDescribe("AppArmor", func() {
     f := framework.NewDefaultFramework("apparmor")
 
     Context("load AppArmor profiles", func() {
diff --git a/test/e2e/audit.go b/test/e2e/node/audit.go
similarity index 98%
rename from test/e2e/audit.go
rename to test/e2e/node/audit.go
index 1d5ba5064fe..4a9db9741f1 100644
--- a/test/e2e/audit.go
+++ b/test/e2e/node/audit.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package e2e
+package node
 
 import (
     "bufio"
@@ -29,7 +29,7 @@ import (
     . "github.com/onsi/gomega"
 )
 
-var _ = framework.KubeDescribe("Advanced Audit [Feature:Audit]", func() {
+var _ = SIGDescribe("Advanced Audit [Feature:Audit]", func() {
     f := framework.NewDefaultFramework("audit")
 
     It("should audit API calls", func() {
diff --git a/test/e2e/kubelet.go b/test/e2e/node/kubelet.go
similarity index 98%
rename from test/e2e/kubelet.go
rename to test/e2e/node/kubelet.go
index 37b05d81b31..0e7276c00bd 100644
--- a/test/e2e/kubelet.go
+++ b/test/e2e/node/kubelet.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package e2e
+package node
 
 import (
     "fmt"
@@ -270,7 +270,7 @@ func checkPodCleanup(c clientset.Interface, pod *v1.Pod, expectClean bool) {
     }
 }
 
-var _ = framework.KubeDescribe("kubelet", func() {
+var _ = SIGDescribe("kubelet", func() {
     var (
         c clientset.Interface
         ns string
@@ -282,7 +282,7 @@ var _ = framework.KubeDescribe("kubelet", func() {
         ns = f.Namespace.Name
     })
 
-    framework.KubeDescribe("Clean up pods on node", func() {
+    SIGDescribe("Clean up pods on node", func() {
         var (
             numNodes int
             nodeNames sets.String
@@ -383,7 +383,7 @@ var _ = framework.KubeDescribe("kubelet", func() {
     })
 
     // Test host cleanup when disrupting the volume environment.
-    framework.KubeDescribe("host cleanup with volume mounts [sig-storage][HostCleanup][Flaky]", func() {
+    SIGDescribe("host cleanup with volume mounts [sig-storage][HostCleanup][Flaky]", func() {
 
         type hostCleanupTest struct {
             itDescr string
diff --git a/test/e2e/kubelet_perf.go b/test/e2e/node/kubelet_perf.go
similarity index 97%
rename from test/e2e/kubelet_perf.go
rename to test/e2e/node/kubelet_perf.go
index 366b6e3751d..f9185345f98 100644
--- a/test/e2e/kubelet_perf.go
+++ b/test/e2e/node/kubelet_perf.go
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package e2e
+package node
 
 import (
     "fmt"
@@ -191,7 +191,7 @@ func verifyCPULimits(expected framework.ContainersCPUSummary, actual framework.N
 }
 
 // Slow by design (1 hour)
-var _ = framework.KubeDescribe("Kubelet [Serial] [Slow]", func() {
+var _ = SIGDescribe("Kubelet [Serial] [Slow]", func() {
     var nodeNames sets.String
     f := framework.NewDefaultFramework("kubelet-perf")
     var om *framework.RuntimeOperationMonitor
@@ -219,7 +219,7 @@ var _ = framework.KubeDescribe("Kubelet [Serial] [Slow]", func() {
         result := om.GetLatestRuntimeOperationErrorRate()
         framework.Logf("runtime operation error metrics:\n%s", framework.FormatRuntimeOperationErrorRate(result))
     })
-    framework.KubeDescribe("regular resource usage tracking", func() {
+    SIGDescribe("regular resource usage tracking", func() {
         // We assume that the scheduler will make reasonable scheduling choices
         // and assign ~N pods on the node.
         // Although we want to track N pods per node, there are N + add-on pods
@@ -271,7 +271,7 @@ var _ = framework.KubeDescribe("Kubelet [Serial] [Slow]", func() {
             })
         }
     })
-    framework.KubeDescribe("experimental resource usage tracking [Feature:ExperimentalResourceUsageTracking]", func() {
+    SIGDescribe("experimental resource usage tracking [Feature:ExperimentalResourceUsageTracking]", func() {
         density := []int{100}
         for i := range density {
             podsPerNode := density[i]
diff --git a/test/e2e/node/nodeoutofdisk.go b/test/e2e/node/nodeoutofdisk.go
new file mode 100644
index 00000000000..f2cf9dd2d2f
--- /dev/null
+++ b/test/e2e/node/nodeoutofdisk.go
@@ -0,0 +1,269 @@
+/*
+Copyright 2015 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package node
+
+import (
+    "encoding/json"
+    "fmt"
+    "time"
+
+    cadvisorapi "github.com/google/cadvisor/info/v1"
+    "k8s.io/api/core/v1"
+    "k8s.io/apimachinery/pkg/api/resource"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/fields"
+    "k8s.io/apimachinery/pkg/util/wait"
+    clientset "k8s.io/client-go/kubernetes"
+    "k8s.io/kubernetes/test/e2e/framework"
+
+    . "github.com/onsi/ginkgo"
+    . "github.com/onsi/gomega"
+)
+
+const (
+    mb = 1024 * 1024
+    gb = 1024 * mb
+
+    // TODO(madhusudancs): find a way to query kubelet's disk space manager to obtain this value. 256MB
+    // is the default that is set today. This test might break if the default value changes. This value
+    // can be configured by setting the "low-diskspace-threshold-mb" flag while starting a kubelet.
+    // However, kubelets are started as part of the cluster start up, once, before any e2e test is run,
+    // and remain unchanged until all the tests are run and the cluster is brought down. Changing the
+    // flag value affects all the e2e tests. So we are hard-coding this value for now.
+    lowDiskSpaceThreshold uint64 = 256 * mb
+
+    nodeOODTimeOut = 5 * time.Minute
+
+    numNodeOODPods = 3
+)
+
+// Plan:
+// 1. Fill disk space on all nodes except one. One node is left out so that we can schedule pods
+// on that node. Arbitrarily choose that node to be node with index 0. This makes this a disruptive test.
+// 2. Get the CPU capacity on unfilled node.
+// 3. Divide the available CPU into one less than the number of pods we want to schedule. We want
+// to schedule 3 pods, so divide CPU capacity by 2.
+// 4. Request the divided CPU for each pod.
+// 5. Observe that 2 of the pods schedule onto the node whose disk is not full, and the remaining
+// pod stays pending and does not schedule onto the nodes whose disks are full nor the node
+// with the other two pods, since there is not enough free CPU capacity there.
+// 6. Recover disk space from one of the nodes whose disk space was previously filled. Arbitrarily
+// choose that node to be node with index 1.
+// 7. Observe that the pod in pending status schedules on that node.
+//
+// Flaky issue #20015. We have no clear path for how to test this functionality in a non-flaky way.
+var _ = SIGDescribe("NodeOutOfDisk [Serial] [Flaky] [Disruptive]", func() {
+    var c clientset.Interface
+    var unfilledNodeName, recoveredNodeName string
+    f := framework.NewDefaultFramework("node-outofdisk")
+
+    BeforeEach(func() {
+        c = f.ClientSet
+
+        framework.Skipf("test is broken. #40249")
+
+        nodelist := framework.GetReadySchedulableNodesOrDie(c)
+
+        // Skip this test on small clusters. No need to fail since it is not a use
+        // case that any cluster of small size needs to support.
+        framework.SkipUnlessNodeCountIsAtLeast(2)
+
+        unfilledNodeName = nodelist.Items[0].Name
+        for _, node := range nodelist.Items[1:] {
+            fillDiskSpace(c, &node)
+        }
+    })
+
+    AfterEach(func() {
+
+        nodelist := framework.GetReadySchedulableNodesOrDie(c)
+        Expect(len(nodelist.Items)).ToNot(BeZero())
+        for _, node := range nodelist.Items {
+            if unfilledNodeName == node.Name || recoveredNodeName == node.Name {
+                continue
+            }
+            recoverDiskSpace(c, &node)
+        }
+    })
+
+    It("runs out of disk space", func() {
+        unfilledNode, err := c.Core().Nodes().Get(unfilledNodeName, metav1.GetOptions{})
+        framework.ExpectNoError(err)
+
+        By(fmt.Sprintf("Calculating CPU availability on node %s", unfilledNode.Name))
+        milliCpu, err := availCpu(c, unfilledNode)
+        framework.ExpectNoError(err)
+
+        // Per pod CPU should be just enough to fit only (numNodeOODPods - 1) pods on the given
+        // node. We compute this value by dividing the available CPU capacity on the node by
+        // (numNodeOODPods - 1) and subtracting ϵ from it. We arbitrarily choose ϵ to be 1%
+        // of the available CPU per pod, i.e. 0.01 * milliCpu/(numNodeOODPods-1). Instead of
+        // subtracting 1% from the value, we directly use 0.99 as the multiplier.
+        podCPU := int64(float64(milliCpu/(numNodeOODPods-1)) * 0.99)
+
+        ns := f.Namespace.Name
+        podClient := c.Core().Pods(ns)
+
+        By("Creating pods and waiting for all but one pods to be scheduled")
+
+        for i := 0; i < numNodeOODPods-1; i++ {
+            name := fmt.Sprintf("pod-node-outofdisk-%d", i)
+            createOutOfDiskPod(c, ns, name, podCPU)
+
+            framework.ExpectNoError(f.WaitForPodRunning(name))
+            pod, err := podClient.Get(name, metav1.GetOptions{})
+            framework.ExpectNoError(err)
+            Expect(pod.Spec.NodeName).To(Equal(unfilledNodeName))
+        }
+
+        pendingPodName := fmt.Sprintf("pod-node-outofdisk-%d", numNodeOODPods-1)
+        createOutOfDiskPod(c, ns, pendingPodName, podCPU)
+
+        By(fmt.Sprintf("Finding a failed scheduler event for pod %s", pendingPodName))
+        wait.Poll(2*time.Second, 5*time.Minute, func() (bool, error) {
+            selector := fields.Set{
+                "involvedObject.kind":      "Pod",
+                "involvedObject.name":      pendingPodName,
+                "involvedObject.namespace": ns,
+                "source":                   v1.DefaultSchedulerName,
+                "reason":                   "FailedScheduling",
+            }.AsSelector().String()
+            options := metav1.ListOptions{FieldSelector: selector}
+            schedEvents, err := c.Core().Events(ns).List(options)
+            framework.ExpectNoError(err)
+
+            if len(schedEvents.Items) > 0 {
+                return true, nil
+            } else {
+                return false, nil
+            }
+        })
+
+        nodelist := framework.GetReadySchedulableNodesOrDie(c)
+        Expect(len(nodelist.Items)).To(BeNumerically(">", 1))
+
+        nodeToRecover := nodelist.Items[1]
+        Expect(nodeToRecover.Name).ToNot(Equal(unfilledNodeName))
+
+        recoverDiskSpace(c, &nodeToRecover)
+        recoveredNodeName = nodeToRecover.Name
+
+        By(fmt.Sprintf("Verifying that pod %s schedules on node %s", pendingPodName, recoveredNodeName))
+        framework.ExpectNoError(f.WaitForPodRunning(pendingPodName))
+        pendingPod, err := podClient.Get(pendingPodName, metav1.GetOptions{})
+        framework.ExpectNoError(err)
+        Expect(pendingPod.Spec.NodeName).To(Equal(recoveredNodeName))
+    })
+})
+
+// createOutOfDiskPod creates a pod in the given namespace with the requested amount of CPU.
+func createOutOfDiskPod(c clientset.Interface, ns, name string, milliCPU int64) {
+    podClient := c.Core().Pods(ns)
+
+    pod := &v1.Pod{
+        ObjectMeta: metav1.ObjectMeta{
+            Name: name,
+        },
+        Spec: v1.PodSpec{
+            Containers: []v1.Container{
+                {
+                    Name:  "pause",
+                    Image: framework.GetPauseImageName(c),
+                    Resources: v1.ResourceRequirements{
+                        Requests: v1.ResourceList{
+                            // Request enough CPU to fit only two pods on a given node.
+                            v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI),
+                        },
+                    },
+                },
+            },
+        },
+    }
+
+    _, err := podClient.Create(pod)
+    framework.ExpectNoError(err)
+}
+
+// availCpu calculates the available CPU on a given node by subtracting the CPU requested by
+// all the pods from the total available CPU capacity on the node.
+func availCpu(c clientset.Interface, node *v1.Node) (int64, error) {
+    podClient := c.Core().Pods(metav1.NamespaceAll)
+
+    selector := fields.Set{"spec.nodeName": node.Name}.AsSelector().String()
+    options := metav1.ListOptions{FieldSelector: selector}
+    pods, err := podClient.List(options)
+    if err != nil {
+        return 0, fmt.Errorf("failed to retrieve all the pods on node %s: %v", node.Name, err)
+    }
+    avail := node.Status.Capacity.Cpu().MilliValue()
+    for _, pod := range pods.Items {
+        for _, cont := range pod.Spec.Containers {
+            avail -= cont.Resources.Requests.Cpu().MilliValue()
+        }
+    }
+    return avail, nil
+}
+
+// availSize returns the available disk space on a given node by querying node stats which
+// is in turn obtained internally from cadvisor.
+func availSize(c clientset.Interface, node *v1.Node) (uint64, error) {
+    statsResource := fmt.Sprintf("api/v1/proxy/nodes/%s/stats/", node.Name)
+    framework.Logf("Querying stats for node %s using url %s", node.Name, statsResource)
+    res, err := c.Core().RESTClient().Get().AbsPath(statsResource).Timeout(time.Minute).Do().Raw()
+    if err != nil {
+        return 0, fmt.Errorf("error querying cAdvisor API: %v", err)
+    }
+    ci := cadvisorapi.ContainerInfo{}
+    err = json.Unmarshal(res, &ci)
+    if err != nil {
+        return 0, fmt.Errorf("couldn't unmarshal container info: %v", err)
+    }
+    return ci.Stats[len(ci.Stats)-1].Filesystem[0].Available, nil
+}
+
+// fillDiskSpace fills the available disk space on a given node by creating a large file. The disk
+// space on the node is filled in such a way that the available space after filling the disk is just
+// below the lowDiskSpaceThreshold mark.
+func fillDiskSpace(c clientset.Interface, node *v1.Node) {
+    avail, err := availSize(c, node)
+    framework.ExpectNoError(err, "Node %s: couldn't obtain available disk size %v", node.Name, err)
+
+    fillSize := (avail - lowDiskSpaceThreshold + (100 * mb))
+
+    framework.Logf("Node %s: disk space available %d bytes", node.Name, avail)
+    By(fmt.Sprintf("Node %s: creating a file of size %d bytes to fill the available disk space", node.Name, fillSize))
+
+    cmd := fmt.Sprintf("fallocate -l %d test.img", fillSize)
+    framework.ExpectNoError(framework.IssueSSHCommand(cmd, framework.TestContext.Provider, node))
+
+    ood := framework.WaitForNodeToBe(c, node.Name, v1.NodeOutOfDisk, true, nodeOODTimeOut)
+    Expect(ood).To(BeTrue(), "Node %s did not run out of disk within %v", node.Name, nodeOODTimeOut)
+
+    avail, err = availSize(c, node)
+    framework.Logf("Node %s: disk space available %d bytes", node.Name, avail)
+    Expect(avail < lowDiskSpaceThreshold).To(BeTrue())
+}
+
+// recoverDiskSpace recovers disk space, filled by creating a large file, on a given node.
+func recoverDiskSpace(c clientset.Interface, node *v1.Node) {
+    By(fmt.Sprintf("Recovering disk space on node %s", node.Name))
+    cmd := "rm -f test.img"
+    framework.ExpectNoError(framework.IssueSSHCommand(cmd, framework.TestContext.Provider, node))
+
+    ood := framework.WaitForNodeToBe(c, node.Name, v1.NodeOutOfDisk, false, nodeOODTimeOut)
+    Expect(ood).To(BeTrue(), "Node %s's out of disk condition status did not change to false within %v", node.Name, nodeOODTimeOut)
+}
diff --git a/test/e2e/security_context.go b/test/e2e/node/security_context.go
similarity index 98%
rename from test/e2e/security_context.go
rename to test/e2e/node/security_context.go
index f7523eb07be..5dcd8dc4fd8 100644
--- a/test/e2e/security_context.go
+++ b/test/e2e/node/security_context.go
@@ -20,7 +20,7 @@ limitations under the License.
 * so they are skipped by default.
 */
 
-package e2e
+package node
 
 import (
     "fmt"
@@ -59,7 +59,7 @@ func scTestPod(hostIPC bool, hostPID bool) *v1.Pod {
     return pod
 }
 
-var _ = framework.KubeDescribe("Security Context [Feature:SecurityContext]", func() {
+var _ = SIGDescribe("Security Context [Feature:SecurityContext]", func() {
     f := framework.NewDefaultFramework("security-context")
 
     It("should support pod.Spec.SecurityContext.SupplementalGroups", func() {
diff --git a/test/e2e/node/sig.go b/test/e2e/node/sig.go
new file mode 100644
index 00000000000..a206bef7008
--- /dev/null
+++ b/test/e2e/node/sig.go
@@ -0,0 +1,23 @@
+/*
+Copyright 2017 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package node
+
+import "k8s.io/kubernetes/test/e2e/framework"
+
+func SIGDescribe(text string, body func()) bool {
+    return framework.KubeDescribe("[sig-node] "+text, body)
+}
diff --git a/test/e2e/serviceloadbalancers.go b/test/e2e/serviceloadbalancers.go
index a5dd4868ac4..0c37370d1c1 100644
--- a/test/e2e/serviceloadbalancers.go
+++ b/test/e2e/serviceloadbalancers.go
@@ -19,6 +19,7 @@ package e2e
 import (
     "fmt"
     "net/http"
+    "time"
 
     "k8s.io/api/core/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -121,7 +122,7 @@ func (h *haproxyControllerTester) start(namespace string) (err error) {
 
     // Find the external addresses of the nodes the pods are running on.
     for _, p := range pods.Items {
-        wait.Poll(pollInterval, framework.ServiceRespondingTimeout, func() (bool, error) {
+        wait.Poll(1*time.Second, framework.ServiceRespondingTimeout, func() (bool, error) {
            address, err := framework.GetHostExternalAddress(h.client, &p)
            if err != nil {
                framework.Logf("%v", err)
@@ -202,7 +203,7 @@ func (s *ingManager) start(namespace string) (err error) {
 func (s *ingManager) test(path string) error {
     url := fmt.Sprintf("%v/hostName", path)
     httpClient := &http.Client{}
-    return wait.Poll(pollInterval, framework.ServiceRespondingTimeout, func() (bool, error) {
+    return wait.Poll(1*time.Second, framework.ServiceRespondingTimeout, func() (bool, error) {
        body, err := framework.SimpleGET(httpClient, url, "")
        if err != nil {
            framework.Logf("%v\n%v\n%v", url, body, err)
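
A minimal sketch of how a test file in the new test/e2e/node package picks up the sig-node tag through the SIGDescribe helper added in sig.go above. The "Example" describe text, the spec body, and the file itself are hypothetical and not part of this diff; only the SIGDescribe, framework.NewDefaultFramework, framework.Logf, and ginkgo calls come from code shown in this change:

package node

import (
    "k8s.io/kubernetes/test/e2e/framework"

    . "github.com/onsi/ginkgo"
)

// Hypothetical example spec: SIGDescribe prepends "[sig-node] " to the describe
// text and then delegates to framework.KubeDescribe, so every spec declared this
// way is grouped under sig-node without repeating the tag in each file.
var _ = SIGDescribe("Example", func() {
    f := framework.NewDefaultFramework("example")

    It("runs as part of the sig-node suite", func() {
        // The framework creates a per-spec namespace in its BeforeEach.
        framework.Logf("running in namespace %s", f.Namespace.Name)
    })
})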