From 54c7d8fbb1d9725b37b8836904170560e30c1a89 Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Wed, 3 Mar 2021 19:14:20 +0100 Subject: [PATCH] e2e: TM: add option to fail instead of skip The Topology Manager e2e tests want to run on real multi-NUMA systems and to consume real devices supported by device plugins; SRIOV devices happen to be the most commonly available of such devices. CI machines are not multi-NUMA and do not expose SRIOV devices, so the biggest portion of the tests will just skip, and we need to keep it like this until we figure out how to enable these features. However, some organizations can and want to run the testsuite on bare metal; in this case, the current tests will skip (not fail) on misconfigured boxes, which reports a misleading result. It is much better to fail if the test preconditions aren't met. To satisfy both needs, we add an option, controlled by the `require-devices` command-line flag, to fail (not skip) if the machine on which the tests run doesn't meet the expectations (multi-NUMA, 4+ cores per NUMA cell, exposes SRIOV VFs). We keep the old behaviour as the default to remain CI friendly. 
Signed-off-by: Francesco Romani --- test/e2e/framework/test_context.go | 4 ++ test/e2e_node/e2e_node_suite_test.go | 1 + test/e2e_node/podresources_test.go | 24 +++++----- test/e2e_node/topology_manager_test.go | 63 +++++++++---------------- test/e2e_node/util_sriov.go | 21 +++++++++ test/e2e_node/util_sriov_linux.go | 36 ++++++++++++++ test/e2e_node/util_sriov_unsupported.go | 24 ++++++++++ 7 files changed, 119 insertions(+), 54 deletions(-) create mode 100644 test/e2e_node/util_sriov_linux.go create mode 100644 test/e2e_node/util_sriov_unsupported.go diff --git a/test/e2e/framework/test_context.go b/test/e2e/framework/test_context.go index 81e1204e147..db8cbea44ce 100644 --- a/test/e2e/framework/test_context.go +++ b/test/e2e/framework/test_context.go @@ -187,6 +187,10 @@ type TestContextType struct { // SnapshotControllerHTTPPort the port used for communicating with the snapshot controller HTTP endpoint. SnapshotControllerHTTPPort int + + // RequireDevices makes mandatory on the environment on which tests are run 1+ devices exposed through device plugins. 
+ // With this enabled The e2e tests requiring devices for their operation can assume that if devices aren't reported, the test can fail + RequireDevices bool } // NodeKillerConfig describes configuration of NodeKiller -- a utility to diff --git a/test/e2e_node/e2e_node_suite_test.go b/test/e2e_node/e2e_node_suite_test.go index 9aa958eaf68..9594b5c67d8 100644 --- a/test/e2e_node/e2e_node_suite_test.go +++ b/test/e2e_node/e2e_node_suite_test.go @@ -91,6 +91,7 @@ func registerNodeFlags(flags *flag.FlagSet) { flags.StringVar(&framework.TestContext.SriovdpConfigMapFile, "sriovdp-configmap-file", "", "The name of the SRIOV device plugin Config Map to load.") flag.StringVar(&framework.TestContext.ClusterDNSDomain, "dns-domain", "", "The DNS Domain of the cluster.") flag.Var(cliflag.NewMapStringString(&framework.TestContext.RuntimeConfig), "runtime-config", "The runtime configuration used on node e2e tests.") + flags.BoolVar(&framework.TestContext.RequireDevices, "require-devices", false, "If true, require device plugins to be installed in the running environment.") } func init() { diff --git a/test/e2e_node/podresources_test.go b/test/e2e_node/podresources_test.go index 2384cd9d68c..e72ead5df49 100644 --- a/test/e2e_node/podresources_test.go +++ b/test/e2e_node/podresources_test.go @@ -490,9 +490,8 @@ var _ = SIGDescribe("POD Resources [Serial] [Feature:PodResources][NodeFeature:P if cpuAlloc < minCoreCount { e2eskipper.Skipf("Skipping CPU Manager tests since the CPU allocatable < %d", minCoreCount) } - if sriovdevCount, err := countSRIOVDevices(); err != nil || sriovdevCount == 0 { - e2eskipper.Skipf("this test is meant to run on a system with at least one configured VF from SRIOV device") - } + + requireSRIOVDevices() onlineCPUs, err := getOnlineCPUs() framework.ExpectNoError(err) @@ -532,9 +531,7 @@ var _ = SIGDescribe("POD Resources [Serial] [Feature:PodResources][NodeFeature:P ginkgo.It("should return the expected responses with cpumanager none policy", func() { // 
current default is "none" policy - no need to restart the kubelet - if sriovdevCount, err := countSRIOVDevices(); err != nil || sriovdevCount == 0 { - e2eskipper.Skipf("this test is meant to run on a system with at least one configured VF from SRIOV device") - } + requireSRIOVDevices() oldCfg := enablePodResourcesFeatureGateInKubelet(f) defer func() { @@ -575,9 +572,8 @@ var _ = SIGDescribe("POD Resources [Serial] [Feature:PodResources][NodeFeature:P if cpuAlloc < minCoreCount { e2eskipper.Skipf("Skipping CPU Manager tests since the CPU allocatable < %d", minCoreCount) } - if sriovdevCount, err := countSRIOVDevices(); err != nil || sriovdevCount > 0 { - e2eskipper.Skipf("this test is meant to run on a system with no configured VF from SRIOV device") - } + + requireLackOfSRIOVDevices() onlineCPUs, err := getOnlineCPUs() framework.ExpectNoError(err) @@ -606,9 +602,7 @@ var _ = SIGDescribe("POD Resources [Serial] [Feature:PodResources][NodeFeature:P ginkgo.It("should return the expected responses with cpumanager none policy", func() { // current default is "none" policy - no need to restart the kubelet - if sriovdevCount, err := countSRIOVDevices(); err != nil || sriovdevCount > 0 { - e2eskipper.Skipf("this test is meant to run on a system with no configured VF from SRIOV device") - } + requireLackOfSRIOVDevices() oldCfg := enablePodResourcesFeatureGateInKubelet(f) defer func() { @@ -651,6 +645,12 @@ var _ = SIGDescribe("POD Resources [Serial] [Feature:PodResources][NodeFeature:P }) }) +func requireLackOfSRIOVDevices() { + if sriovdevCount, err := countSRIOVDevices(); err != nil || sriovdevCount > 0 { + e2eskipper.Skipf("this test is meant to run on a system with no configured VF from SRIOV device") + } +} + func getOnlineCPUs() (cpuset.CPUSet, error) { onlineCPUList, err := ioutil.ReadFile("/sys/devices/system/cpu/online") if err != nil { diff --git a/test/e2e_node/topology_manager_test.go b/test/e2e_node/topology_manager_test.go index 9ac3afe1a0e..6b43ffd6256 100644 
--- a/test/e2e_node/topology_manager_test.go +++ b/test/e2e_node/topology_manager_test.go @@ -90,20 +90,6 @@ func detectCoresPerSocket() int { return coreCount } -func countSRIOVDevices() (int, error) { - outData, err := exec.Command("/bin/sh", "-c", "ls /sys/bus/pci/devices/*/physfn | wc -w").Output() - if err != nil { - return -1, err - } - return strconv.Atoi(strings.TrimSpace(string(outData))) -} - -func detectSRIOVDevices() int { - devCount, err := countSRIOVDevices() - framework.ExpectNoError(err) - return devCount -} - func makeContainers(ctnCmd string, ctnAttributes []tmCtnAttribute) (ctns []v1.Container) { for _, ctnAttr := range ctnAttributes { ctn := v1.Container{ @@ -898,21 +884,7 @@ func runTopologyManagerTests(f *framework.Framework) { }) ginkgo.It("run Topology Manager node alignment test suite", func() { - // this is a very rough check. We just want to rule out system that does NOT have - // any SRIOV device. A more proper check will be done in runTopologyManagerPositiveTest - sriovdevCount := detectSRIOVDevices() - numaNodes := detectNUMANodes() - coreCount := detectCoresPerSocket() - - if numaNodes < minNumaNodes { - e2eskipper.Skipf("this test is meant to run on a multi-node NUMA system") - } - if coreCount < minCoreCount { - e2eskipper.Skipf("this test is meant to run on a system with at least 4 cores per socket") - } - if sriovdevCount == 0 { - e2eskipper.Skipf("this test is meant to run on a system with at least one configured VF from SRIOV device") - } + numaNodes, coreCount := hostPrecheck() configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile) @@ -935,19 +907,7 @@ func runTopologyManagerTests(f *framework.Framework) { }) ginkgo.It("run the Topology Manager pod scope alignment test suite", func() { - sriovdevCount := detectSRIOVDevices() - numaNodes := detectNUMANodes() - coreCount := detectCoresPerSocket() - - if numaNodes < minNumaNodes { - e2eskipper.Skipf("this test is intended to be run on a multi-node 
NUMA system") - } - if coreCount < minCoreCount { - e2eskipper.Skipf("this test is intended to be run on a system with at least %d cores per socket", minCoreCount) - } - if sriovdevCount == 0 { - e2eskipper.Skipf("this test is intended to be run on a system with at least one SR-IOV VF enabled") - } + numaNodes, coreCount := hostPrecheck() configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile) @@ -968,6 +928,25 @@ func runTopologyManagerTests(f *framework.Framework) { }) } +func hostPrecheck() (int, int) { + // this is a very rough check. We just want to rule out system that does NOT have + // any SRIOV device. A more proper check will be done in runTopologyManagerPositiveTest + + numaNodes := detectNUMANodes() + if numaNodes < minNumaNodes { + e2eskipper.Skipf("this test is intended to be run on a multi-node NUMA system") + } + + coreCount := detectCoresPerSocket() + if coreCount < minCoreCount { + e2eskipper.Skipf("this test is intended to be run on a system with at least %d cores per socket", minCoreCount) + } + + requireSRIOVDevices() + + return numaNodes, coreCount +} + // Serial because the test updates kubelet configuration. var _ = SIGDescribe("Topology Manager [Serial] [Feature:TopologyManager][NodeFeature:TopologyManager]", func() { f := framework.NewDefaultFramework("topology-manager-test") diff --git a/test/e2e_node/util_sriov.go b/test/e2e_node/util_sriov.go index 4b404332157..569315bac53 100644 --- a/test/e2e_node/util_sriov.go +++ b/test/e2e_node/util_sriov.go @@ -16,6 +16,11 @@ limitations under the License. package e2enode +import ( + "k8s.io/kubernetes/test/e2e/framework" + e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" +) + const ( // SRIOVDevicePluginCMYAML is the path of the config map to configure the sriov device plugin. 
SRIOVDevicePluginCMYAML = "test/e2e_node/testing-manifests/sriovdp-cm.yaml" @@ -26,3 +31,19 @@ const ( // SRIOVDevicePluginName is the name of the device plugin pod SRIOVDevicePluginName = "sriov-device-plugin" ) + +func requireSRIOVDevices() { + sriovdevCount, err := countSRIOVDevices() + framework.ExpectNoError(err) + + if sriovdevCount > 0 { + return // all good + } + + msg := "this test is meant to run on a system with at least one configured VF from SRIOV device" + if framework.TestContext.RequireDevices { + framework.Failf(msg) + } else { + e2eskipper.Skipf(msg) + } +} diff --git a/test/e2e_node/util_sriov_linux.go b/test/e2e_node/util_sriov_linux.go new file mode 100644 index 00000000000..1c516baf083 --- /dev/null +++ b/test/e2e_node/util_sriov_linux.go @@ -0,0 +1,36 @@ +//go:build linux +// +build linux + +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2enode + +import ( + "os/exec" + "strconv" + "strings" +) + +// countSRIOVDevices provides a rough estimate of SRIOV Virtual Functions available on the system. +// This is a rough check we use to rule out unsuitable systems, not to detect suitable systems. 
+func countSRIOVDevices() (int, error) { + outData, err := exec.Command("/bin/sh", "-c", "ls /sys/bus/pci/devices/*/physfn | wc -w").Output() + if err != nil { + return -1, err + } + return strconv.Atoi(strings.TrimSpace(string(outData))) +} diff --git a/test/e2e_node/util_sriov_unsupported.go b/test/e2e_node/util_sriov_unsupported.go new file mode 100644 index 00000000000..812b833f5be --- /dev/null +++ b/test/e2e_node/util_sriov_unsupported.go @@ -0,0 +1,24 @@ +//go:build !linux +// +build !linux + +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2enode + +func countSRIOVDevices() (int, error) { + return 0, nil +}