e2e: TM: add option to fail instead of skip

The Topology Manager e2e tests want to run on a real multi-NUMA system
and want to consume real devices supported by device plugins; SRIOV
devices happen to be the most commonly available of such devices.

CI machines aren't multi-NUMA and don't expose SRIOV devices, so the biggest portion
of the tests will just skip, and we need to keep it like this until we
figure out how to enable these features.

However, some organizations can and want to run the testsuite on bare metal;
in this case, the current test will skip (not fail) with misconfigured
boxes, and this reports a misleading result. It will be much better to
fail if the test preconditions aren't met.

To satisfy both needs, we add an option, controlled by an environment
variable, to fail (not skip) if the machine on which the tests run
doesn't meet the expectations (multi-NUMA, 4+ cores per NUMA cell,
exposes SRIOV VFs).
We keep the old behaviour as default to keep being CI friendly.

Signed-off-by: Francesco Romani <fromani@redhat.com>
This commit is contained in:
Francesco Romani 2021-03-03 19:14:20 +01:00
parent dd2d12f6dc
commit 54c7d8fbb1
7 changed files with 119 additions and 54 deletions

View File

@ -187,6 +187,10 @@ type TestContextType struct {
// SnapshotControllerHTTPPort the port used for communicating with the snapshot controller HTTP endpoint.
SnapshotControllerHTTPPort int
// RequireDevices makes it mandatory for the environment on which tests are run to expose 1+ devices through device plugins.
// With this enabled, the e2e tests requiring devices for their operation can assume that if devices aren't reported, the test can fail.
RequireDevices bool
}
// NodeKillerConfig describes configuration of NodeKiller -- a utility to

View File

@ -91,6 +91,7 @@ func registerNodeFlags(flags *flag.FlagSet) {
flags.StringVar(&framework.TestContext.SriovdpConfigMapFile, "sriovdp-configmap-file", "", "The name of the SRIOV device plugin Config Map to load.")
flag.StringVar(&framework.TestContext.ClusterDNSDomain, "dns-domain", "", "The DNS Domain of the cluster.")
flag.Var(cliflag.NewMapStringString(&framework.TestContext.RuntimeConfig), "runtime-config", "The runtime configuration used on node e2e tests.")
flags.BoolVar(&framework.TestContext.RequireDevices, "require-devices", false, "If true, require device plugins to be installed in the running environment.")
}
func init() {

View File

@ -490,9 +490,8 @@ var _ = SIGDescribe("POD Resources [Serial] [Feature:PodResources][NodeFeature:P
if cpuAlloc < minCoreCount {
e2eskipper.Skipf("Skipping CPU Manager tests since the CPU allocatable < %d", minCoreCount)
}
if sriovdevCount, err := countSRIOVDevices(); err != nil || sriovdevCount == 0 {
e2eskipper.Skipf("this test is meant to run on a system with at least one configured VF from SRIOV device")
}
requireSRIOVDevices()
onlineCPUs, err := getOnlineCPUs()
framework.ExpectNoError(err)
@ -532,9 +531,7 @@ var _ = SIGDescribe("POD Resources [Serial] [Feature:PodResources][NodeFeature:P
ginkgo.It("should return the expected responses with cpumanager none policy", func() {
// current default is "none" policy - no need to restart the kubelet
if sriovdevCount, err := countSRIOVDevices(); err != nil || sriovdevCount == 0 {
e2eskipper.Skipf("this test is meant to run on a system with at least one configured VF from SRIOV device")
}
requireSRIOVDevices()
oldCfg := enablePodResourcesFeatureGateInKubelet(f)
defer func() {
@ -575,9 +572,8 @@ var _ = SIGDescribe("POD Resources [Serial] [Feature:PodResources][NodeFeature:P
if cpuAlloc < minCoreCount {
e2eskipper.Skipf("Skipping CPU Manager tests since the CPU allocatable < %d", minCoreCount)
}
if sriovdevCount, err := countSRIOVDevices(); err != nil || sriovdevCount > 0 {
e2eskipper.Skipf("this test is meant to run on a system with no configured VF from SRIOV device")
}
requireLackOfSRIOVDevices()
onlineCPUs, err := getOnlineCPUs()
framework.ExpectNoError(err)
@ -606,9 +602,7 @@ var _ = SIGDescribe("POD Resources [Serial] [Feature:PodResources][NodeFeature:P
ginkgo.It("should return the expected responses with cpumanager none policy", func() {
// current default is "none" policy - no need to restart the kubelet
if sriovdevCount, err := countSRIOVDevices(); err != nil || sriovdevCount > 0 {
e2eskipper.Skipf("this test is meant to run on a system with no configured VF from SRIOV device")
}
requireLackOfSRIOVDevices()
oldCfg := enablePodResourcesFeatureGateInKubelet(f)
defer func() {
@ -651,6 +645,12 @@ var _ = SIGDescribe("POD Resources [Serial] [Feature:PodResources][NodeFeature:P
})
})
// requireLackOfSRIOVDevices lets the calling test proceed only when
// countSRIOVDevices succeeds and reports no devices. In every other case
// (a counting error, or a positive count) the test is skipped.
func requireLackOfSRIOVDevices() {
	vfCount, countErr := countSRIOVDevices()
	if countErr == nil && vfCount <= 0 {
		return // nothing detected: the precondition holds
	}
	e2eskipper.Skipf("this test is meant to run on a system with no configured VF from SRIOV device")
}
func getOnlineCPUs() (cpuset.CPUSet, error) {
onlineCPUList, err := ioutil.ReadFile("/sys/devices/system/cpu/online")
if err != nil {

View File

@ -90,20 +90,6 @@ func detectCoresPerSocket() int {
return coreCount
}
// countSRIOVDevices runs a shell pipeline that lists the physfn entries under
// /sys/bus/pci/devices and counts the words printed, then parses that count.
// It returns -1 together with the error when the command itself fails; a
// parse failure is reported with whatever strconv.Atoi returns.
func countSRIOVDevices() (int, error) {
	cmd := exec.Command("/bin/sh", "-c", "ls /sys/bus/pci/devices/*/physfn | wc -w")
	raw, err := cmd.Output()
	if err != nil {
		return -1, err
	}
	wordCount := strings.TrimSpace(string(raw))
	return strconv.Atoi(wordCount)
}
// detectSRIOVDevices returns the count reported by countSRIOVDevices, after
// handing any counting error to framework.ExpectNoError.
func detectSRIOVDevices() int {
	vfCount, countErr := countSRIOVDevices()
	framework.ExpectNoError(countErr)
	return vfCount
}
func makeContainers(ctnCmd string, ctnAttributes []tmCtnAttribute) (ctns []v1.Container) {
for _, ctnAttr := range ctnAttributes {
ctn := v1.Container{
@ -898,21 +884,7 @@ func runTopologyManagerTests(f *framework.Framework) {
})
ginkgo.It("run Topology Manager node alignment test suite", func() {
// this is a very rough check. We just want to rule out system that does NOT have
// any SRIOV device. A more proper check will be done in runTopologyManagerPositiveTest
sriovdevCount := detectSRIOVDevices()
numaNodes := detectNUMANodes()
coreCount := detectCoresPerSocket()
if numaNodes < minNumaNodes {
e2eskipper.Skipf("this test is meant to run on a multi-node NUMA system")
}
if coreCount < minCoreCount {
e2eskipper.Skipf("this test is meant to run on a system with at least 4 cores per socket")
}
if sriovdevCount == 0 {
e2eskipper.Skipf("this test is meant to run on a system with at least one configured VF from SRIOV device")
}
numaNodes, coreCount := hostPrecheck()
configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)
@ -935,19 +907,7 @@ func runTopologyManagerTests(f *framework.Framework) {
})
ginkgo.It("run the Topology Manager pod scope alignment test suite", func() {
sriovdevCount := detectSRIOVDevices()
numaNodes := detectNUMANodes()
coreCount := detectCoresPerSocket()
if numaNodes < minNumaNodes {
e2eskipper.Skipf("this test is intended to be run on a multi-node NUMA system")
}
if coreCount < minCoreCount {
e2eskipper.Skipf("this test is intended to be run on a system with at least %d cores per socket", minCoreCount)
}
if sriovdevCount == 0 {
e2eskipper.Skipf("this test is intended to be run on a system with at least one SR-IOV VF enabled")
}
numaNodes, coreCount := hostPrecheck()
configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)
@ -968,6 +928,25 @@ func runTopologyManagerTests(f *framework.Framework) {
})
}
// hostPrecheck checks the host against the preconditions of the Topology
// Manager suites and returns, in this order, the detected number of NUMA nodes
// and the detected number of cores per socket.
//
// The calling test is skipped when detectNUMANodes reports fewer than
// minNumaNodes or detectCoresPerSocket reports fewer than minCoreCount. The
// SRIOV device precondition is delegated to requireSRIOVDevices.
func hostPrecheck() (int, int) {
	nodes := detectNUMANodes()
	if minNumaNodes > nodes {
		e2eskipper.Skipf("this test is intended to be run on a multi-node NUMA system")
	}

	cores := detectCoresPerSocket()
	if minCoreCount > cores {
		e2eskipper.Skipf("this test is intended to be run on a system with at least %d cores per socket", minCoreCount)
	}

	// This is only a very rough check: it is meant to rule out systems that do
	// NOT have any SRIOV device. A more proper check is done in
	// runTopologyManagerPositiveTest.
	requireSRIOVDevices()

	return nodes, cores
}
// Serial because the test updates kubelet configuration.
var _ = SIGDescribe("Topology Manager [Serial] [Feature:TopologyManager][NodeFeature:TopologyManager]", func() {
f := framework.NewDefaultFramework("topology-manager-test")

View File

@ -16,6 +16,11 @@ limitations under the License.
package e2enode
import (
"k8s.io/kubernetes/test/e2e/framework"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
)
const (
// SRIOVDevicePluginCMYAML is the path of the config map to configure the sriov device plugin.
SRIOVDevicePluginCMYAML = "test/e2e_node/testing-manifests/sriovdp-cm.yaml"
@ -26,3 +31,19 @@ const (
// SRIOVDevicePluginName is the name of the device plugin pod
SRIOVDevicePluginName = "sriov-device-plugin"
)
// requireSRIOVDevices enforces the precondition that countSRIOVDevices reports
// at least one device.
//
// Any error from countSRIOVDevices is handed to framework.ExpectNoError. When
// no device is reported, the outcome depends on
// framework.TestContext.RequireDevices: if it is set the test is failed through
// framework.Failf, otherwise it is skipped through e2eskipper.Skipf.
func requireSRIOVDevices() {
	// Declared as a constant so that it is a constant format string for the
	// printf-style Failf/Skipf calls below (a variable here trips `go vet`).
	const msg = "this test is meant to run on a system with at least one configured VF from SRIOV device"

	sriovdevCount, err := countSRIOVDevices()
	framework.ExpectNoError(err)
	if sriovdevCount > 0 {
		return // all good
	}

	if framework.TestContext.RequireDevices {
		framework.Failf(msg)
	} else {
		e2eskipper.Skipf(msg)
	}
}

View File

@ -0,0 +1,36 @@
//go:build linux
// +build linux
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2enode
import (
"os/exec"
"strconv"
"strings"
)
// countSRIOVDevices gives a rough estimate of the SRIOV Virtual Functions
// available on the system: it runs a shell pipeline that lists the physfn
// entries under /sys/bus/pci/devices and counts the words printed.
// The result is only meant to rule out unsuitable systems, not to prove that a
// system is suitable.
// It returns -1 together with the error when the command itself fails; a
// parse failure is reported with whatever strconv.Atoi returns.
func countSRIOVDevices() (int, error) {
	cmd := exec.Command("/bin/sh", "-c", "ls /sys/bus/pci/devices/*/physfn | wc -w")
	raw, err := cmd.Output()
	if err != nil {
		return -1, err
	}
	wordCount := strings.TrimSpace(string(raw))
	return strconv.Atoi(wordCount)
}

View File

@ -0,0 +1,24 @@
//go:build !linux
// +build !linux
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2enode
// countSRIOVDevices is the variant built when the target is not Linux (see the
// build constraint at the top of this file). It performs no detection and
// always reports zero devices with a nil error.
func countSRIOVDevices() (int, error) {
	var none int // zero value: no devices are ever reported here
	return none, nil
}