e2e: topomgr: initial negative tests

Negative tests are tests in which we request a guaranteed (gu) Pod we
know the system cannot fulfill - hence we expect rejection from the
topology manager.

Unfortunately, besides the trivial case of excessive cores (requesting
more cores than a single NUMA node provides), we cannot easily test
devices, because crafting a proper pod would require detailed knowledge
of the hardware topology.
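
To make that trivial case concrete, here is a rough sketch of the
arithmetic the core-overflow test relies on. The figures (hyperthreading
on, 4 cores per NUMA node, one socket per node) are only assumed for the
example; the test detects the real values at runtime via isHTEnabled()
and detectCoresPerSocket():

// Illustrative sketch only, not part of this change; the numbers are assumed.
package main

import "fmt"

func main() {
	threadsPerCore := 2 // what isHTEnabled() would imply on an HT-enabled box
	coreCount := 4      // what detectCoresPerSocket() would report

	// A single NUMA node can offer at most coreCount*threadsPerCore = 8 CPUs,
	// while the request below asks for (1+threadsPerCore)*coreCount = 12 CPUs,
	// so no single node can satisfy it and admission is expected to fail.
	cpuRequest := fmt.Sprintf("%dm", (1+threadsPerCore)*coreCount*1000)
	fmt.Println(cpuRequest) // prints "12000m"
}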

Let's consider a hypothetical two-node NUMA system with two PCIe buses,
one per NUMA node, with one SRIOV device on each bus.
A proper negative test would require two SRIOV devices, which the system
can provide, but not on the same single NUMA node.
Requesting, for example, three devices (one more than the system
provides) would lead to a different, legitimate admission error.
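
For illustration only, the kind of precondition such a device-aware
negative test would need could look like the sketch below;
deviceNegativeTestPossible and the per-node device counts are made up
for the example, nothing like this is added by this change:

// Hypothetical sketch, not part of this change: given how many SRIOV devices
// each NUMA node exposes, a request for two devices is a meaningful negative
// test only if the devices exist but can never land on the same node.
package main

import "fmt"

func deviceNegativeTestPossible(devsPerNode []int) bool {
	total := 0
	for _, n := range devsPerNode {
		if n > 1 {
			return false // two devices fit on one node, so the request could be aligned
		}
		total += n
	}
	return total >= 2 // at least two devices overall, but at most one per node
}

func main() {
	fmt.Println(deviceNegativeTestPossible([]int{1, 1})) // true: the hypothetical machine above
	fmt.Println(deviceNegativeTestPossible([]int{2, 0})) // false: both devices share one node
}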

For these reasons we bootstrap the testing infra for the negative tests,
but we add just the simplest one.

Signed-off-by: Francesco Romani <fromani@redhat.com>

@@ -46,7 +46,8 @@ import (
)
const (
numalignCmd = `export CPULIST_ALLOWED=$( awk -F":\t*" '/Cpus_allowed_list/ { print $2 }' /proc/self/status); env; sleep 1d`
topologyError = "Topology Affinity Error" // XXX do we have a proper constant?
)
// Helper for makeTopologyManagerPod().
@@ -470,7 +471,7 @@ func runTopologyManagerPolicySuiteTests(f *framework.Framework) {
waitForContainerRemoval(pod2.Spec.Containers[0].Name, pod2.Name, pod2.Namespace)
}
func runTopologyManagerNodeAlignmentTest(f *framework.Framework, numaNodes, numPods int, cpuAmount, sriovResourceName, deviceAmount string) {
func runTopologyManagerPositiveTest(f *framework.Framework, numaNodes, numPods int, cpuAmount, sriovResourceName, deviceAmount string) {
ginkgo.By(fmt.Sprintf("allocate aligned resources for a %d pod(s): cpuAmount=%s %s=%s", numPods, cpuAmount, sriovResourceName, deviceAmount))
var pods []*v1.Pod
@@ -508,6 +509,45 @@ func runTopologyManagerNodeAlignmentTest(f *framework.Framework, numaNodes, numP
}
}
func runTopologyManagerNegativeTest(f *framework.Framework, numaNodes, numPods int, cpuAmount, sriovResourceName, deviceAmount string) {
	ginkgo.By(fmt.Sprintf("allocate aligned resources for a %d pod(s): cpuAmount=%s %s=%s", numPods, cpuAmount, sriovResourceName, deviceAmount))

	ctnAttrs := []tmCtnAttribute{
		{
			ctnName:       "gu-container",
			cpuRequest:    cpuAmount,
			cpuLimit:      cpuAmount,
			deviceName:    sriovResourceName,
			deviceRequest: deviceAmount,
			deviceLimit:   deviceAmount,
		},
	}

	podName := "gu-pod"
	framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
	pod := makeTopologyManagerTestPod(podName, numalignCmd, ctnAttrs)
	pod = f.PodClient().Create(pod)

	// The pod is expected to be rejected at admission, so wait for it to leave
	// the Pending phase rather than waiting for it to become Running.
	err := e2epod.WaitForPodCondition(f.ClientSet, f.Namespace.Name, pod.Name, "Failed", 30*time.Second, func(pod *v1.Pod) (bool, error) {
		if pod.Status.Phase != v1.PodPending {
			return true, nil
		}
		return false, nil
	})
	framework.ExpectNoError(err)

	pod, err = f.PodClient().Get(context.TODO(), pod.Name, metav1.GetOptions{})
	framework.ExpectNoError(err)

	// The rejection must be a topology affinity error, not a generic admission failure.
	if pod.Status.Phase != v1.PodFailed {
		framework.Failf("pod %s not failed: %v", pod.Name, pod.Status)
	}
	if pod.Status.Reason != topologyError {
		framework.Failf("pod %s failed for wrong reason: %v", pod.Name, pod.Status)
	}

	deletePods(f, []string{pod.Name})
}
func runTopologyManagerNodeAlignmentSuiteTests(f *framework.Framework, numaNodes, coreCount int) {
cmData := testfiles.ReadOrDie(SRIOVDevicePluginCMYAML)
var err error
@@ -555,13 +595,17 @@ func runTopologyManagerNodeAlignmentSuiteTests(f *framework.Framework, numaNodes
}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
framework.Logf("Successfully created device plugin pod, detected %d SRIOV device %q", sriovResourceAmount, sriovResourceName)
threadsPerCore := 1
if isHTEnabled() {
threadsPerCore = 2
}
// could have been a loop; we unroll it to explain the test cases
// simplest case: one guaranteed core, one device
runTopologyManagerNodeAlignmentTest(f, numaNodes, 1, "1000m", sriovResourceName, "1")
runTopologyManagerPositiveTest(f, numaNodes, 1, "1000m", sriovResourceName, "1")
// two guaranteed cores, one device
runTopologyManagerNodeAlignmentTest(f, numaNodes, 1, "2000m", sriovResourceName, "1")
runTopologyManagerPositiveTest(f, numaNodes, 1, "2000m", sriovResourceName, "1")
// TODO: test taking an entire NUMA node.
// to do a meaningful test, we need to know:
@@ -571,12 +615,15 @@ func runTopologyManagerNodeAlignmentSuiteTests(f *framework.Framework, numaNodes
if sriovResourceAmount > 1 {
// no matter how buses are connected to NUMA nodes and SRIOV devices are installed, this function's
// preconditions must ensure the following can be fulfilled
runTopologyManagerNodeAlignmentTest(f, numaNodes, 2, "1000m", sriovResourceName, "1")
runTopologyManagerNodeAlignmentTest(f, numaNodes, 2, "2000m", sriovResourceName, "1")
runTopologyManagerPositiveTest(f, numaNodes, 2, "1000m", sriovResourceName, "1")
runTopologyManagerPositiveTest(f, numaNodes, 2, "2000m", sriovResourceName, "1")
// testing more complex conditions require knowledge about the system cpu+bus topology
}
// overflow NUMA node capacity: cores
runTopologyManagerNegativeTest(f, numaNodes, 1, fmt.Sprintf("%dm", (1+threadsPerCore)*coreCount*1000), sriovResourceName, "1")
framework.Logf("deleting the SRIOV device plugin pod %s/%s and waiting for container %s removal",
dpPod.Namespace, dpPod.Name, dpPod.Spec.Containers[0].Name)
deletePodInNamespace(f, dpPod.Namespace, dpPod.Name)
@@ -612,7 +659,7 @@ func runTopologyManagerTests(f *framework.Framework) {
ginkgo.It("run Topology Manager node alignment test suite", func() {
// this is a very rough check. We just want to rule out systems that do NOT have
// any SRIOV device. A more proper check will be done in runTopologyManagerNodeAlignmentTest
// any SRIOV device. A more proper check will be done in runTopologyManagerPositiveTest
sriovdevCount := detectSRIOVDevices()
numaNodes := detectNUMANodes()
coreCount := detectCoresPerSocket()