Merge pull request #88234 from fromanirh/topomgr-e2e-tests-multicnt

e2e topology manager: single-numa-node multi container tests
Kubernetes Prow Robot 2020-02-20 10:35:56 -08:00 committed by GitHub
commit 3ae1b0ce80
2 changed files with 367 additions and 105 deletions


@ -19,6 +19,8 @@ package e2enode
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
@ -44,8 +46,7 @@ func (R *numaPodResources) CheckAlignment() bool {
}
}
for _, devNode := range R.PCIDevsToNUMANode {
// TODO: explain -1
if devNode != -1 && nodeNum != devNode {
if nodeNum != devNode {
return false
}
}
@ -88,7 +89,7 @@ func getCPUsPerNUMANode(nodeNum int) ([]int, error) {
return cpus.ToSlice(), nil
}
func getCPUToNUMANodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map[string]string, numaNodes int) (map[int]int, error) {
func getCPUToNUMANodeMapFromEnv(f *framework.Framework, pod *v1.Pod, cnt *v1.Container, environ map[string]string, numaNodes int) (map[int]int, error) {
var cpuIDs []int
cpuListAllowedEnvVar := "CPULIST_ALLOWED"
@ -102,12 +103,12 @@ func getCPUToNUMANodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map
}
}
if len(cpuIDs) == 0 {
return nil, fmt.Errorf("variable %q found in environ", cpuListAllowedEnvVar)
return nil, fmt.Errorf("variable %q not found in environ", cpuListAllowedEnvVar)
}
cpusPerNUMA := make(map[int][]int)
for numaNode := 0; numaNode < numaNodes; numaNode++ {
nodeCPUList := f.ExecCommandInContainer(pod.Name, pod.Spec.Containers[0].Name,
nodeCPUList := f.ExecCommandInContainer(pod.Name, cnt.Name,
"/bin/cat", fmt.Sprintf("/sys/devices/system/node/node%d/cpulist", numaNode))
cpus, err := cpuset.Parse(nodeCPUList)
@ -137,7 +138,7 @@ func getCPUToNUMANodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map
return CPUMap, nil
}
func getPCIDeviceToNumaNodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map[string]string) (map[string]int, error) {
func getPCIDeviceToNumaNodeMapFromEnv(f *framework.Framework, pod *v1.Pod, cnt *v1.Container, environ map[string]string) (map[string]int, error) {
pciDevPrefix := "PCIDEVICE_"
// at this point we don't care which plugin selected the device,
// we only need to know which devices were assigned to the POD.
@ -152,19 +153,11 @@ func getPCIDeviceToNumaNodeMapFromEnv(f *framework.Framework, pod *v1.Pod, envir
// a single plugin can allocate more than a single device
pciDevs := strings.Split(value, ",")
for _, pciDev := range pciDevs {
pciDevNUMANode := f.ExecCommandInContainer(pod.Name, pod.Spec.Containers[0].Name,
pciDevNUMANode := f.ExecCommandInContainer(pod.Name, cnt.Name,
"/bin/cat", fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", pciDev))
nodeNum, err := strconv.Atoi(pciDevNUMANode)
if err != nil {
return nil, err
}
NUMAPerDev[pciDev] = nodeNum
NUMAPerDev[pciDev] = numaNodeFromSysFsEntry(pciDevNUMANode)
}
}
if len(NUMAPerDev) == 0 {
return nil, fmt.Errorf("no PCI devices found in environ")
}
return NUMAPerDev, nil
}
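For context on the loop above: the PCIDEVICE_* variables are exported into the container environment by the device plugin machinery and carry comma-separated PCI addresses. A minimal sketch of that data shape, using a made-up variable name and addresses rather than values taken from this change:
// Illustrative sketch only; the variable name and PCI addresses are hypothetical.
func exampleParsePCIDeviceEnv() map[string][]string {
	environ := map[string]string{
		"PCIDEVICE_EXAMPLE_COM_SRIOV_NET": "0000:3b:02.0,0000:3b:02.1",
	}
	devsPerResource := make(map[string][]string)
	for name, value := range environ {
		if !strings.HasPrefix(name, "PCIDEVICE_") {
			continue
		}
		// each address is then resolved to a NUMA node by reading
		// /sys/bus/pci/devices/<address>/numa_node, as the helper above does
		devsPerResource[name] = strings.Split(value, ",")
	}
	return devsPerResource
}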
@ -184,29 +177,97 @@ func makeEnvMap(logs string) (map[string]string, error) {
return envMap, nil
}
func checkNUMAAlignment(f *framework.Framework, pod *v1.Pod, logs string, numaNodes int) (numaPodResources, error) {
type testEnvInfo struct {
numaNodes int
sriovResourceName string
policy string
}
func containerWantsDevices(cnt *v1.Container, envInfo *testEnvInfo) bool {
_, found := cnt.Resources.Requests[v1.ResourceName(envInfo.sriovResourceName)]
return found
}
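containerWantsDevices gates the PCI alignment check on whether the container actually requested the SRIOV resource. A hedged usage sketch, where the resource name is only a placeholder (the real one is detected from the SRIOV device plugin at runtime):
// Sketch only; assumes "k8s.io/apimachinery/pkg/api/resource" for MustParse and a
// placeholder resource name.
envInfo := &testEnvInfo{sriovResourceName: "example.com/sriov_net"}
cnt := v1.Container{
	Resources: v1.ResourceRequirements{
		Requests: v1.ResourceList{
			v1.ResourceName(envInfo.sriovResourceName): resource.MustParse("1"),
		},
	},
}
// true: this container asked for a device, so checkNUMAAlignment will also verify
// its PCI device NUMA placement; a CPU-only container would skip that check.
_ = containerWantsDevices(&cnt, envInfo)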
func checkNUMAAlignment(f *framework.Framework, pod *v1.Pod, cnt *v1.Container, logs string, envInfo *testEnvInfo) (*numaPodResources, error) {
var err error
podEnv, err := makeEnvMap(logs)
if err != nil {
return numaPodResources{}, err
return nil, err
}
CPUToNUMANode, err := getCPUToNUMANodeMapFromEnv(f, pod, podEnv, numaNodes)
CPUToNUMANode, err := getCPUToNUMANodeMapFromEnv(f, pod, cnt, podEnv, envInfo.numaNodes)
if err != nil {
return numaPodResources{}, err
return nil, err
}
PCIDevsToNUMANode, err := getPCIDeviceToNumaNodeMapFromEnv(f, pod, podEnv)
PCIDevsToNUMANode, err := getPCIDeviceToNumaNodeMapFromEnv(f, pod, cnt, podEnv)
if err != nil {
return numaPodResources{}, err
return nil, err
}
if containerWantsDevices(cnt, envInfo) && len(PCIDevsToNUMANode) == 0 {
return nil, fmt.Errorf("no PCI devices found in environ")
}
numaRes := numaPodResources{
CPUToNUMANode: CPUToNUMANode,
PCIDevsToNUMANode: PCIDevsToNUMANode,
}
aligned := numaRes.CheckAlignment()
if !aligned {
return numaRes, fmt.Errorf("NUMA resources not aligned")
err = fmt.Errorf("NUMA resources not aligned")
}
return numaRes, nil
return &numaRes, err
}
type pciDeviceInfo struct {
Address string
NUMANode int
IsPhysFn bool
IsVFn bool
}
func getPCIDeviceInfo(sysPCIDir string) ([]pciDeviceInfo, error) {
var pciDevs []pciDeviceInfo
entries, err := ioutil.ReadDir(sysPCIDir)
if err != nil {
return nil, err
}
for _, entry := range entries {
isPhysFn := false
isVFn := false
if _, err := os.Stat(filepath.Join(sysPCIDir, entry.Name(), "sriov_numvfs")); err == nil {
isPhysFn = true
} else if !os.IsNotExist(err) {
// unexpected error. Bail out
return nil, err
}
if _, err := os.Stat(filepath.Join(sysPCIDir, entry.Name(), "physfn")); err == nil {
isVFn = true
} else if !os.IsNotExist(err) {
// unexpected error. Bail out
return nil, err
}
content, err := ioutil.ReadFile(filepath.Join(sysPCIDir, entry.Name(), "numa_node"))
if err != nil {
return nil, err
}
pciDevs = append(pciDevs, pciDeviceInfo{
Address: entry.Name(),
NUMANode: numaNodeFromSysFsEntry(string(content)),
IsPhysFn: isPhysFn,
IsVFn: isVFn,
})
}
return pciDevs, nil
}
func numaNodeFromSysFsEntry(content string) int {
nodeNum, err := strconv.Atoi(strings.TrimSpace(content))
framework.ExpectNoError(err, "error detecting the device numa_node from sysfs: %v", err)
return nodeNum
}
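numaNodeFromSysFsEntry relies on the usual sysfs contract: the numa_node attribute is a decimal number followed by a newline, and it reads "-1" when the kernel reports no NUMA affinity for the device, which is why the value is trimmed before conversion. A tiny sketch of that input shape (example content only):
// content as it would be read from /sys/bus/pci/devices/<address>/numa_node
content := "0\n"
node, _ := strconv.Atoi(strings.TrimSpace(content))
_ = node // 0 here; "-1\n" would parse to -1, meaning no NUMA affinity reported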


@ -31,10 +31,12 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/test/e2e/framework"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@ -83,7 +85,7 @@ func detectCoresPerSocket() int {
}
func detectSRIOVDevices() int {
outData, err := exec.Command("/bin/sh", "-c", "ls /sys/bus/pci/devices/*/sriov_totalvfs | wc -w").Output()
outData, err := exec.Command("/bin/sh", "-c", "ls /sys/bus/pci/devices/*/physfn | wc -w").Output()
framework.ExpectNoError(err)
devCount, err := strconv.Atoi(strings.TrimSpace(string(outData)))
@ -134,7 +136,7 @@ func makeTopologyManagerTestPod(podName, podCmd string, tmCtnAttributes []tmCtnA
}
}
func findNUMANodeWithoutSRIOVDevices(configMap *v1.ConfigMap, numaNodes int) (int, bool) {
func findNUMANodeWithoutSRIOVDevicesFromConfigMap(configMap *v1.ConfigMap, numaNodes int) (int, bool) {
for nodeNum := 0; nodeNum < numaNodes; nodeNum++ {
value, ok := configMap.Annotations[fmt.Sprintf("pcidevice_node%d", nodeNum)]
if !ok {
@ -154,6 +156,46 @@ func findNUMANodeWithoutSRIOVDevices(configMap *v1.ConfigMap, numaNodes int) (in
return -1, false
}
func findNUMANodeWithoutSRIOVDevicesFromSysfs(numaNodes int) (int, bool) {
pciDevs, err := getPCIDeviceInfo("/sys/bus/pci/devices")
if err != nil {
framework.Failf("error detecting the PCI device NUMA node: %v", err)
}
pciPerNuma := make(map[int]int)
for _, pciDev := range pciDevs {
if pciDev.IsVFn {
pciPerNuma[pciDev.NUMANode]++
}
}
if len(pciPerNuma) == 0 {
// if we got this far we already passed a rough check that SRIOV devices
// are available in the box, so something is seriously wrong
framework.Failf("failed to find any VF devices from %v", pciDevs)
}
for nodeNum := 0; nodeNum < numaNodes; nodeNum++ {
v := pciPerNuma[nodeNum]
if v == 0 {
framework.Logf("NUMA node %d has no SRIOV devices attached", nodeNum)
return nodeNum, true
}
framework.Logf("NUMA node %d has %d SRIOV devices attached", nodeNum, v)
}
return -1, false
}
func findNUMANodeWithoutSRIOVDevices(configMap *v1.ConfigMap, numaNodes int) (int, bool) {
// if someone annotated the configMap, let's use this information
if nodeNum, found := findNUMANodeWithoutSRIOVDevicesFromConfigMap(configMap, numaNodes); found {
return nodeNum, found
}
// no annotations, try to autodetect
// NOTE: this assumes all the VFs in the box can be used for the tests.
return findNUMANodeWithoutSRIOVDevicesFromSysfs(numaNodes)
}
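When the configMap is annotated, the lookup above relies on pcidevice_node<N> keys. A hedged sketch of an annotated configMap, with example counts and the interpretation (value = number of SRIOV devices attached to that NUMA node) inferred from the helper's name:
// Sketch only: the name and counts are examples; "0" marks a NUMA node without SRIOV devices.
cm := &v1.ConfigMap{
	ObjectMeta: metav1.ObjectMeta{
		Name: "sriovdp-config",
		Annotations: map[string]string{
			"pcidevice_node0": "2",
			"pcidevice_node1": "0",
		},
	},
}
// under these assumptions, nodeNum == 1 and found == true
nodeNum, found := findNUMANodeWithoutSRIOVDevices(cm, 2)
_, _ = nodeNum, found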
func configureTopologyManagerInKubelet(f *framework.Framework, oldCfg *kubeletconfig.KubeletConfiguration, policy string, configMap *v1.ConfigMap, numaNodes int) string {
// Configure Topology Manager in Kubelet with policy.
newCfg := oldCfg.DeepCopy()
@ -255,25 +297,20 @@ func findSRIOVResource(node *v1.Node) (string, int64) {
return "", 0
}
func deletePodInNamespace(f *framework.Framework, namespace, name string) {
gp := int64(0)
deleteOptions := metav1.DeleteOptions{
GracePeriodSeconds: &gp,
func validatePodAlignment(f *framework.Framework, pod *v1.Pod, envInfo *testEnvInfo) {
for _, cnt := range pod.Spec.Containers {
ginkgo.By(fmt.Sprintf("validating the container %s on Gu pod %s", cnt.Name, pod.Name))
logs, err := e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, pod.Name, cnt.Name)
framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", cnt.Name, pod.Name)
framework.Logf("got pod logs: %v", logs)
numaRes, err := checkNUMAAlignment(f, pod, &cnt, logs, envInfo)
framework.ExpectNoError(err, "NUMA Alignment check failed for [%s] of pod [%s]", cnt.Name, pod.Name)
if numaRes != nil {
framework.Logf("NUMA resources for %s/%s: %s", pod.Name, cnt.Name, numaRes.String())
}
}
err := f.ClientSet.CoreV1().Pods(namespace).Delete(context.TODO(), name, &deleteOptions)
framework.ExpectNoError(err)
}
func validatePodAlignment(f *framework.Framework, pod *v1.Pod, numaNodes int) {
ginkgo.By("validating the Gu pod")
logs, err := e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, pod.Name, pod.Spec.Containers[0].Name)
framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
pod.Spec.Containers[0].Name, pod.Name)
framework.Logf("got pod logs: %v", logs)
numaRes, err := checkNUMAAlignment(f, pod, logs, numaNodes)
framework.ExpectNoError(err, "NUMA Alignment check failed for [%s] of pod [%s]: %s",
pod.Spec.Containers[0].Name, pod.Name, numaRes.String())
}
func runTopologyManagerPolicySuiteTests(f *framework.Framework) {
@ -502,21 +539,27 @@ func runTopologyManagerPolicySuiteTests(f *framework.Framework) {
waitForContainerRemoval(pod2.Spec.Containers[0].Name, pod2.Name, pod2.Namespace)
}
func runTopologyManagerPositiveTest(f *framework.Framework, numaNodes, numPods int, cpuAmount, sriovResourceName, deviceAmount string) {
func waitForAllContainerRemoval(podName, podNS string) {
rs, _, err := getCRIClient()
framework.ExpectNoError(err)
gomega.Eventually(func() bool {
containers, err := rs.ListContainers(&runtimeapi.ContainerFilter{
LabelSelector: map[string]string{
types.KubernetesPodNameLabel: podName,
types.KubernetesPodNamespaceLabel: podNS,
},
})
if err != nil {
return false
}
return len(containers) == 0
}, 2*time.Minute, 1*time.Second).Should(gomega.BeTrue())
}
func runTopologyManagerPositiveTest(f *framework.Framework, numPods int, ctnAttrs []tmCtnAttribute, envInfo *testEnvInfo) {
var pods []*v1.Pod
for podID := 0; podID < numPods; podID++ {
ctnAttrs := []tmCtnAttribute{
{
ctnName: "gu-container",
cpuRequest: cpuAmount,
cpuLimit: cpuAmount,
deviceName: sriovResourceName,
deviceRequest: deviceAmount,
deviceLimit: deviceAmount,
},
}
podName := fmt.Sprintf("gu-pod-%d", podID)
framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
pod := makeTopologyManagerTestPod(podName, numalignCmd, ctnAttrs)
@ -525,31 +568,24 @@ func runTopologyManagerPositiveTest(f *framework.Framework, numaNodes, numPods i
pods = append(pods, pod)
}
for podID := 0; podID < numPods; podID++ {
validatePodAlignment(f, pods[podID], numaNodes)
// per https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/0035-20190130-topology-manager.md#multi-numa-systems-tests
// we can do a meaningful validation only when using the single-numa-node policy
if envInfo.policy == topologymanager.PolicySingleNumaNode {
for podID := 0; podID < numPods; podID++ {
validatePodAlignment(f, pods[podID], envInfo)
}
}
for podID := 0; podID < numPods; podID++ {
pod := pods[podID]
framework.Logf("deleting the pod %s/%s and waiting for container %s removal",
pod.Namespace, pod.Name, pod.Spec.Containers[0].Name)
framework.Logf("deleting the pod %s/%s and waiting for container removal",
pod.Namespace, pod.Name)
deletePods(f, []string{pod.Name})
waitForContainerRemoval(pod.Spec.Containers[0].Name, pod.Name, pod.Namespace)
waitForAllContainerRemoval(pod.Name, pod.Namespace)
}
}
func runTopologyManagerNegativeTest(f *framework.Framework, numaNodes, numPods int, cpuAmount, sriovResourceName, deviceAmount string) {
ctnAttrs := []tmCtnAttribute{
{
ctnName: "gu-container",
cpuRequest: cpuAmount,
cpuLimit: cpuAmount,
deviceName: sriovResourceName,
deviceRequest: deviceAmount,
deviceLimit: deviceAmount,
},
}
func runTopologyManagerNegativeTest(f *framework.Framework, numPods int, ctnAttrs []tmCtnAttribute, envInfo *testEnvInfo) {
podName := "gu-pod"
framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
pod := makeTopologyManagerTestPod(podName, numalignCmd, ctnAttrs)
@ -598,7 +634,16 @@ func getSRIOVDevicePluginConfigMap(cmFile string) *v1.ConfigMap {
return readConfigMapV1OrDie(cmData)
}
func setupSRIOVConfigOrFail(f *framework.Framework, configMap *v1.ConfigMap) (*v1.Pod, string, int64) {
type sriovData struct {
configMap *v1.ConfigMap
serviceAccount *v1.ServiceAccount
pod *v1.Pod
resourceName string
resourceAmount int64
}
func setupSRIOVConfigOrFail(f *framework.Framework, configMap *v1.ConfigMap) *sriovData {
var err error
ginkgo.By(fmt.Sprintf("Creating configMap %v/%v", metav1.NamespaceSystem, configMap.Name))
@ -632,59 +677,212 @@ func setupSRIOVConfigOrFail(f *framework.Framework, configMap *v1.ConfigMap) (*v
}, 2*time.Minute, framework.Poll).Should(gomega.BeTrue())
framework.Logf("Successfully created device plugin pod, detected %d SRIOV device %q", sriovResourceAmount, sriovResourceName)
return dpPod, sriovResourceName, sriovResourceAmount
return &sriovData{
configMap: configMap,
serviceAccount: serviceAccount,
pod: dpPod,
resourceName: sriovResourceName,
resourceAmount: sriovResourceAmount,
}
}
func teardownSRIOVConfigOrFail(f *framework.Framework, dpPod *v1.Pod) {
framework.Logf("deleting the SRIOV device plugin pod %s/%s and waiting for container %s removal",
dpPod.Namespace, dpPod.Name, dpPod.Spec.Containers[0].Name)
deletePodInNamespace(f, dpPod.Namespace, dpPod.Name)
waitForContainerRemoval(dpPod.Spec.Containers[0].Name, dpPod.Name, dpPod.Namespace)
func teardownSRIOVConfigOrFail(f *framework.Framework, sd *sriovData) {
var err error
gp := int64(0)
deleteOptions := metav1.DeleteOptions{
GracePeriodSeconds: &gp,
}
ginkgo.By("Delete SRIOV device plugin pod %s/%s")
err = f.ClientSet.CoreV1().Pods(sd.pod.Namespace).Delete(context.TODO(), sd.pod.Name, &deleteOptions)
framework.ExpectNoError(err)
waitForContainerRemoval(sd.pod.Spec.Containers[0].Name, sd.pod.Name, sd.pod.Namespace)
ginkgo.By(fmt.Sprintf("Deleting configMap %v/%v", metav1.NamespaceSystem, sd.configMap.Name))
err = f.ClientSet.CoreV1().ConfigMaps(metav1.NamespaceSystem).Delete(context.TODO(), sd.configMap.Name, &deleteOptions)
framework.ExpectNoError(err)
ginkgo.By(fmt.Sprintf("Deleting serviceAccount %v/%v", metav1.NamespaceSystem, sd.serviceAccount.Name))
err = f.ClientSet.CoreV1().ServiceAccounts(metav1.NamespaceSystem).Delete(context.TODO(), sd.serviceAccount.Name, &deleteOptions)
framework.ExpectNoError(err)
}
func runTopologyManagerNodeAlignmentSuiteTests(f *framework.Framework, configMap *v1.ConfigMap, reservedSystemCPUs string, numaNodes, coreCount int) {
func runTopologyManagerNodeAlignmentSuiteTests(f *framework.Framework, configMap *v1.ConfigMap, reservedSystemCPUs string, numaNodes, coreCount int, policy string) {
threadsPerCore := 1
if isHTEnabled() {
threadsPerCore = 2
}
dpPod, sriovResourceName, sriovResourceAmount := setupSRIOVConfigOrFail(f, configMap)
sd := setupSRIOVConfigOrFail(f, configMap)
envInfo := &testEnvInfo{
numaNodes: numaNodes,
sriovResourceName: sd.resourceName,
policy: policy,
}
// this could have been a loop, but we unroll it to make the test cases explicit
var ctnAttrs []tmCtnAttribute
// simplest case
ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 1 core, 1 %s device", sriovResourceName))
runTopologyManagerPositiveTest(f, numaNodes, 1, "1000m", sriovResourceName, "1")
ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 1 core, 1 %s device", sd.resourceName))
ctnAttrs = []tmCtnAttribute{
{
ctnName: "gu-container",
cpuRequest: "1000m",
cpuLimit: "1000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
},
}
runTopologyManagerPositiveTest(f, 1, ctnAttrs, envInfo)
ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 2 cores, 1 %s device", sriovResourceName))
runTopologyManagerPositiveTest(f, numaNodes, 1, "2000m", sriovResourceName, "1")
ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 2 cores, 1 %s device", sd.resourceName))
ctnAttrs = []tmCtnAttribute{
{
ctnName: "gu-container",
cpuRequest: "2000m",
cpuLimit: "2000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
},
}
runTopologyManagerPositiveTest(f, 1, ctnAttrs, envInfo)
if reservedSystemCPUs != "" {
// to avoid false negatives, we set the reserved CPUs in such a way that there is at least one NUMA node
// with 1+ SRIOV devices and no reserved CPUs.
numCores := threadsPerCore * coreCount
ginkgo.By(fmt.Sprintf("Successfully admit an entire socket (%d cores), 1 %s device", numCores, sriovResourceName))
runTopologyManagerPositiveTest(f, numaNodes, 1, fmt.Sprintf("%dm", numCores*1000), sriovResourceName, "1")
allCoresReq := fmt.Sprintf("%dm", numCores*1000)
ginkgo.By(fmt.Sprintf("Successfully admit an entire socket (%d cores), 1 %s device", numCores, sd.resourceName))
ctnAttrs = []tmCtnAttribute{
{
ctnName: "gu-container",
cpuRequest: allCoresReq,
cpuLimit: allCoresReq,
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
},
}
runTopologyManagerPositiveTest(f, 1, ctnAttrs, envInfo)
}
if sriovResourceAmount > 1 {
if sd.resourceAmount > 1 {
// no matter how the buses are connected to NUMA nodes and how the SRIOV devices are installed, this
// function's preconditions must ensure the following can be fulfilled
ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 1 core, 1 %s device", sriovResourceName))
runTopologyManagerPositiveTest(f, numaNodes, 2, "1000m", sriovResourceName, "1")
ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 1 core, 1 %s device", sd.resourceName))
ctnAttrs = []tmCtnAttribute{
{
ctnName: "gu-container",
cpuRequest: "1000m",
cpuLimit: "1000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
},
}
runTopologyManagerPositiveTest(f, 2, ctnAttrs, envInfo)
ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 2 cores, 1 %s device", sriovResourceName))
runTopologyManagerPositiveTest(f, numaNodes, 2, "2000m", sriovResourceName, "1")
ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 2 cores, 1 %s device", sd.resourceName))
ctnAttrs = []tmCtnAttribute{
{
ctnName: "gu-container",
cpuRequest: "2000m",
cpuLimit: "2000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
},
}
runTopologyManagerPositiveTest(f, 2, ctnAttrs, envInfo)
// testing more complex conditions requires knowledge of the system CPU and bus topology
}
// overflow NUMA node capacity: cores
numCores := 1 + (threadsPerCore * coreCount)
ginkgo.By(fmt.Sprintf("Trying to admit a guaranteed pods, with %d cores, 1 %s device - and it should be rejected", numCores, sriovResourceName))
runTopologyManagerNegativeTest(f, numaNodes, 1, fmt.Sprintf("%dm", numCores*1000), sriovResourceName, "1")
// multi-container tests
if sd.resourceAmount >= 4 {
ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pods, each with two containers, each with 2 cores, 1 %s device", sd.resourceName))
ctnAttrs = []tmCtnAttribute{
{
ctnName: "gu-container-0",
cpuRequest: "2000m",
cpuLimit: "2000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
},
{
ctnName: "gu-container-1",
cpuRequest: "2000m",
cpuLimit: "2000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
},
}
runTopologyManagerPositiveTest(f, 1, ctnAttrs, envInfo)
teardownSRIOVConfigOrFail(f, dpPod)
ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with two containers, each with 1 core, 1 %s device", sd.resourceName))
ctnAttrs = []tmCtnAttribute{
{
ctnName: "gu-container-0",
cpuRequest: "1000m",
cpuLimit: "1000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
},
{
ctnName: "gu-container-1",
cpuRequest: "1000m",
cpuLimit: "1000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
},
}
runTopologyManagerPositiveTest(f, 2, ctnAttrs, envInfo)
ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with two containers, both with with 2 cores, one with 1 %s device", sd.resourceName))
ctnAttrs = []tmCtnAttribute{
{
ctnName: "gu-container-dev",
cpuRequest: "2000m",
cpuLimit: "2000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
},
{
ctnName: "gu-container-nodev",
cpuRequest: "2000m",
cpuLimit: "2000m",
},
}
runTopologyManagerPositiveTest(f, 2, ctnAttrs, envInfo)
}
// this is the only policy that can guarantee reliable rejects
if policy == topologymanager.PolicySingleNumaNode {
// overflow NUMA node capacity: cores
numCores := 1 + (threadsPerCore * coreCount)
excessCoresReq := fmt.Sprintf("%dm", numCores*1000)
ginkgo.By(fmt.Sprintf("Trying to admit a guaranteed pods, with %d cores, 1 %s device - and it should be rejected", numCores, sd.resourceName))
ctnAttrs = []tmCtnAttribute{
{
ctnName: "gu-container",
cpuRequest: excessCoresReq,
cpuLimit: excessCoresReq,
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
},
}
runTopologyManagerNegativeTest(f, 1, ctnAttrs, envInfo)
}
teardownSRIOVConfigOrFail(f, sd)
}
func runTopologyManagerTests(f *framework.Framework) {
@ -728,7 +926,7 @@ func runTopologyManagerTests(f *framework.Framework) {
e2eskipper.Skipf("this test is meant to run on a system with at least 4 cores per socket")
}
if sriovdevCount == 0 {
e2eskipper.Skipf("this test is meant to run on a system with at least one SRIOV device")
e2eskipper.Skipf("this test is meant to run on a system with at least one configured VF from SRIOV device")
}
configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)
@ -736,15 +934,18 @@ func runTopologyManagerTests(f *framework.Framework) {
oldCfg, err = getCurrentKubeletConfig()
framework.ExpectNoError(err)
policy := topologymanager.PolicySingleNumaNode
var policies = []string{topologymanager.PolicySingleNumaNode, topologymanager.PolicyRestricted,
topologymanager.PolicyBestEffort, topologymanager.PolicyNone}
// Configure Topology Manager
ginkgo.By(fmt.Sprintf("by configuring Topology Manager policy to %s", policy))
framework.Logf("Configuring topology Manager policy to %s", policy)
for _, policy := range policies {
// Configure Topology Manager
ginkgo.By(fmt.Sprintf("by configuring Topology Manager policy to %s", policy))
framework.Logf("Configuring topology Manager policy to %s", policy)
reservedSystemCPUs := configureTopologyManagerInKubelet(f, oldCfg, policy, configMap, numaNodes)
reservedSystemCPUs := configureTopologyManagerInKubelet(f, oldCfg, policy, configMap, numaNodes)
runTopologyManagerNodeAlignmentSuiteTests(f, configMap, reservedSystemCPUs, numaNodes, coreCount)
runTopologyManagerNodeAlignmentSuiteTests(f, configMap, reservedSystemCPUs, numaNodes, coreCount, policy)
}
// restore kubelet config
setOldKubeletConfig(f, oldCfg)