Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-07-24 12:15:52 +00:00)
Merge pull request #88234 from fromanirh/topomgr-e2e-tests-multicnt
e2e topology manager: single-numa-node multi container tests
Commit 3ae1b0ce80
@@ -19,6 +19,8 @@ package e2enode
 import (
     "fmt"
     "io/ioutil"
+    "os"
+    "path/filepath"
     "sort"
     "strconv"
     "strings"
@@ -44,8 +46,7 @@ func (R *numaPodResources) CheckAlignment() bool {
         }
     }
     for _, devNode := range R.PCIDevsToNUMANode {
-        // TODO: explain -1
-        if devNode != -1 && nodeNum != devNode {
+        if nodeNum != devNode {
             return false
         }
     }
@@ -88,7 +89,7 @@ func getCPUsPerNUMANode(nodeNum int) ([]int, error) {
     return cpus.ToSlice(), nil
 }
 
-func getCPUToNUMANodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map[string]string, numaNodes int) (map[int]int, error) {
+func getCPUToNUMANodeMapFromEnv(f *framework.Framework, pod *v1.Pod, cnt *v1.Container, environ map[string]string, numaNodes int) (map[int]int, error) {
     var cpuIDs []int
     cpuListAllowedEnvVar := "CPULIST_ALLOWED"
 
@@ -102,12 +103,12 @@ func getCPUToNUMANodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map
         }
     }
     if len(cpuIDs) == 0 {
-        return nil, fmt.Errorf("variable %q found in environ", cpuListAllowedEnvVar)
+        return nil, fmt.Errorf("variable %q not found in environ", cpuListAllowedEnvVar)
     }
 
     cpusPerNUMA := make(map[int][]int)
     for numaNode := 0; numaNode < numaNodes; numaNode++ {
-        nodeCPUList := f.ExecCommandInContainer(pod.Name, pod.Spec.Containers[0].Name,
+        nodeCPUList := f.ExecCommandInContainer(pod.Name, cnt.Name,
             "/bin/cat", fmt.Sprintf("/sys/devices/system/node/node%d/cpulist", numaNode))
 
         cpus, err := cpuset.Parse(nodeCPUList)
@@ -137,7 +138,7 @@ func getCPUToNUMANodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map
     return CPUMap, nil
 }
 
-func getPCIDeviceToNumaNodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map[string]string) (map[string]int, error) {
+func getPCIDeviceToNumaNodeMapFromEnv(f *framework.Framework, pod *v1.Pod, cnt *v1.Container, environ map[string]string) (map[string]int, error) {
     pciDevPrefix := "PCIDEVICE_"
     // at this point we don't care which plugin selected the device,
     // we only need to know which devices were assigned to the POD.
@@ -152,19 +153,11 @@ func getPCIDeviceToNumaNodeMapFromEnv(f *framework.Framework, pod *v1.Pod, envir
         // a single plugin can allocate more than a single device
         pciDevs := strings.Split(value, ",")
         for _, pciDev := range pciDevs {
-            pciDevNUMANode := f.ExecCommandInContainer(pod.Name, pod.Spec.Containers[0].Name,
+            pciDevNUMANode := f.ExecCommandInContainer(pod.Name, cnt.Name,
                 "/bin/cat", fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", pciDev))
-
-            nodeNum, err := strconv.Atoi(pciDevNUMANode)
-            if err != nil {
-                return nil, err
-            }
-            NUMAPerDev[pciDev] = nodeNum
+            NUMAPerDev[pciDev] = numaNodeFromSysFsEntry(pciDevNUMANode)
         }
     }
-    if len(NUMAPerDev) == 0 {
-        return nil, fmt.Errorf("no PCI devices found in environ")
-    }
     return NUMAPerDev, nil
 }
 
@@ -184,29 +177,97 @@ func makeEnvMap(logs string) (map[string]string, error) {
     return envMap, nil
 }
 
-func checkNUMAAlignment(f *framework.Framework, pod *v1.Pod, logs string, numaNodes int) (numaPodResources, error) {
+type testEnvInfo struct {
+    numaNodes         int
+    sriovResourceName string
+    policy            string
+}
+
+func containerWantsDevices(cnt *v1.Container, envInfo *testEnvInfo) bool {
+    _, found := cnt.Resources.Requests[v1.ResourceName(envInfo.sriovResourceName)]
+    return found
+}
+
+func checkNUMAAlignment(f *framework.Framework, pod *v1.Pod, cnt *v1.Container, logs string, envInfo *testEnvInfo) (*numaPodResources, error) {
+    var err error
     podEnv, err := makeEnvMap(logs)
     if err != nil {
-        return numaPodResources{}, err
+        return nil, err
     }
 
-    CPUToNUMANode, err := getCPUToNUMANodeMapFromEnv(f, pod, podEnv, numaNodes)
+    CPUToNUMANode, err := getCPUToNUMANodeMapFromEnv(f, pod, cnt, podEnv, envInfo.numaNodes)
     if err != nil {
-        return numaPodResources{}, err
+        return nil, err
     }
 
-    PCIDevsToNUMANode, err := getPCIDeviceToNumaNodeMapFromEnv(f, pod, podEnv)
+    PCIDevsToNUMANode, err := getPCIDeviceToNumaNodeMapFromEnv(f, pod, cnt, podEnv)
     if err != nil {
-        return numaPodResources{}, err
+        return nil, err
     }
 
+    if containerWantsDevices(cnt, envInfo) && len(PCIDevsToNUMANode) == 0 {
+        return nil, fmt.Errorf("no PCI devices found in environ")
+    }
     numaRes := numaPodResources{
         CPUToNUMANode:     CPUToNUMANode,
         PCIDevsToNUMANode: PCIDevsToNUMANode,
     }
     aligned := numaRes.CheckAlignment()
     if !aligned {
-        return numaRes, fmt.Errorf("NUMA resources not aligned")
+        err = fmt.Errorf("NUMA resources not aligned")
     }
-    return numaRes, nil
+    return &numaRes, err
 }
+
+type pciDeviceInfo struct {
+    Address  string
+    NUMANode int
+    IsPhysFn bool
+    IsVFn    bool
+}
+
+func getPCIDeviceInfo(sysPCIDir string) ([]pciDeviceInfo, error) {
+    var pciDevs []pciDeviceInfo
+
+    entries, err := ioutil.ReadDir(sysPCIDir)
+    if err != nil {
+        return nil, err
+    }
+
+    for _, entry := range entries {
+        isPhysFn := false
+        isVFn := false
+        if _, err := os.Stat(filepath.Join(sysPCIDir, entry.Name(), "sriov_numvfs")); err == nil {
+            isPhysFn = true
+        } else if !os.IsNotExist(err) {
+            // unexpected error. Bail out
+            return nil, err
+        }
+        if _, err := os.Stat(filepath.Join(sysPCIDir, entry.Name(), "physfn")); err == nil {
+            isVFn = true
+        } else if !os.IsNotExist(err) {
+            // unexpected error. Bail out
+            return nil, err
+        }
+
+        content, err := ioutil.ReadFile(filepath.Join(sysPCIDir, entry.Name(), "numa_node"))
+        if err != nil {
+            return nil, err
+        }
+
+        pciDevs = append(pciDevs, pciDeviceInfo{
+            Address:  entry.Name(),
+            NUMANode: numaNodeFromSysFsEntry(string(content)),
+            IsPhysFn: isPhysFn,
+            IsVFn:    isVFn,
+        })
+    }
+
+    return pciDevs, nil
+}
+
+func numaNodeFromSysFsEntry(content string) int {
+    nodeNum, err := strconv.Atoi(strings.TrimSpace(content))
+    framework.ExpectNoError(err, "error detecting the device numa_node from sysfs: %v", err)
+    return nodeNum
+}
@@ -31,10 +31,12 @@ import (
 
     "k8s.io/apimachinery/pkg/api/resource"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
     kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
     "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
     "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
     "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
+    "k8s.io/kubernetes/pkg/kubelet/types"
     "k8s.io/kubernetes/test/e2e/framework"
     e2enode "k8s.io/kubernetes/test/e2e/framework/node"
     e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -83,7 +85,7 @@ func detectCoresPerSocket() int {
 }
 
 func detectSRIOVDevices() int {
-    outData, err := exec.Command("/bin/sh", "-c", "ls /sys/bus/pci/devices/*/sriov_totalvfs | wc -w").Output()
+    outData, err := exec.Command("/bin/sh", "-c", "ls /sys/bus/pci/devices/*/physfn | wc -w").Output()
     framework.ExpectNoError(err)
 
     devCount, err := strconv.Atoi(strings.TrimSpace(string(outData)))
@@ -134,7 +136,7 @@ func makeTopologyManagerTestPod(podName, podCmd string, tmCtnAttributes []tmCtnA
     }
 }
 
-func findNUMANodeWithoutSRIOVDevices(configMap *v1.ConfigMap, numaNodes int) (int, bool) {
+func findNUMANodeWithoutSRIOVDevicesFromConfigMap(configMap *v1.ConfigMap, numaNodes int) (int, bool) {
     for nodeNum := 0; nodeNum < numaNodes; nodeNum++ {
         value, ok := configMap.Annotations[fmt.Sprintf("pcidevice_node%d", nodeNum)]
         if !ok {
@@ -154,6 +156,46 @@ func findNUMANodeWithoutSRIOVDevices(configMap *v1.ConfigMap, numaNodes int) (in
     return -1, false
 }
 
+func findNUMANodeWithoutSRIOVDevicesFromSysfs(numaNodes int) (int, bool) {
+    pciDevs, err := getPCIDeviceInfo("/sys/bus/pci/devices")
+    if err != nil {
+        framework.Failf("error detecting the PCI device NUMA node: %v", err)
+    }
+
+    pciPerNuma := make(map[int]int)
+    for _, pciDev := range pciDevs {
+        if pciDev.IsVFn {
+            pciPerNuma[pciDev.NUMANode]++
+        }
+    }
+
+    if len(pciPerNuma) == 0 {
+        // if we got this far we already passed a rough check that SRIOV devices
+        // are available in the box, so something is seriously wrong
+        framework.Failf("failed to find any VF devices from %v", pciDevs)
+    }
+
+    for nodeNum := 0; nodeNum < numaNodes; nodeNum++ {
+        v := pciPerNuma[nodeNum]
+        if v == 0 {
+            framework.Logf("NUMA node %d has no SRIOV devices attached", nodeNum)
+            return nodeNum, true
+        }
+        framework.Logf("NUMA node %d has %d SRIOV devices attached", nodeNum, v)
+    }
+    return -1, false
+}
+
+func findNUMANodeWithoutSRIOVDevices(configMap *v1.ConfigMap, numaNodes int) (int, bool) {
+    // if someone annotated the configMap, let's use this information
+    if nodeNum, found := findNUMANodeWithoutSRIOVDevicesFromConfigMap(configMap, numaNodes); found {
+        return nodeNum, found
+    }
+    // no annotations, try to autodetect
+    // NOTE: this assumes all the VFs in the box can be used for the tests.
+    return findNUMANodeWithoutSRIOVDevicesFromSysfs(numaNodes)
+}
+
 func configureTopologyManagerInKubelet(f *framework.Framework, oldCfg *kubeletconfig.KubeletConfiguration, policy string, configMap *v1.ConfigMap, numaNodes int) string {
     // Configure Topology Manager in Kubelet with policy.
     newCfg := oldCfg.DeepCopy()
@@ -255,25 +297,20 @@ func findSRIOVResource(node *v1.Node) (string, int64) {
     return "", 0
 }
 
-func deletePodInNamespace(f *framework.Framework, namespace, name string) {
-    gp := int64(0)
-    deleteOptions := metav1.DeleteOptions{
-        GracePeriodSeconds: &gp,
+func validatePodAlignment(f *framework.Framework, pod *v1.Pod, envInfo *testEnvInfo) {
+    for _, cnt := range pod.Spec.Containers {
+        ginkgo.By(fmt.Sprintf("validating the container %s on Gu pod %s", cnt.Name, pod.Name))
+
+        logs, err := e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, pod.Name, cnt.Name)
+        framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", cnt.Name, pod.Name)
+
+        framework.Logf("got pod logs: %v", logs)
+        numaRes, err := checkNUMAAlignment(f, pod, &cnt, logs, envInfo)
+        framework.ExpectNoError(err, "NUMA Alignment check failed for [%s] of pod [%s]", cnt.Name, pod.Name)
+        if numaRes != nil {
+            framework.Logf("NUMA resources for %s/%s: %s", pod.Name, cnt.Name, numaRes.String())
+        }
     }
-    err := f.ClientSet.CoreV1().Pods(namespace).Delete(context.TODO(), name, &deleteOptions)
-    framework.ExpectNoError(err)
-}
-
-func validatePodAlignment(f *framework.Framework, pod *v1.Pod, numaNodes int) {
-    ginkgo.By("validating the Gu pod")
-    logs, err := e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, pod.Name, pod.Spec.Containers[0].Name)
-    framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
-        pod.Spec.Containers[0].Name, pod.Name)
-
-    framework.Logf("got pod logs: %v", logs)
-    numaRes, err := checkNUMAAlignment(f, pod, logs, numaNodes)
-    framework.ExpectNoError(err, "NUMA Alignment check failed for [%s] of pod [%s]: %s",
-        pod.Spec.Containers[0].Name, pod.Name, numaRes.String())
 }
 
 func runTopologyManagerPolicySuiteTests(f *framework.Framework) {
@@ -502,21 +539,27 @@ func runTopologyManagerPolicySuiteTests(f *framework.Framework) {
     waitForContainerRemoval(pod2.Spec.Containers[0].Name, pod2.Name, pod2.Namespace)
 }
 
-func runTopologyManagerPositiveTest(f *framework.Framework, numaNodes, numPods int, cpuAmount, sriovResourceName, deviceAmount string) {
+func waitForAllContainerRemoval(podName, podNS string) {
+    rs, _, err := getCRIClient()
+    framework.ExpectNoError(err)
+    gomega.Eventually(func() bool {
+        containers, err := rs.ListContainers(&runtimeapi.ContainerFilter{
+            LabelSelector: map[string]string{
+                types.KubernetesPodNameLabel:      podName,
+                types.KubernetesPodNamespaceLabel: podNS,
+            },
+        })
+        if err != nil {
+            return false
+        }
+        return len(containers) == 0
+    }, 2*time.Minute, 1*time.Second).Should(gomega.BeTrue())
+}
+
+func runTopologyManagerPositiveTest(f *framework.Framework, numPods int, ctnAttrs []tmCtnAttribute, envInfo *testEnvInfo) {
     var pods []*v1.Pod
 
     for podID := 0; podID < numPods; podID++ {
-        ctnAttrs := []tmCtnAttribute{
-            {
-                ctnName:       "gu-container",
-                cpuRequest:    cpuAmount,
-                cpuLimit:      cpuAmount,
-                deviceName:    sriovResourceName,
-                deviceRequest: deviceAmount,
-                deviceLimit:   deviceAmount,
-            },
-        }
-
         podName := fmt.Sprintf("gu-pod-%d", podID)
         framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
         pod := makeTopologyManagerTestPod(podName, numalignCmd, ctnAttrs)
@@ -525,31 +568,24 @@ func runTopologyManagerPositiveTest(f *framework.Framework, numaNodes, numPods i
         pods = append(pods, pod)
     }
 
-    for podID := 0; podID < numPods; podID++ {
-        validatePodAlignment(f, pods[podID], numaNodes)
+    // per https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/0035-20190130-topology-manager.md#multi-numa-systems-tests
+    // we can do a menaingful validation only when using the single-numa node policy
+    if envInfo.policy == topologymanager.PolicySingleNumaNode {
+        for podID := 0; podID < numPods; podID++ {
+            validatePodAlignment(f, pods[podID], envInfo)
+        }
     }
 
     for podID := 0; podID < numPods; podID++ {
         pod := pods[podID]
-        framework.Logf("deleting the pod %s/%s and waiting for container %s removal",
-            pod.Namespace, pod.Name, pod.Spec.Containers[0].Name)
+        framework.Logf("deleting the pod %s/%s and waiting for container removal",
+            pod.Namespace, pod.Name)
         deletePods(f, []string{pod.Name})
-        waitForContainerRemoval(pod.Spec.Containers[0].Name, pod.Name, pod.Namespace)
+        waitForAllContainerRemoval(pod.Name, pod.Namespace)
     }
 }
 
-func runTopologyManagerNegativeTest(f *framework.Framework, numaNodes, numPods int, cpuAmount, sriovResourceName, deviceAmount string) {
-    ctnAttrs := []tmCtnAttribute{
-        {
-            ctnName:       "gu-container",
-            cpuRequest:    cpuAmount,
-            cpuLimit:      cpuAmount,
-            deviceName:    sriovResourceName,
-            deviceRequest: deviceAmount,
-            deviceLimit:   deviceAmount,
-        },
-    }
-
+func runTopologyManagerNegativeTest(f *framework.Framework, numPods int, ctnAttrs []tmCtnAttribute, envInfo *testEnvInfo) {
     podName := "gu-pod"
     framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
     pod := makeTopologyManagerTestPod(podName, numalignCmd, ctnAttrs)
@@ -598,7 +634,16 @@ func getSRIOVDevicePluginConfigMap(cmFile string) *v1.ConfigMap {
     return readConfigMapV1OrDie(cmData)
 }
 
-func setupSRIOVConfigOrFail(f *framework.Framework, configMap *v1.ConfigMap) (*v1.Pod, string, int64) {
+type sriovData struct {
+    configMap      *v1.ConfigMap
+    serviceAccount *v1.ServiceAccount
+    pod            *v1.Pod
+
+    resourceName   string
+    resourceAmount int64
+}
+
+func setupSRIOVConfigOrFail(f *framework.Framework, configMap *v1.ConfigMap) *sriovData {
     var err error
 
     ginkgo.By(fmt.Sprintf("Creating configMap %v/%v", metav1.NamespaceSystem, configMap.Name))
@@ -632,59 +677,212 @@ func setupSRIOVConfigOrFail(f *framework.Framework, configMap *v1.ConfigMap) (*v
     }, 2*time.Minute, framework.Poll).Should(gomega.BeTrue())
     framework.Logf("Successfully created device plugin pod, detected %d SRIOV device %q", sriovResourceAmount, sriovResourceName)
 
-    return dpPod, sriovResourceName, sriovResourceAmount
+    return &sriovData{
+        configMap:      configMap,
+        serviceAccount: serviceAccount,
+        pod:            dpPod,
+        resourceName:   sriovResourceName,
+        resourceAmount: sriovResourceAmount,
+    }
 }
 
-func teardownSRIOVConfigOrFail(f *framework.Framework, dpPod *v1.Pod) {
-    framework.Logf("deleting the SRIOV device plugin pod %s/%s and waiting for container %s removal",
-        dpPod.Namespace, dpPod.Name, dpPod.Spec.Containers[0].Name)
-    deletePodInNamespace(f, dpPod.Namespace, dpPod.Name)
-    waitForContainerRemoval(dpPod.Spec.Containers[0].Name, dpPod.Name, dpPod.Namespace)
+func teardownSRIOVConfigOrFail(f *framework.Framework, sd *sriovData) {
+    var err error
+    gp := int64(0)
+    deleteOptions := metav1.DeleteOptions{
+        GracePeriodSeconds: &gp,
+    }
+
+    ginkgo.By("Delete SRIOV device plugin pod %s/%s")
+    err = f.ClientSet.CoreV1().Pods(sd.pod.Namespace).Delete(context.TODO(), sd.pod.Name, &deleteOptions)
+    framework.ExpectNoError(err)
+    waitForContainerRemoval(sd.pod.Spec.Containers[0].Name, sd.pod.Name, sd.pod.Namespace)
+
+    ginkgo.By(fmt.Sprintf("Deleting configMap %v/%v", metav1.NamespaceSystem, sd.configMap.Name))
+    err = f.ClientSet.CoreV1().ConfigMaps(metav1.NamespaceSystem).Delete(context.TODO(), sd.configMap.Name, &deleteOptions)
+    framework.ExpectNoError(err)
+
+    ginkgo.By(fmt.Sprintf("Deleting serviceAccount %v/%v", metav1.NamespaceSystem, sd.serviceAccount.Name))
+    err = f.ClientSet.CoreV1().ServiceAccounts(metav1.NamespaceSystem).Delete(context.TODO(), sd.serviceAccount.Name, &deleteOptions)
+    framework.ExpectNoError(err)
 }
 
-func runTopologyManagerNodeAlignmentSuiteTests(f *framework.Framework, configMap *v1.ConfigMap, reservedSystemCPUs string, numaNodes, coreCount int) {
+func runTopologyManagerNodeAlignmentSuiteTests(f *framework.Framework, configMap *v1.ConfigMap, reservedSystemCPUs string, numaNodes, coreCount int, policy string) {
     threadsPerCore := 1
     if isHTEnabled() {
         threadsPerCore = 2
     }
 
-    dpPod, sriovResourceName, sriovResourceAmount := setupSRIOVConfigOrFail(f, configMap)
+    sd := setupSRIOVConfigOrFail(f, configMap)
+    envInfo := &testEnvInfo{
+        numaNodes:         numaNodes,
+        sriovResourceName: sd.resourceName,
+        policy:            policy,
+    }
 
     // could have been a loop, we unroll it to explain the testcases
+    var ctnAttrs []tmCtnAttribute
+
     // simplest case
-    ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 1 core, 1 %s device", sriovResourceName))
-    runTopologyManagerPositiveTest(f, numaNodes, 1, "1000m", sriovResourceName, "1")
+    ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 1 core, 1 %s device", sd.resourceName))
+    ctnAttrs = []tmCtnAttribute{
+        {
+            ctnName:       "gu-container",
+            cpuRequest:    "1000m",
+            cpuLimit:      "1000m",
+            deviceName:    sd.resourceName,
+            deviceRequest: "1",
+            deviceLimit:   "1",
+        },
+    }
+    runTopologyManagerPositiveTest(f, 1, ctnAttrs, envInfo)
 
-    ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 2 cores, 1 %s device", sriovResourceName))
-    runTopologyManagerPositiveTest(f, numaNodes, 1, "2000m", sriovResourceName, "1")
+    ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 2 cores, 1 %s device", sd.resourceName))
+    ctnAttrs = []tmCtnAttribute{
+        {
+            ctnName:       "gu-container",
+            cpuRequest:    "2000m",
+            cpuLimit:      "2000m",
+            deviceName:    sd.resourceName,
+            deviceRequest: "1",
+            deviceLimit:   "1",
+        },
+    }
+    runTopologyManagerPositiveTest(f, 1, ctnAttrs, envInfo)
 
     if reservedSystemCPUs != "" {
         // to avoid false negatives, we have put reserved CPUs in such a way there is at least a NUMA node
         // with 1+ SRIOV devices and not reserved CPUs.
         numCores := threadsPerCore * coreCount
-        ginkgo.By(fmt.Sprintf("Successfully admit an entire socket (%d cores), 1 %s device", numCores, sriovResourceName))
-        runTopologyManagerPositiveTest(f, numaNodes, 1, fmt.Sprintf("%dm", numCores*1000), sriovResourceName, "1")
+        allCoresReq := fmt.Sprintf("%dm", numCores*1000)
+        ginkgo.By(fmt.Sprintf("Successfully admit an entire socket (%d cores), 1 %s device", numCores, sd.resourceName))
+        ctnAttrs = []tmCtnAttribute{
+            {
+                ctnName:       "gu-container",
+                cpuRequest:    allCoresReq,
+                cpuLimit:      allCoresReq,
+                deviceName:    sd.resourceName,
+                deviceRequest: "1",
+                deviceLimit:   "1",
+            },
+        }
+        runTopologyManagerPositiveTest(f, 1, ctnAttrs, envInfo)
     }
 
-    if sriovResourceAmount > 1 {
+    if sd.resourceAmount > 1 {
         // no matter how busses are connected to NUMA nodes and SRIOV devices are installed, this function
         // preconditions must ensure the following can be fulfilled
-        ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 1 core, 1 %s device", sriovResourceName))
-        runTopologyManagerPositiveTest(f, numaNodes, 2, "1000m", sriovResourceName, "1")
+        ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 1 core, 1 %s device", sd.resourceName))
+        ctnAttrs = []tmCtnAttribute{
+            {
+                ctnName:       "gu-container",
+                cpuRequest:    "1000m",
+                cpuLimit:      "1000m",
+                deviceName:    sd.resourceName,
+                deviceRequest: "1",
+                deviceLimit:   "1",
+            },
+        }
+        runTopologyManagerPositiveTest(f, 2, ctnAttrs, envInfo)
 
-        ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 2 cores, 1 %s device", sriovResourceName))
-        runTopologyManagerPositiveTest(f, numaNodes, 2, "2000m", sriovResourceName, "1")
+        ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 2 cores, 1 %s device", sd.resourceName))
+        ctnAttrs = []tmCtnAttribute{
+            {
+                ctnName:       "gu-container",
+                cpuRequest:    "2000m",
+                cpuLimit:      "2000m",
+                deviceName:    sd.resourceName,
+                deviceRequest: "1",
+                deviceLimit:   "1",
+            },
+        }
+        runTopologyManagerPositiveTest(f, 2, ctnAttrs, envInfo)
 
         // testing more complex conditions require knowledge about the system cpu+bus topology
     }
 
-    // overflow NUMA node capacity: cores
-    numCores := 1 + (threadsPerCore * coreCount)
-    ginkgo.By(fmt.Sprintf("Trying to admit a guaranteed pods, with %d cores, 1 %s device - and it should be rejected", numCores, sriovResourceName))
-    runTopologyManagerNegativeTest(f, numaNodes, 1, fmt.Sprintf("%dm", numCores*1000), sriovResourceName, "1")
+    // multi-container tests
+    if sd.resourceAmount >= 4 {
+        ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pods, each with two containers, each with 2 cores, 1 %s device", sd.resourceName))
+        ctnAttrs = []tmCtnAttribute{
+            {
+                ctnName:       "gu-container-0",
+                cpuRequest:    "2000m",
+                cpuLimit:      "2000m",
+                deviceName:    sd.resourceName,
+                deviceRequest: "1",
+                deviceLimit:   "1",
+            },
+            {
+                ctnName:       "gu-container-1",
+                cpuRequest:    "2000m",
+                cpuLimit:      "2000m",
+                deviceName:    sd.resourceName,
+                deviceRequest: "1",
+                deviceLimit:   "1",
+            },
+        }
+        runTopologyManagerPositiveTest(f, 1, ctnAttrs, envInfo)
 
-    teardownSRIOVConfigOrFail(f, dpPod)
+        ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with two containers, each with 1 core, 1 %s device", sd.resourceName))
+        ctnAttrs = []tmCtnAttribute{
+            {
+                ctnName:       "gu-container-0",
+                cpuRequest:    "1000m",
+                cpuLimit:      "1000m",
+                deviceName:    sd.resourceName,
+                deviceRequest: "1",
+                deviceLimit:   "1",
+            },
+            {
+                ctnName:       "gu-container-1",
+                cpuRequest:    "1000m",
+                cpuLimit:      "1000m",
+                deviceName:    sd.resourceName,
+                deviceRequest: "1",
+                deviceLimit:   "1",
+            },
+        }
+        runTopologyManagerPositiveTest(f, 2, ctnAttrs, envInfo)
+
+        ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with two containers, both with with 2 cores, one with 1 %s device", sd.resourceName))
+        ctnAttrs = []tmCtnAttribute{
+            {
+                ctnName:       "gu-container-dev",
+                cpuRequest:    "2000m",
+                cpuLimit:      "2000m",
+                deviceName:    sd.resourceName,
+                deviceRequest: "1",
+                deviceLimit:   "1",
+            },
+            {
+                ctnName:    "gu-container-nodev",
+                cpuRequest: "2000m",
+                cpuLimit:   "2000m",
+            },
+        }
+        runTopologyManagerPositiveTest(f, 2, ctnAttrs, envInfo)
+    }
+
+    // this is the only policy that can guarantee reliable rejects
+    if policy == topologymanager.PolicySingleNumaNode {
+        // overflow NUMA node capacity: cores
+        numCores := 1 + (threadsPerCore * coreCount)
+        excessCoresReq := fmt.Sprintf("%dm", numCores*1000)
+        ginkgo.By(fmt.Sprintf("Trying to admit a guaranteed pods, with %d cores, 1 %s device - and it should be rejected", numCores, sd.resourceName))
+        ctnAttrs = []tmCtnAttribute{
+            {
+                ctnName:       "gu-container",
+                cpuRequest:    excessCoresReq,
+                cpuLimit:      excessCoresReq,
+                deviceName:    sd.resourceName,
+                deviceRequest: "1",
+                deviceLimit:   "1",
+            },
+        }
+        runTopologyManagerNegativeTest(f, 1, ctnAttrs, envInfo)
+    }
+    teardownSRIOVConfigOrFail(f, sd)
 }
 
 func runTopologyManagerTests(f *framework.Framework) {
@@ -728,7 +926,7 @@ func runTopologyManagerTests(f *framework.Framework) {
         e2eskipper.Skipf("this test is meant to run on a system with at least 4 cores per socket")
     }
     if sriovdevCount == 0 {
-        e2eskipper.Skipf("this test is meant to run on a system with at least one SRIOV device")
+        e2eskipper.Skipf("this test is meant to run on a system with at least one configured VF from SRIOV device")
     }
 
     configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)
@@ -736,15 +934,18 @@ func runTopologyManagerTests(f *framework.Framework) {
     oldCfg, err = getCurrentKubeletConfig()
     framework.ExpectNoError(err)
 
-    policy := topologymanager.PolicySingleNumaNode
+    var policies = []string{topologymanager.PolicySingleNumaNode, topologymanager.PolicyRestricted,
+        topologymanager.PolicyBestEffort, topologymanager.PolicyNone}
 
-    // Configure Topology Manager
-    ginkgo.By(fmt.Sprintf("by configuring Topology Manager policy to %s", policy))
-    framework.Logf("Configuring topology Manager policy to %s", policy)
+    for _, policy := range policies {
+        // Configure Topology Manager
+        ginkgo.By(fmt.Sprintf("by configuring Topology Manager policy to %s", policy))
+        framework.Logf("Configuring topology Manager policy to %s", policy)
 
-    reservedSystemCPUs := configureTopologyManagerInKubelet(f, oldCfg, policy, configMap, numaNodes)
+        reservedSystemCPUs := configureTopologyManagerInKubelet(f, oldCfg, policy, configMap, numaNodes)
 
-    runTopologyManagerNodeAlignmentSuiteTests(f, configMap, reservedSystemCPUs, numaNodes, coreCount)
+        runTopologyManagerNodeAlignmentSuiteTests(f, configMap, reservedSystemCPUs, numaNodes, coreCount, policy)
+    }
 
     // restore kubelet config
     setOldKubeletConfig(f, oldCfg)