diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD index a59aeeeff7f..88e895289d3 100644 --- a/test/e2e_node/BUILD +++ b/test/e2e_node/BUILD @@ -15,6 +15,7 @@ go_library( "framework.go", "image_list.go", "node_problem_detector_linux.go", + "numa_alignment.go", "resource_collector.go", "util.go", "util_sriov.go", @@ -31,6 +32,7 @@ go_library( "//pkg/kubelet/apis/podresources/v1alpha1:go_default_library", "//pkg/kubelet/apis/stats/v1alpha1:go_default_library", "//pkg/kubelet/cm:go_default_library", + "//pkg/kubelet/cm/cpuset:go_default_library", "//pkg/kubelet/kubeletconfig/util/codec:go_default_library", "//pkg/kubelet/metrics:go_default_library", "//pkg/kubelet/remote:go_default_library", diff --git a/test/e2e_node/numa_alignment.go b/test/e2e_node/numa_alignment.go new file mode 100644 index 00000000000..d44a58036d3 --- /dev/null +++ b/test/e2e_node/numa_alignment.go @@ -0,0 +1,199 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2enode + +import ( + "fmt" + "sort" + "strconv" + "strings" + + v1 "k8s.io/api/core/v1" + "k8s.io/kubernetes/pkg/kubelet/cm/cpuset" + + "k8s.io/kubernetes/test/e2e/framework" +) + +type numaPodResources struct { + CPUToNUMANode map[int]int + PCIDevsToNUMANode map[string]int +} + +func (R *numaPodResources) CheckAlignment() bool { + nodeNum := -1 // not set + for _, cpuNode := range R.CPUToNUMANode { + if nodeNum == -1 { + nodeNum = cpuNode + } else if nodeNum != cpuNode { + return false + } + } + for _, devNode := range R.PCIDevsToNUMANode { + // TODO: explain -1 + if devNode != -1 && nodeNum != devNode { + return false + } + } + return true +} + +func (R *numaPodResources) String() string { + var b strings.Builder + // To store the keys in slice in sorted order + var cpuKeys []int + for ck := range R.CPUToNUMANode { + cpuKeys = append(cpuKeys, ck) + } + sort.Ints(cpuKeys) + for _, k := range cpuKeys { + nodeNum := R.CPUToNUMANode[k] + b.WriteString(fmt.Sprintf("CPU cpu#%03d=%02d\n", k, nodeNum)) + } + var pciKeys []string + for pk := range R.PCIDevsToNUMANode { + pciKeys = append(pciKeys, pk) + } + sort.Strings(pciKeys) + for _, k := range pciKeys { + nodeNum := R.PCIDevsToNUMANode[k] + b.WriteString(fmt.Sprintf("PCI %s=%02d\n", k, nodeNum)) + } + return b.String() +} + +func getCPUToNUMANodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map[string]string, numaNodes int) (map[int]int, error) { + var cpuIDs []int + cpuListAllowedEnvVar := "CPULIST_ALLOWED" + + for name, value := range environ { + if name == cpuListAllowedEnvVar { + cpus, err := cpuset.Parse(value) + if err != nil { + return nil, err + } + cpuIDs = cpus.ToSlice() + } + } + if len(cpuIDs) == 0 { + return nil, fmt.Errorf("variable %q found in environ", cpuListAllowedEnvVar) + } + + cpusPerNUMA := make(map[int][]int) + for numaNode := 0; numaNode < numaNodes; numaNode++ { + nodeCPUList := f.ExecCommandInContainer(pod.Name, pod.Spec.Containers[0].Name, + "/bin/cat", fmt.Sprintf("/sys/devices/system/node/node%d/cpulist", numaNode)) + + cpus, err := cpuset.Parse(nodeCPUList) + if err != nil { + return nil, err + } + cpusPerNUMA[numaNode] = cpus.ToSlice() + } + + // CPU IDs -> NUMA Node ID + CPUToNUMANode := make(map[int]int) + for nodeNum, cpus := range cpusPerNUMA { + for _, cpu := range cpus { + CPUToNUMANode[cpu] = nodeNum + } + } + + // filter out only the allowed CPUs + CPUMap := make(map[int]int) + for _, cpuID := range cpuIDs { + _, ok := CPUToNUMANode[cpuID] + if !ok { + return nil, fmt.Errorf("CPU %d not found on NUMA map: %v", cpuID, CPUToNUMANode) + } + CPUMap[cpuID] = CPUToNUMANode[cpuID] + } + return CPUMap, nil +} + +func getPCIDeviceToNumaNodeMapFromEnv(f *framework.Framework, pod *v1.Pod, environ map[string]string) (map[string]int, error) { + pciDevPrefix := "PCIDEVICE_" + // at this point we don't care which plugin selected the device, + // we only need to know which devices were assigned to the POD. + // Hence, do prefix search for the variable and fetch the device(s). + + NUMAPerDev := make(map[string]int) + for name, value := range environ { + if !strings.HasPrefix(name, pciDevPrefix) { + continue + } + + // a single plugin can allocate more than a single device + pciDevs := strings.Split(value, ",") + for _, pciDev := range pciDevs { + pciDevNUMANode := f.ExecCommandInContainer(pod.Name, pod.Spec.Containers[0].Name, + "/bin/cat", fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", pciDev)) + + nodeNum, err := strconv.Atoi(pciDevNUMANode) + if err != nil { + return nil, err + } + NUMAPerDev[pciDev] = nodeNum + } + } + if len(NUMAPerDev) == 0 { + return nil, fmt.Errorf("no PCI devices found in environ") + } + return NUMAPerDev, nil +} + +func makeEnvMap(logs string) (map[string]string, error) { + podEnv := strings.Split(logs, "\n") + envMap := make(map[string]string) + for _, envVar := range podEnv { + if len(envVar) == 0 { + continue + } + pair := strings.SplitN(envVar, "=", 2) + if len(pair) != 2 { + return nil, fmt.Errorf("unable to split %q", envVar) + } + envMap[pair[0]] = pair[1] + } + return envMap, nil +} + +func checkNUMAAlignment(f *framework.Framework, pod *v1.Pod, logs string, numaNodes int) (numaPodResources, error) { + podEnv, err := makeEnvMap(logs) + if err != nil { + return numaPodResources{}, err + } + + CPUToNUMANode, err := getCPUToNUMANodeMapFromEnv(f, pod, podEnv, numaNodes) + if err != nil { + return numaPodResources{}, err + } + + PCIDevsToNUMANode, err := getPCIDeviceToNumaNodeMapFromEnv(f, pod, podEnv) + if err != nil { + return numaPodResources{}, err + } + + numaRes := numaPodResources{ + CPUToNUMANode: CPUToNUMANode, + PCIDevsToNUMANode: PCIDevsToNUMANode, + } + aligned := numaRes.CheckAlignment() + if !aligned { + return numaRes, fmt.Errorf("NUMA resources not aligned") + } + return numaRes, nil +} diff --git a/test/e2e_node/topology_manager_test.go b/test/e2e_node/topology_manager_test.go index 3db897f4a8b..833002fe460 100644 --- a/test/e2e_node/topology_manager_test.go +++ b/test/e2e_node/topology_manager_test.go @@ -43,6 +43,10 @@ import ( "github.com/onsi/gomega" ) +const ( + numalignCmd = `export CPULIST_ALLOWED=$( awk -F":\t*" '/Cpus_allowed_list/ { print $2 }' /proc/self/status); env; sleep 1d` +) + // Helper for makeTopologyManagerPod(). type tmCtnAttribute struct { ctnName string @@ -222,6 +226,18 @@ func deletePodInNamespace(f *framework.Framework, namespace, name string) { framework.ExpectNoError(err) } +func validatePodAlignment(f *framework.Framework, pod *v1.Pod, numaNodes int) { + ginkgo.By("validating the Gu pod") + logs, err := e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, pod.Name, pod.Spec.Containers[0].Name) + framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", + pod.Spec.Containers[0].Name, pod.Name) + + framework.Logf("got pod logs: %v", logs) + numaRes, err := checkNUMAAlignment(f, pod, logs, numaNodes) + framework.ExpectNoError(err, "NUMA Alignment check failed for [%s] of pod [%s]: %s", + pod.Spec.Containers[0].Name, pod.Name, numaRes.String()) +} + func runTopologyManagerPolicySuiteTests(f *framework.Framework) { var cpuCap, cpuAlloc int64 var cpuListString, expAllowedCPUsListRegex string @@ -448,7 +464,29 @@ func runTopologyManagerPolicySuiteTests(f *framework.Framework) { waitForContainerRemoval(pod2.Spec.Containers[0].Name, pod2.Name, pod2.Namespace) } -func runTopologyManagerNodeAlignmentSuiteTests(f *framework.Framework) { +func runTopologyManagerNodeAlignmentSinglePodTest(f *framework.Framework, numaNodes int) { + ginkgo.By("allocate aligned resources for a single pod") + ctnAttrs := []tmCtnAttribute{ + { + ctnName: "gu-container", + cpuRequest: "1000m", + cpuLimit: "1000m", + devResource: SRIOVResourceName, + }, + } + + pod := makeTopologyManagerTestPod("gu-pod", numalignCmd, ctnAttrs) + pod = f.PodClient().CreateSync(pod) + + validatePodAlignment(f, pod, numaNodes) + + framework.Logf("deleting the pod %s/%s and waiting for container %s removal", + pod.Namespace, pod.Name, pod.Spec.Containers[0].Name) + deletePods(f, []string{pod.Name}) + waitForContainerRemoval(pod.Spec.Containers[0].Name, pod.Name, pod.Namespace) +} + +func runTopologyManagerNodeAlignmentSuiteTests(f *framework.Framework, numaNodes int) { var err error configMap := readConfigMapV1OrDie(testfiles.ReadOrDie(SRIOVDevicePluginCMYAML)) @@ -480,27 +518,7 @@ func runTopologyManagerNodeAlignmentSuiteTests(f *framework.Framework) { }, 5*time.Minute, framework.Poll).Should(gomega.BeTrue()) framework.Logf("Successfully created device plugin pod") - ginkgo.By("running a Gu pod") - ctnAttrs := []tmCtnAttribute{ - { - ctnName: "gu-container", - cpuRequest: "1000m", - cpuLimit: "1000m", - devResource: SRIOVResourceName, - }, - } - - pod := makeTopologyManagerTestPod("gu-pod", "env && sleep 1d", ctnAttrs) - pod = f.PodClient().CreateSync(pod) - - ginkgo.By("validating the Gu pod") - _, err = e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, pod.Name, pod.Spec.Containers[0].Name) - framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", - pod.Spec.Containers[0].Name, pod.Name) - - ginkgo.By("by deleting the pods and waiting for container removal") - deletePods(f, []string{pod.Name}) - waitForContainerRemoval(pod.Spec.Containers[0].Name, pod.Name, pod.Namespace) + runTopologyManagerNodeAlignmentSinglePodTest(f, numaNodes) framework.Logf("deleting the SRIOV device plugin pod %s/%s and waiting for container %s removal", dpPod.Namespace, dpPod.Name, dpPod.Spec.Containers[0].Name) @@ -561,7 +579,7 @@ func runTopologyManagerTests(f *framework.Framework) { configureTopologyManagerInKubelet(f, oldCfg, policy) - runTopologyManagerNodeAlignmentSuiteTests(f) + runTopologyManagerNodeAlignmentSuiteTests(f, numaNodes) // restore kubelet config setOldKubeletConfig(f, oldCfg)