diff --git a/hack/generate-bindata.sh b/hack/generate-bindata.sh index f8cb2768ff2..25cd5d75d96 100755 --- a/hack/generate-bindata.sh +++ b/hack/generate-bindata.sh @@ -47,6 +47,7 @@ BINDATA_OUTPUT="test/e2e/generated/bindata.go" go-bindata -nometadata -o "${BINDATA_OUTPUT}.tmp" -pkg generated \ -ignore .jpg -ignore .png -ignore .md -ignore 'BUILD(\.bazel)?' \ "test/e2e/testing-manifests/..." \ + "test/e2e_node/testing-manifests/..." \ "test/images/..." \ "test/fixtures/..." diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go index 4d839f0ebe5..233bb6da7d5 100644 --- a/test/e2e/framework/util.go +++ b/test/e2e/framework/util.go @@ -1878,7 +1878,6 @@ func DumpDebugInfo(c clientset.Interface, ns string) { // DsFromManifest reads a .json/yaml file and returns the daemonset in it. func DsFromManifest(url string) (*appsv1.DaemonSet, error) { - var ds appsv1.DaemonSet Logf("Parsing ds from %v", url) var response *http.Response @@ -1904,7 +1903,12 @@ func DsFromManifest(url string) (*appsv1.DaemonSet, error) { if err != nil { return nil, fmt.Errorf("Failed to read html response body: %v", err) } + return DsFromData(data) +} +// DsFromData reads a byte slice and returns the daemonset in it. 
+func DsFromData(data []byte) (*appsv1.DaemonSet, error) { + var ds appsv1.DaemonSet dataJSON, err := utilyaml.ToJSON(data) if err != nil { return nil, fmt.Errorf("Failed to parse data to json: %v", err) diff --git a/test/e2e/generated/BUILD b/test/e2e/generated/BUILD index c5d8f822a81..de04cfe7b5e 100644 --- a/test/e2e/generated/BUILD +++ b/test/e2e/generated/BUILD @@ -24,6 +24,7 @@ go_bindata( name = "bindata", srcs = [ "//test/e2e/testing-manifests:all-srcs", + "//test/e2e_node/testing-manifests:all-srcs", "//test/fixtures:all-srcs", "//test/images:all-srcs", ], diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD index 29d740a2a97..a59aeeeff7f 100644 --- a/test/e2e_node/BUILD +++ b/test/e2e_node/BUILD @@ -17,6 +17,7 @@ go_library( "node_problem_detector_linux.go", "resource_collector.go", "util.go", + "util_sriov.go", "util_xfs_linux.go", "util_xfs_unsupported.go", ], @@ -49,6 +50,7 @@ go_library( "//test/e2e/framework/gpu:go_default_library", "//test/e2e/framework/metrics:go_default_library", "//test/e2e/framework/node:go_default_library", + "//test/e2e/framework/testfiles:go_default_library", "//test/utils/image:go_default_library", "//vendor/github.com/blang/semver:go_default_library", "//vendor/github.com/coreos/go-systemd/util:go_default_library", @@ -266,6 +268,7 @@ filegroup( "//test/e2e_node/runner/remote:all-srcs", "//test/e2e_node/services:all-srcs", "//test/e2e_node/system:all-srcs", + "//test/e2e_node/testing-manifests:all-srcs", ], tags = ["automanaged"], visibility = ["//visibility:public"], diff --git a/test/e2e_node/image_list.go b/test/e2e_node/image_list.go index 2aef9388a27..e896ae36522 100644 --- a/test/e2e_node/image_list.go +++ b/test/e2e_node/image_list.go @@ -31,6 +31,7 @@ import ( commontest "k8s.io/kubernetes/test/e2e/common" "k8s.io/kubernetes/test/e2e/framework" "k8s.io/kubernetes/test/e2e/framework/gpu" + "k8s.io/kubernetes/test/e2e/framework/testfiles" imageutils "k8s.io/kubernetes/test/utils/image" ) @@ -68,6 +69,7 @@ func 
updateImageWhiteList() { framework.ImageWhiteList = NodeImageWhiteList.Union(commontest.CommonImageWhiteList) // Images from extra envs framework.ImageWhiteList.Insert(getNodeProblemDetectorImage()) + framework.ImageWhiteList.Insert(getSRIOVDevicePluginImage()) } func getNodeProblemDetectorImage() string { @@ -184,3 +186,26 @@ func getGPUDevicePluginImage() string { } return ds.Spec.Template.Spec.Containers[0].Image } + +// getSRIOVDevicePluginImage returns the image of SRIOV device plugin. +func getSRIOVDevicePluginImage() string { + data, err := testfiles.Read(SRIOVDevicePluginDSYAML) + if err != nil { + klog.Errorf("Failed to read the device plugin manifest: %v", err) + return "" + } + ds, err := framework.DsFromData(data) + if err != nil { + klog.Errorf("Failed to parse the device plugin image: %v", err) + return "" + } + if ds == nil { + klog.Errorf("Failed to parse the device plugin image: the extracted DaemonSet is nil") + return "" + } + if len(ds.Spec.Template.Spec.Containers) < 1 { + klog.Errorf("Failed to parse the device plugin image: cannot extract the container from YAML") + return "" + } + return ds.Spec.Template.Spec.Containers[0].Image +} diff --git a/test/e2e_node/testing-manifests/BUILD b/test/e2e_node/testing-manifests/BUILD new file mode 100644 index 00000000000..7e76248ad95 --- /dev/null +++ b/test/e2e_node/testing-manifests/BUILD @@ -0,0 +1,14 @@ +package(default_visibility = ["//visibility:public"]) + +filegroup( + name = "package-srcs", + srcs = glob(["**"]), + tags = ["automanaged"], + visibility = ["//visibility:private"], +) + +filegroup( + name = "all-srcs", + srcs = [":package-srcs"], + tags = ["automanaged"], +) diff --git a/test/e2e_node/testing-manifests/sriovdp-cm.yaml b/test/e2e_node/testing-manifests/sriovdp-cm.yaml new file mode 100644 index 00000000000..373d759767c --- /dev/null +++ b/test/e2e_node/testing-manifests/sriovdp-cm.yaml @@ -0,0 +1,36 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: sriovdp-config + namespace: 
kube-system +data: + config.json: | + { + "resourceList": [{ + "resourceName": "intel_sriov_netdevice", + "selectors": { + "vendors": ["8086"], + "devices": ["154c", "10ed", "1521"], + "drivers": ["i40evf", "ixgbevf", "igb"] + } + }, + { + "resourceName": "intel_sriov_dpdk", + "selectors": { + "vendors": ["8086"], + "devices": ["154c", "10ed"], + "drivers": ["vfio-pci"], + "pfNames": ["enp0s0f0","enp2s2f1"] + } + }, + { + "resourceName": "mlnx_sriov_rdma", + "isRdma": true, + "selectors": { + "vendors": ["15b3"], + "devices": ["1018"], + "drivers": ["mlx5_ib"] + } + } + ] + } diff --git a/test/e2e_node/testing-manifests/sriovdp-ds.yaml b/test/e2e_node/testing-manifests/sriovdp-ds.yaml new file mode 100644 index 00000000000..30f76ff470b --- /dev/null +++ b/test/e2e_node/testing-manifests/sriovdp-ds.yaml @@ -0,0 +1,58 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: kube-sriov-device-plugin-amd64 + namespace: kube-system + labels: + tier: node + app: sriovdp +spec: + selector: + matchLabels: + name: sriov-device-plugin + template: + metadata: + labels: + name: sriov-device-plugin + tier: node + app: sriovdp + spec: + hostNetwork: true + hostPID: true + nodeSelector: + beta.kubernetes.io/arch: amd64 + tolerations: + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + serviceAccountName: sriov-device-plugin + containers: + - name: kube-sriovdp + image: docker.io/nfvpe/sriov-device-plugin:v3.1 + imagePullPolicy: Never + args: + - --log-dir=sriovdp + - --log-level=10 + securityContext: + privileged: true + volumeMounts: + - name: devicesock + mountPath: /var/lib/kubelet/ + readOnly: false + - name: log + mountPath: /var/log + - name: config-volume + mountPath: /etc/pcidp + volumes: + - name: devicesock + hostPath: + path: /var/lib/kubelet/ + - name: log + hostPath: + path: /var/log + - name: config-volume + configMap: + name: sriovdp-config + items: + - key: config.json + path: config.json diff --git 
a/test/e2e_node/testing-manifests/sriovdp-sa.yaml b/test/e2e_node/testing-manifests/sriovdp-sa.yaml new file mode 100644 index 00000000000..73bf1199ee2 --- /dev/null +++ b/test/e2e_node/testing-manifests/sriovdp-sa.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: sriov-device-plugin + namespace: kube-system diff --git a/test/e2e_node/topology_manager_test.go b/test/e2e_node/topology_manager_test.go index afb1a6da383..3db897f4a8b 100644 --- a/test/e2e_node/topology_manager_test.go +++ b/test/e2e_node/topology_manager_test.go @@ -17,10 +17,16 @@ limitations under the License. package e2enode import ( + "context" "fmt" + "os/exec" + "strconv" + "strings" "time" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" @@ -29,7 +35,9 @@ import ( "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" "k8s.io/kubernetes/test/e2e/framework" e2enode "k8s.io/kubernetes/test/e2e/framework/node" + e2epod "k8s.io/kubernetes/test/e2e/framework/pod" e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" + "k8s.io/kubernetes/test/e2e/framework/testfiles" "github.com/onsi/ginkgo" "github.com/onsi/gomega" @@ -37,16 +45,52 @@ import ( // Helper for makeTopologyManagerPod(). type tmCtnAttribute struct { - ctnName string - cpuRequest string - cpuLimit string + ctnName string + cpuRequest string + cpuLimit string + devResource string +} + +func detectNUMANodes() int { + outData, err := exec.Command("/bin/sh", "-c", "lscpu | grep \"NUMA node(s):\" | cut -d \":\" -f 2").Output() + framework.ExpectNoError(err) + + numaNodes, err := strconv.Atoi(strings.TrimSpace(string(outData))) + framework.ExpectNoError(err) + + return numaNodes +} + +// TODO: what about HT? 
+func detectCoresPerSocket() int { + outData, err := exec.Command("/bin/sh", "-c", "lscpu | grep \"Core(s) per socket:\" | cut -d \":\" -f 2").Output() + framework.ExpectNoError(err) + + coreCount, err := strconv.Atoi(strings.TrimSpace(string(outData))) + framework.ExpectNoError(err) + + return coreCount +} + +func detectSRIOVDevices() int { + outData, err := exec.Command("/bin/sh", "-c", "ls /sys/bus/pci/devices/*/sriov_totalvfs | wc -w").Output() + framework.ExpectNoError(err) + + devCount, err := strconv.Atoi(strings.TrimSpace(string(outData))) + framework.ExpectNoError(err) + + return devCount } // makeTopologyMangerPod returns a pod with the provided tmCtnAttributes. func makeTopologyManagerPod(podName string, tmCtnAttributes []tmCtnAttribute) *v1.Pod { + cpusetCmd := "grep Cpus_allowed_list /proc/self/status | cut -f2 && sleep 1d" + return makeTopologyManagerTestPod(podName, cpusetCmd, tmCtnAttributes) +} + +func makeTopologyManagerTestPod(podName, podCmd string, tmCtnAttributes []tmCtnAttribute) *v1.Pod { var containers []v1.Container for _, ctnAttr := range tmCtnAttributes { - cpusetCmd := fmt.Sprintf("grep Cpus_allowed_list /proc/self/status | cut -f2 && sleep 1d") ctn := v1.Container{ Name: ctnAttr.ctnName, Image: busyboxImage, @@ -60,7 +104,11 @@ func makeTopologyManagerPod(podName string, tmCtnAttributes []tmCtnAttribute) *v v1.ResourceName(v1.ResourceMemory): resource.MustParse("100Mi"), }, }, - Command: []string{"sh", "-c", cpusetCmd}, + Command: []string{"sh", "-c", podCmd}, + } + if ctnAttr.devResource != "" { + ctn.Resources.Requests[v1.ResourceName(ctnAttr.devResource)] = resource.MustParse("1") + ctn.Resources.Limits[v1.ResourceName(ctnAttr.devResource)] = resource.MustParse("1") } containers = append(containers, ctn) } @@ -121,7 +169,60 @@ func configureTopologyManagerInKubelet(f *framework.Framework, oldCfg *kubeletco }, time.Minute, time.Second).Should(gomega.BeTrue()) } -func runTopologyManagerSuiteTests(f *framework.Framework) { +// 
getSRIOVDevicePluginPod returns the Device Plugin pod for sriov resources in e2e tests. +func getSRIOVDevicePluginPod() *v1.Pod { + ds := readDaemonSetV1OrDie(testfiles.ReadOrDie(SRIOVDevicePluginDSYAML)) + p := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: SRIOVDevicePluginName, + Namespace: metav1.NamespaceSystem, + }, + + Spec: ds.Spec.Template.Spec, + } + + return p +} + +func readConfigMapV1OrDie(objBytes []byte) *v1.ConfigMap { + v1.AddToScheme(appsScheme) + requiredObj, err := runtime.Decode(appsCodecs.UniversalDecoder(v1.SchemeGroupVersion), objBytes) + if err != nil { + panic(err) + } + return requiredObj.(*v1.ConfigMap) +} + +func readServiceAccountV1OrDie(objBytes []byte) *v1.ServiceAccount { + v1.AddToScheme(appsScheme) + requiredObj, err := runtime.Decode(appsCodecs.UniversalDecoder(v1.SchemeGroupVersion), objBytes) + if err != nil { + panic(err) + } + return requiredObj.(*v1.ServiceAccount) +} + +// numberOfResources returns the number of resources advertised by a node. 
+func numberOfResources(node *v1.Node, resourceKey string) int64 { + val, ok := node.Status.Capacity[v1.ResourceName(resourceKey)] + + if !ok { + return 0 + } + + return val.Value() +} + +func deletePodInNamespace(f *framework.Framework, namespace, name string) { + gp := int64(0) + deleteOptions := metav1.DeleteOptions{ + GracePeriodSeconds: &gp, + } + err := f.ClientSet.CoreV1().Pods(namespace).Delete(context.TODO(), name, &deleteOptions) + framework.ExpectNoError(err) +} + +func runTopologyManagerPolicySuiteTests(f *framework.Framework) { var cpuCap, cpuAlloc int64 var cpuListString, expAllowedCPUsListRegex string var cpuList []int @@ -347,9 +448,72 @@ func runTopologyManagerSuiteTests(f *framework.Framework) { waitForContainerRemoval(pod2.Spec.Containers[0].Name, pod2.Name, pod2.Namespace) } +func runTopologyManagerNodeAlignmentSuiteTests(f *framework.Framework) { + var err error + + configMap := readConfigMapV1OrDie(testfiles.ReadOrDie(SRIOVDevicePluginCMYAML)) + ginkgo.By(fmt.Sprintf("Creating configMap %v/%v", metav1.NamespaceSystem, configMap.Name)) + if _, err = f.ClientSet.CoreV1().ConfigMaps(metav1.NamespaceSystem).Create(context.TODO(), configMap, metav1.CreateOptions{}); err != nil { + framework.Failf("unable to create test configMap %s: %v", configMap.Name, err) + } + + serviceAccount := readServiceAccountV1OrDie(testfiles.ReadOrDie(SRIOVDevicePluginSAYAML)) + ginkgo.By(fmt.Sprintf("Creating serviceAccount %v/%v", metav1.NamespaceSystem, serviceAccount.Name)) + if _, err = f.ClientSet.CoreV1().ServiceAccounts(metav1.NamespaceSystem).Create(context.TODO(), serviceAccount, metav1.CreateOptions{}); err != nil { + framework.Failf("unable to create test serviceAccount %s: %v", serviceAccount.Name, err) + } + + e2enode.WaitForNodeToBeReady(f.ClientSet, framework.TestContext.NodeName, 5*time.Minute) + + dp := getSRIOVDevicePluginPod() + dp.Spec.NodeName = framework.TestContext.NodeName + + ginkgo.By("Create SRIOV device plugin pod") + dpPod, err := 
f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Create(context.TODO(), dp, metav1.CreateOptions{}) + framework.ExpectNoError(err) + + ginkgo.By("Waiting for devices to become available on the local node") + gomega.Eventually(func() bool { + node := getLocalNode(f) + framework.Logf("Node status: %v", node.Status.Capacity) + return numberOfResources(node, SRIOVResourceName) > 0 + }, 5*time.Minute, framework.Poll).Should(gomega.BeTrue()) + framework.Logf("Successfully created device plugin pod") + + ginkgo.By("running a Gu pod") + ctnAttrs := []tmCtnAttribute{ + { + ctnName: "gu-container", + cpuRequest: "1000m", + cpuLimit: "1000m", + devResource: SRIOVResourceName, + }, + } + + pod := makeTopologyManagerTestPod("gu-pod", "env && sleep 1d", ctnAttrs) + pod = f.PodClient().CreateSync(pod) + + ginkgo.By("validating the Gu pod") + _, err = e2epod.GetPodLogs(f.ClientSet, f.Namespace.Name, pod.Name, pod.Spec.Containers[0].Name) + framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", + pod.Spec.Containers[0].Name, pod.Name) + + ginkgo.By("by deleting the pods and waiting for container removal") + deletePods(f, []string{pod.Name}) + waitForContainerRemoval(pod.Spec.Containers[0].Name, pod.Name, pod.Namespace) + + framework.Logf("deleting the SRIOV device plugin pod %s/%s and waiting for container %s removal", + dpPod.Namespace, dpPod.Name, dpPod.Spec.Containers[0].Name) + deletePodInNamespace(f, dpPod.Namespace, dpPod.Name) + waitForContainerRemoval(dpPod.Spec.Containers[0].Name, dpPod.Name, dpPod.Namespace) +} + func runTopologyManagerTests(f *framework.Framework) { - ginkgo.It("run Topology Manager test suite", func() { - oldCfg, err := getCurrentKubeletConfig() + var oldCfg *kubeletconfig.KubeletConfiguration + var err error + + ginkgo.It("run Topology Manager policy test suite", func() { + oldCfg, err = getCurrentKubeletConfig() framework.ExpectNoError(err) var policies = []string{topologymanager.PolicySingleNumaNode, 
topologymanager.PolicyRestricted, @@ -362,13 +526,45 @@ func runTopologyManagerTests(f *framework.Framework) { configureTopologyManagerInKubelet(f, oldCfg, policy) // Run the tests - runTopologyManagerSuiteTests(f) + runTopologyManagerPolicySuiteTests(f) } // restore kubelet config setOldKubeletConfig(f, oldCfg) - // Debug sleep to allow time to look at kubelet config - time.Sleep(5 * time.Minute) + // Delete state file to allow repeated runs + deleteStateFile() + }) + + ginkgo.It("run Topology Manager node alignment test suite", func() { + numaNodes := detectNUMANodes() + coreCount := detectCoresPerSocket() + sriovdevCount := detectSRIOVDevices() + + if numaNodes < 2 { + e2eskipper.Skipf("this test is meant to run on a multi-node NUMA system") + } + if coreCount < 4 { + e2eskipper.Skipf("this test is meant to run on a system with at least 4 cores per socket") + } + if sriovdevCount == 0 { + e2eskipper.Skipf("this test is meant to run on a system with at least one SRIOV device") + } + + oldCfg, err = getCurrentKubeletConfig() + framework.ExpectNoError(err) + + policy := topologymanager.PolicySingleNumaNode + + // Configure Topology Manager + ginkgo.By(fmt.Sprintf("by configuring Topology Manager policy to %s", policy)) + framework.Logf("Configuring topology Manager policy to %s", policy) + + configureTopologyManagerInKubelet(f, oldCfg, policy) + + runTopologyManagerNodeAlignmentSuiteTests(f) + + // restore kubelet config + setOldKubeletConfig(f, oldCfg) // Delete state file to allow repeated runs deleteStateFile() diff --git a/test/e2e_node/util_sriov.go b/test/e2e_node/util_sriov.go new file mode 100644 index 00000000000..f985b11d94a --- /dev/null +++ b/test/e2e_node/util_sriov.go @@ -0,0 +1,30 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2enode + +const ( + // SRIOVResourceName is the name of the example resource which is used in the e2e test + SRIOVResourceName = "intel.com/intel_sriov_netdevice" // TODO make it configurable + // SRIOVDevicePluginCMYAML is the path of the config map to configure the sriov device plugin. + SRIOVDevicePluginCMYAML = "test/e2e_node/testing-manifests/sriovdp-cm.yaml" + // SRIOVDevicePluginDSYAML is the path of the daemonset template of the sriov device plugin. // TODO: Parametrize it by making it a feature in TestFramework. + SRIOVDevicePluginDSYAML = "test/e2e_node/testing-manifests/sriovdp-ds.yaml" + // SRIOVDevicePluginSAYAML is the path of the service account needed by the sriov device plugin to run. + SRIOVDevicePluginSAYAML = "test/e2e_node/testing-manifests/sriovdp-sa.yaml" + // SRIOVDevicePluginName is the name of the device plugin pod + SRIOVDevicePluginName = "sriov-device-plugin" +)