From 337fdf2f37126fea750637dd8c0a96eef9ba8ee5 Mon Sep 17 00:00:00 2001
From: vpickard
Date: Tue, 10 Dec 2019 10:35:39 -0500
Subject: [PATCH] [WIP] e2e-topology-manager: Initial commit for E2E tests

This is the initial commit for E2E testing for Topology Manager. For
now, run a subset of the CPU Manager tests. Additional tests will be
forthcoming.

Signed-off-by: vpickard
---
 test/e2e_node/BUILD                    |   1 +
 test/e2e_node/topology_manager_test.go | 387 +++++++++++++++++++++++++
 2 files changed, 388 insertions(+)
 create mode 100644 test/e2e_node/topology_manager_test.go

diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD
index bab6ada0410..de161344f69 100644
--- a/test/e2e_node/BUILD
+++ b/test/e2e_node/BUILD
@@ -110,6 +110,7 @@ go_test(
         "container_log_rotation_test.go",
         "container_manager_test.go",
         "cpu_manager_test.go",
+        "topology_manager_test.go",
         "critical_pod_test.go",
         "density_test.go",
         "device_plugin_test.go",
diff --git a/test/e2e_node/topology_manager_test.go b/test/e2e_node/topology_manager_test.go
new file mode 100644
index 00000000000..ac28dd6f499
--- /dev/null
+++ b/test/e2e_node/topology_manager_test.go
@@ -0,0 +1,387 @@
+/*
+Copyright 2019 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e_node
+
+import (
+	"fmt"
+	"time"
+
+	"k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
+	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
+	"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
+	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
+	"k8s.io/kubernetes/test/e2e/framework"
+	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
+
+	"github.com/onsi/ginkgo"
+	"github.com/onsi/gomega"
+)
+
+// Helper for makeTopologyManagerPod().
+type tmCtnAttribute struct {
+	ctnName    string
+	cpuRequest string
+	cpuLimit   string
+}
+
+// makeTopologyManagerPod returns a pod with the provided tmCtnAttributes.
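+// For example, a Gu container requesting a single exclusive CPU is described as:
+//
+//	tmCtnAttribute{ctnName: "gu-container", cpuRequest: "1000m", cpuLimit: "1000m"}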
+func makeTopologyManagerPod(podName string, tmCtnAttributes []tmCtnAttribute) *v1.Pod {
+	var containers []v1.Container
+	for _, ctnAttr := range tmCtnAttributes {
+		cpusetCmd := "grep Cpus_allowed_list /proc/self/status | cut -f2 && sleep 1d"
+		ctn := v1.Container{
+			Name:  ctnAttr.ctnName,
+			Image: busyboxImage,
+			Resources: v1.ResourceRequirements{
+				Requests: v1.ResourceList{
+					v1.ResourceName(v1.ResourceCPU):    resource.MustParse(ctnAttr.cpuRequest),
+					v1.ResourceName(v1.ResourceMemory): resource.MustParse("100Mi"),
+				},
+				Limits: v1.ResourceList{
+					v1.ResourceName(v1.ResourceCPU):    resource.MustParse(ctnAttr.cpuLimit),
+					v1.ResourceName(v1.ResourceMemory): resource.MustParse("100Mi"),
+				},
+			},
+			Command: []string{"sh", "-c", cpusetCmd},
+		}
+		containers = append(containers, ctn)
+	}
+
+	return &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: podName,
+		},
+		Spec: v1.PodSpec{
+			RestartPolicy: v1.RestartPolicyNever,
+			Containers:    containers,
+		},
+	}
+}
+
+func configureTopologyManagerInKubelet(f *framework.Framework, policy string) {
+	// Configure Topology Manager in Kubelet with policy.
+	oldCfg, err := getCurrentKubeletConfig()
+	framework.ExpectNoError(err)
+	newCfg := oldCfg.DeepCopy()
+	if newCfg.FeatureGates == nil {
+		newCfg.FeatureGates = make(map[string]bool)
+	}
+
+	newCfg.FeatureGates["CPUManager"] = true
+	newCfg.FeatureGates["TopologyManager"] = true
+
+	deleteStateFile()
+
+	// Set the Topology Manager policy.
+	newCfg.TopologyManagerPolicy = policy
+
+	// Set the CPU Manager policy to static.
+	newCfg.CPUManagerPolicy = string(cpumanager.PolicyStatic)
+
+	// Set the CPU Manager reconcile period to 1 second.
+	newCfg.CPUManagerReconcilePeriod = metav1.Duration{Duration: 1 * time.Second}
+
+	// The Kubelet panics if either kube-reserved or system-reserved is not set
+	// when CPU Manager is enabled. Set cpu in kube-reserved > 0 so that
+	// kubelet doesn't panic.
+	if newCfg.KubeReserved == nil {
+		newCfg.KubeReserved = map[string]string{}
+	}
+
+	if _, ok := newCfg.KubeReserved["cpu"]; !ok {
+		newCfg.KubeReserved["cpu"] = "200m"
+	}
+	// Dump the new config for debugging.
+	framework.Logf("New kubelet config is %+v", *newCfg)
+
+	// Update the Kubelet configuration.
+	framework.ExpectNoError(setKubeletConfiguration(f, newCfg))
+
+	// Wait for the Kubelet to be ready.
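+	// e2e_node tests target a single node, so readiness here means exactly one
+	// node reporting Ready.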
+	gomega.Eventually(func() bool {
+		nodes, err := e2enode.TotalReady(f.ClientSet)
+		framework.ExpectNoError(err)
+		return nodes == 1
+	}, time.Minute, time.Second).Should(gomega.BeTrue())
+}
+
+func runTopologyManagerSuiteTests(f *framework.Framework) {
+	var cpuCap, cpuAlloc int64
+	var cpuListString, expAllowedCPUsListRegex string
+	var cpuList []int
+	var cpu1, cpu2 int
+	var cset cpuset.CPUSet
+	var err error
+	var ctnAttrs []tmCtnAttribute
+	var pod, pod1, pod2 *v1.Pod
+
+	cpuCap, cpuAlloc, _ = getLocalNodeCPUDetails(f)
+
+	ginkgo.By("running a non-Gu pod")
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:    "non-gu-container",
+			cpuRequest: "100m",
+			cpuLimit:   "200m",
+		},
+	}
+	pod = makeTopologyManagerPod("non-gu-pod", ctnAttrs)
+	pod = f.PodClient().CreateSync(pod)
+
+	ginkgo.By("checking if the expected cpuset was assigned")
+	expAllowedCPUsListRegex = fmt.Sprintf("^0-%d\n$", cpuCap-1)
+	err = f.PodClient().MatchContainerOutput(pod.Name, pod.Spec.Containers[0].Name, expAllowedCPUsListRegex)
+	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+		pod.Spec.Containers[0].Name, pod.Name)
+
+	ginkgo.By("by deleting the pods and waiting for container removal")
+	deletePods(f, []string{pod.Name})
+	waitForContainerRemoval(pod.Spec.Containers[0].Name, pod.Name, pod.Namespace)
+
+	ginkgo.By("running a Gu pod")
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:    "gu-container",
+			cpuRequest: "1000m",
+			cpuLimit:   "1000m",
+		},
+	}
+	pod = makeTopologyManagerPod("gu-pod", ctnAttrs)
+	pod = f.PodClient().CreateSync(pod)
+
+	ginkgo.By("checking if the expected cpuset was assigned")
+	cpu1 = 1
+	if isHTEnabled() {
+		cpuList = cpuset.MustParse(getCPUSiblingList(0)).ToSlice()
+		cpu1 = cpuList[1]
+	}
+	expAllowedCPUsListRegex = fmt.Sprintf("^%d\n$", cpu1)
+	err = f.PodClient().MatchContainerOutput(pod.Name, pod.Spec.Containers[0].Name, expAllowedCPUsListRegex)
+	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+		pod.Spec.Containers[0].Name, pod.Name)
+
+	ginkgo.By("by deleting the pods and waiting for container removal")
+	deletePods(f, []string{pod.Name})
+	waitForContainerRemoval(pod.Spec.Containers[0].Name, pod.Name, pod.Namespace)
+
+	ginkgo.By("running multiple Gu and non-Gu pods")
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:    "gu-container",
+			cpuRequest: "1000m",
+			cpuLimit:   "1000m",
+		},
+	}
+	pod1 = makeTopologyManagerPod("gu-pod", ctnAttrs)
+	pod1 = f.PodClient().CreateSync(pod1)
+
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:    "non-gu-container",
+			cpuRequest: "200m",
+			cpuLimit:   "300m",
+		},
+	}
+	pod2 = makeTopologyManagerPod("non-gu-pod", ctnAttrs)
+	pod2 = f.PodClient().CreateSync(pod2)
+
+	ginkgo.By("checking if the expected cpuset was assigned")
+	cpu1 = 1
+	if isHTEnabled() {
+		cpuList = cpuset.MustParse(getCPUSiblingList(0)).ToSlice()
+		cpu1 = cpuList[1]
+	}
+	expAllowedCPUsListRegex = fmt.Sprintf("^%d\n$", cpu1)
+	err = f.PodClient().MatchContainerOutput(pod1.Name, pod1.Spec.Containers[0].Name, expAllowedCPUsListRegex)
+	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+		pod1.Spec.Containers[0].Name, pod1.Name)
+
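+	// The non-Gu pod is expected to run in the shared pool: every CPU on the node
+	// except the one exclusively assigned to the Gu pod (when more than two CPUs
+	// are allocatable; otherwise the expected list is just CPU 0).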
+	cpuListString = "0"
+	if cpuAlloc > 2 {
+		cset = cpuset.MustParse(fmt.Sprintf("0-%d", cpuCap-1))
+		cpuListString = fmt.Sprintf("%s", cset.Difference(cpuset.NewCPUSet(cpu1)))
+	}
+	expAllowedCPUsListRegex = fmt.Sprintf("^%s\n$", cpuListString)
+	err = f.PodClient().MatchContainerOutput(pod2.Name, pod2.Spec.Containers[0].Name, expAllowedCPUsListRegex)
+	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+		pod2.Spec.Containers[0].Name, pod2.Name)
+
+	ginkgo.By("by deleting the pods and waiting for container removal")
+	deletePods(f, []string{pod1.Name, pod2.Name})
+	waitForContainerRemoval(pod1.Spec.Containers[0].Name, pod1.Name, pod1.Namespace)
+	waitForContainerRemoval(pod2.Spec.Containers[0].Name, pod2.Name, pod2.Namespace)
+
+	// Skip rest of the tests if CPU capacity < 3.
+	if cpuCap < 3 {
+		framework.Skipf("Skipping rest of the CPU Manager tests since CPU capacity < 3")
+	}
+
+	ginkgo.By("running a Gu pod requesting multiple CPUs")
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:    "gu-container",
+			cpuRequest: "2000m",
+			cpuLimit:   "2000m",
+		},
+	}
+	pod = makeTopologyManagerPod("gu-pod", ctnAttrs)
+	pod = f.PodClient().CreateSync(pod)
+
+	ginkgo.By("checking if the expected cpuset was assigned")
+	cpuListString = "1-2"
+	if isHTEnabled() {
+		cpuListString = "2-3"
+		cpuList = cpuset.MustParse(getCPUSiblingList(0)).ToSlice()
+		if cpuList[1] != 1 {
+			cset = cpuset.MustParse(getCPUSiblingList(1))
+			cpuListString = fmt.Sprintf("%s", cset)
+		}
+	}
+	expAllowedCPUsListRegex = fmt.Sprintf("^%s\n$", cpuListString)
+	err = f.PodClient().MatchContainerOutput(pod.Name, pod.Spec.Containers[0].Name, expAllowedCPUsListRegex)
+	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+		pod.Spec.Containers[0].Name, pod.Name)
+
+	ginkgo.By("by deleting the pods and waiting for container removal")
+	deletePods(f, []string{pod.Name})
+	waitForContainerRemoval(pod.Spec.Containers[0].Name, pod.Name, pod.Namespace)
+
+	ginkgo.By("running a Gu pod with multiple containers requesting integer CPUs")
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:    "gu-container1",
+			cpuRequest: "1000m",
+			cpuLimit:   "1000m",
+		},
+		{
+			ctnName:    "gu-container2",
+			cpuRequest: "1000m",
+			cpuLimit:   "1000m",
+		},
+	}
+	pod = makeTopologyManagerPod("gu-pod", ctnAttrs)
+	pod = f.PodClient().CreateSync(pod)
+
+	ginkgo.By("checking if the expected cpuset was assigned")
+	cpu1, cpu2 = 1, 2
+	if isHTEnabled() {
+		cpuList = cpuset.MustParse(getCPUSiblingList(0)).ToSlice()
+		if cpuList[1] != 1 {
+			cpu1, cpu2 = cpuList[1], 1
+		}
+	}
+
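+	// Each container should be pinned to one of the two exclusive CPUs, but which
+	// container gets which CPU is not deterministic, so accept either value.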
+	expAllowedCPUsListRegex = fmt.Sprintf("^(%d|%d)\n$", cpu1, cpu2)
+	err = f.PodClient().MatchContainerOutput(pod.Name, pod.Spec.Containers[0].Name, expAllowedCPUsListRegex)
+	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+		pod.Spec.Containers[0].Name, pod.Name)
+
+	err = f.PodClient().MatchContainerOutput(pod.Name, pod.Spec.Containers[1].Name, expAllowedCPUsListRegex)
+	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+		pod.Spec.Containers[1].Name, pod.Name)
+
+	ginkgo.By("by deleting the pods and waiting for container removal")
+	deletePods(f, []string{pod.Name})
+	waitForContainerRemoval(pod.Spec.Containers[0].Name, pod.Name, pod.Namespace)
+	waitForContainerRemoval(pod.Spec.Containers[1].Name, pod.Name, pod.Namespace)
+
+	ginkgo.By("running multiple Gu pods")
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:    "gu-container1",
+			cpuRequest: "1000m",
+			cpuLimit:   "1000m",
+		},
+	}
+	pod1 = makeTopologyManagerPod("gu-pod1", ctnAttrs)
+	pod1 = f.PodClient().CreateSync(pod1)
+
+	ctnAttrs = []tmCtnAttribute{
+		{
+			ctnName:    "gu-container2",
+			cpuRequest: "1000m",
+			cpuLimit:   "1000m",
+		},
+	}
+	pod2 = makeTopologyManagerPod("gu-pod2", ctnAttrs)
+	pod2 = f.PodClient().CreateSync(pod2)
+
+	ginkgo.By("checking if the expected cpuset was assigned")
+	cpu1, cpu2 = 1, 2
+	if isHTEnabled() {
+		cpuList = cpuset.MustParse(getCPUSiblingList(0)).ToSlice()
+		if cpuList[1] != 1 {
+			cpu1, cpu2 = cpuList[1], 1
+		}
+	}
+
+	expAllowedCPUsListRegex = fmt.Sprintf("^%d\n$", cpu1)
+	err = f.PodClient().MatchContainerOutput(pod1.Name, pod1.Spec.Containers[0].Name, expAllowedCPUsListRegex)
+	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+		pod1.Spec.Containers[0].Name, pod1.Name)
+
+	expAllowedCPUsListRegex = fmt.Sprintf("^%d\n$", cpu2)
+	err = f.PodClient().MatchContainerOutput(pod2.Name, pod2.Spec.Containers[0].Name, expAllowedCPUsListRegex)
+	framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]",
+		pod2.Spec.Containers[0].Name, pod2.Name)
+
+	ginkgo.By("by deleting the pods and waiting for container removal")
+	deletePods(f, []string{pod1.Name, pod2.Name})
+	waitForContainerRemoval(pod1.Spec.Containers[0].Name, pod1.Name, pod1.Namespace)
+	waitForContainerRemoval(pod2.Spec.Containers[0].Name, pod2.Name, pod2.Namespace)
+}
+
+func runTopologyManagerTests(f *framework.Framework) {
+	var oldCfg *kubeletconfig.KubeletConfiguration
+
+	ginkgo.It("run Topology Manager test suite", func() {
+		// Remember the current kubelet config so it can be restored at the end.
+		var err error
+		oldCfg, err = getCurrentKubeletConfig()
+		framework.ExpectNoError(err)
+
+		var policies = []string{topologymanager.PolicySingleNumaNode, topologymanager.PolicyRestricted,
+			topologymanager.PolicyBestEffort, topologymanager.PolicyNone}
+
+		for _, policy := range policies {
+			// Configure Topology Manager
+			ginkgo.By(fmt.Sprintf("by configuring Topology Manager policy to %s", policy))
+			framework.Logf("Configuring Topology Manager policy to %s", policy)
+			configureTopologyManagerInKubelet(f, policy)
+			// Run the tests
+			runTopologyManagerSuiteTests(f)
+		}
+		// Restore the original kubelet config.
+		setOldKubeletConfig(f, oldCfg)
+
+		// Debug sleep to allow time to look at kubelet config
+		time.Sleep(5 * time.Minute)
+
+		// Delete state file to allow repeated runs
+		deleteStateFile()
+	})
+}
+
+// Serial because the test updates kubelet configuration.
+var _ = SIGDescribe("Topology Manager [Serial] [Feature:TopologyManager][NodeAlphaFeature:TopologyManager]", func() {
+	f := framework.NewDefaultFramework("topology-manager-test")
+
+	ginkgo.Context("With the kubelet config updated to the static CPU Manager policy, run the Topology Manager tests", func() {
+		runTopologyManagerTests(f)
+	})
+})