From 56106439cf79567862b1a57948cbd79409a16021 Mon Sep 17 00:00:00 2001
From: Francesco Romani
Date: Tue, 3 Nov 2020 23:14:19 +0100
Subject: [PATCH] node: e2e: bring up/down SRIOV DP just once

The e2e topology manager tests verify resource alignment using devices,
and the easiest devices to use at this moment are SRIOV devices.

The resource alignment test cases are run for each supported policy,
in a loop.

The tests manage the SRIOV device plugin; up until now, the plugin was
set up and torn down in each iteration of the loop. There is no real
need for that: each iteration must reconfigure (and thus restart) the
kubelet, but the device plugin can be set up and torn down just once
for all the policies. The kubelet can reconnect just fine to a running
device plugin.

This way, we greatly reduce the interactions and the complexity of the
test environment, making it easier to understand and more robust, and
we trim a few minutes off the execution time.

However, this patch also hides (not solves) a test flake we observed on
some environments. The issue is hardly reproducible and not well
understood, but it seems to be caused by doing the SRIOV DP
setup/teardown in each policy testing loop. Investigation so far
suggests that the kubelet sometimes has stale state after the sriovdp
teardown/setup cycle, leading to flakes and false negatives. We tried
to address this in
https://github.com/kubernetes/kubernetes/pull/95611
with no conclusive results yet.

This patch was posted because overall we believe its gains exceed the
drawbacks (hiding the aforementioned flake), and because understanding
the potential interaction issues between the sriovdp and the kubelet
deserves a separate test.

Signed-off-by: Francesco Romani
---
 test/e2e_node/topology_manager_test.go | 42 +++++++++++++++-----------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/test/e2e_node/topology_manager_test.go b/test/e2e_node/topology_manager_test.go
index 8ef768c13cc..a40a66b70e0 100644
--- a/test/e2e_node/topology_manager_test.go
+++ b/test/e2e_node/topology_manager_test.go
@@ -531,6 +531,16 @@ func setupSRIOVConfigOrFail(f *framework.Framework, configMap *v1.ConfigMap) *sr
 	}
 	framework.ExpectNoError(err)
 
+	return &sriovData{
+		configMap:      configMap,
+		serviceAccount: serviceAccount,
+		pod:            dpPod,
+	}
+}
+
+// waitForSRIOVResources waits until enough SRIOV resources are available, expecting to complete within the timeout.
+// If it exits successfully, it updates the sriovData with the resources which were found.
+func waitForSRIOVResources(f *framework.Framework, sd *sriovData) {
 	sriovResourceName := ""
 	var sriovResourceAmount int64
 	ginkgo.By("Waiting for devices to become available on the local node")
@@ -539,15 +549,10 @@ func setupSRIOVConfigOrFail(f *framework.Framework, configMap *v1.ConfigMap) *sr
 		sriovResourceName, sriovResourceAmount = findSRIOVResource(node)
 		return sriovResourceAmount > minSriovResource
 	}, 2*time.Minute, framework.Poll).Should(gomega.BeTrue())
-	framework.Logf("Successfully created device plugin pod, detected %d SRIOV allocatable devices %q", sriovResourceAmount, sriovResourceName)
 
-	return &sriovData{
-		configMap:      configMap,
-		serviceAccount: serviceAccount,
-		pod:            dpPod,
-		resourceName:   sriovResourceName,
-		resourceAmount: sriovResourceAmount,
-	}
+	sd.resourceName = sriovResourceName
+	sd.resourceAmount = sriovResourceAmount
+	framework.Logf("Detected SRIOV allocatable devices name=%q amount=%d", sd.resourceName, sd.resourceAmount)
 }
 
 func teardownSRIOVConfigOrFail(f *framework.Framework, sd *sriovData) {
@@ -672,14 +677,13 @@ func runTMScopeResourceAlignmentTestSuite(f *framework.Framework, configMap *v1.
 	teardownSRIOVConfigOrFail(f, sd)
 }
 
-func runTopologyManagerNodeAlignmentSuiteTests(f *framework.Framework, configMap *v1.ConfigMap, reservedSystemCPUs, policy string, numaNodes, coreCount int) {
+func runTopologyManagerNodeAlignmentSuiteTests(f *framework.Framework, sd *sriovData, reservedSystemCPUs, policy string, numaNodes, coreCount int) {
 	threadsPerCore := 1
 	if isHTEnabled() {
 		threadsPerCore = 2
 	}
 
-	sd := setupSRIOVConfigOrFail(f, configMap)
-	defer teardownSRIOVConfigOrFail(f, sd)
+	waitForSRIOVResources(f, sd)
 
 	envInfo := &testEnvInfo{
 		numaNodes:         numaNodes,
@@ -855,13 +859,17 @@ func runTopologyManagerTests(f *framework.Framework) {
 	var oldCfg *kubeletconfig.KubeletConfiguration
 	var err error
 
+	var policies = []string{
+		topologymanager.PolicySingleNumaNode,
+		topologymanager.PolicyRestricted,
+		topologymanager.PolicyBestEffort,
+		topologymanager.PolicyNone,
+	}
+
 	ginkgo.It("run Topology Manager policy test suite", func() {
 		oldCfg, err = getCurrentKubeletConfig()
 		framework.ExpectNoError(err)
 
-		var policies = []string{topologymanager.PolicySingleNumaNode, topologymanager.PolicyRestricted,
-			topologymanager.PolicyBestEffort, topologymanager.PolicyNone}
-
 		scope := containerScopeTopology
 		for _, policy := range policies {
 			// Configure Topology Manager
@@ -901,8 +909,8 @@ func runTopologyManagerTests(f *framework.Framework) {
 		oldCfg, err = getCurrentKubeletConfig()
 		framework.ExpectNoError(err)
 
-		var policies = []string{topologymanager.PolicySingleNumaNode, topologymanager.PolicyRestricted,
-			topologymanager.PolicyBestEffort, topologymanager.PolicyNone}
+		sd := setupSRIOVConfigOrFail(f, configMap)
+		defer teardownSRIOVConfigOrFail(f, sd)
 
 		scope := containerScopeTopology
 		for _, policy := range policies {
@@ -912,7 +920,7 @@ func runTopologyManagerTests(f *framework.Framework) {
 
 			reservedSystemCPUs := configureTopologyManagerInKubelet(f, oldCfg, policy, scope, configMap, numaNodes)
 
-			runTopologyManagerNodeAlignmentSuiteTests(f, configMap, reservedSystemCPUs, policy, numaNodes, coreCount)
+			runTopologyManagerNodeAlignmentSuiteTests(f, sd, reservedSystemCPUs, policy, numaNodes, coreCount)
 		}
 
 		// restore kubelet config
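
For reviewers, the per-policy flow inside the node-alignment test block after this patch reduces to the shape below. This is a condensed sketch assembled from the hunks above (kubelet restart details and config restoration are elided), not a standalone excerpt of the test file:

	// Go sketch of the flow described in the commit message:
	// bring the SRIOV device plugin up once, reuse it across all policies.
	sd := setupSRIOVConfigOrFail(f, configMap)
	defer teardownSRIOVConfigOrFail(f, sd) // torn down once, after all policies ran

	scope := containerScopeTopology
	for _, policy := range policies {
		// Each iteration only reconfigures (and thus restarts) the kubelet;
		// the kubelet reconnects to the already-running device plugin.
		reservedSystemCPUs := configureTopologyManagerInKubelet(f, oldCfg, policy, scope, configMap, numaNodes)

		// The suite re-checks SRIOV resource availability via waitForSRIOVResources(f, sd)
		// before exercising the alignment test cases.
		runTopologyManagerNodeAlignmentSuiteTests(f, sd, reservedSystemCPUs, policy, numaNodes, coreCount)
	}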