test: parity between cluster and node IPPR e2e tests

Some IPPR cluster e2e tests are missing from node e2e tests. This change brings parity between them.
2025-09-13 21:25:09 +00:00 · 2024-10-17 05:09:56 -07:00
parent d67e6545b1
commit 6203006348
1 changed files with 211 additions and 45 deletions
--- a/test/e2e_node/pod_resize_test.go
+++ b/test/e2e_node/pod_resize_test.go
@@ -54,6 +54,8 @@ const (
 	Cgroupv2CPURequest         string = "/sys/fs/cgroup/cpu.weight"
 	CPUPeriod                  string = "100000"
 	MinContainerRuntimeVersion string = "1.6.9"
+
+	fakeExtendedResource = "dummy.com/dummy"
 )

 var (
@@ -64,18 +66,21 @@ var (
 )

 type ContainerResources struct {
-	CPUReq     string
-	CPULim     string
-	MemReq     string
-	MemLim     string
-	EphStorReq string
-	EphStorLim string
+	CPUReq              string
+	CPULim              string
+	MemReq              string
+	MemLim              string
+	EphStorReq          string
+	EphStorLim          string
+	ExtendedResourceReq string // request for an extended resource; presumably fakeExtendedResource — TODO confirm in the pod builder
+	ExtendedResourceLim string // limit for an extended resource; presumably fakeExtendedResource — TODO confirm in the pod builder
 }

 type ContainerAllocations struct {
-	CPUAlloc     string
-	MemAlloc     string
-	ephStorAlloc string
+	CPUAlloc              string
+	MemAlloc              string
+	ephStorAlloc          string // NOTE(review): the only unexported field in this struct — confirm that is intentional
+	ExtendedResourceAlloc string // presumably the allocated quantity of the extended resource — TODO confirm against the allocation checks
 }

 type TestContainerInfo struct {
@@ -87,6 +92,28 @@ type TestContainerInfo struct {
 	RestartCount int32
 }

+type containerPatch struct { // one spec.containers entry of the JSON patch assembled by genPatchString
+	Name      string `json:"name"` // container name the entry applies to
+	Resources struct {
+		Requests struct { // every quantity is a string; empty strings are dropped from the JSON (omitempty)
+			CPU     string `json:"cpu,omitempty"`
+			Memory  string `json:"memory,omitempty"`
+			EphStor string `json:"ephemeral-storage,omitempty"`
+		} `json:"requests"`
+		Limits struct { // same shape as Requests
+			CPU     string `json:"cpu,omitempty"`
+			Memory  string `json:"memory,omitempty"`
+			EphStor string `json:"ephemeral-storage,omitempty"`
+		} `json:"limits"`
+	} `json:"resources"`
+}
+
+type patchSpec struct { // top-level JSON document marshalled by genPatchString: {"spec":{"containers":[...]}}
+	Spec struct {
+		Containers []containerPatch `json:"containers"` // one entry per container being patched
+	} `json:"spec"`
+}
+
 func supportsInPlacePodVerticalScaling(ctx context.Context, f *framework.Framework) bool {
 	node := getLocalNode(ctx, f)
 	re := regexp.MustCompile("containerd://(.*)")
@@ -418,6 +445,100 @@ func waitForPodResizeActuation(ctx context.Context, f *framework.Framework, c cl
 	return resizedPod
 }

+// genPatchString builds the JSON patch document that sets the CPU and memory
+// requests and limits of each container in containers to the values held in
+// its Resources. Ephemeral-storage values are not copied, and quantities that
+// are empty strings are omitted from the JSON (omitempty).
+//
+// A container whose Resources pointer is nil contributes only its name. An
+// empty containers slice produces "containers":[] rather than null.
+//
+// It returns the marshalled document, or the error reported by json.Marshal.
+func genPatchString(containers []TestContainerInfo) (string, error) {
+	var patch patchSpec
+	// Keep the slice non-nil: a nil slice marshals to JSON null, which a merge
+	// patch treats as a request to delete the field.
+	patch.Spec.Containers = make([]containerPatch, 0, len(containers))
+
+	for _, container := range containers {
+		var cPatch containerPatch
+		cPatch.Name = container.Name
+		// Resources is a pointer; a container declared without resources must
+		// not cause a nil dereference.
+		if res := container.Resources; res != nil {
+			cPatch.Resources.Requests.CPU = res.CPUReq
+			cPatch.Resources.Requests.Memory = res.MemReq
+			cPatch.Resources.Limits.CPU = res.CPULim
+			cPatch.Resources.Limits.Memory = res.MemLim
+		}
+
+		patch.Spec.Containers = append(patch.Spec.Containers, cPatch)
+	}
+
+	patchBytes, err := json.Marshal(patch)
+	if err != nil {
+		return "", err
+	}
+
+	return string(patchBytes), nil
+}
+
+// patchNode computes the two-way strategic merge patch that turns old into new
+// and applies it to the "status" subresource of the node named old.Name.
+// Marshalling and API errors are returned as-is; a failure to compute the
+// patch is wrapped with the node name.
+func patchNode(ctx context.Context, client clientset.Interface, old *v1.Node, new *v1.Node) error {
+	before, err := json.Marshal(old)
+	if err != nil {
+		return err
+	}
+	after, err := json.Marshal(new)
+	if err != nil {
+		return err
+	}
+
+	diff, err := strategicpatch.CreateTwoWayMergePatch(before, after, &v1.Node{})
+	if err != nil {
+		return fmt.Errorf("failed to create merge patch for node %q: %w", old.Name, err)
+	}
+
+	if _, err := client.CoreV1().Nodes().Patch(ctx, old.Name, types.StrategicMergePatchType, diff, metav1.PatchOptions{}, "status"); err != nil {
+		return err
+	}
+	return nil
+}
+
+// addExtendedResource sets extendedResourceQuantity of extendedResourceName in
+// both the status capacity and the status allocatable of node nodeName via
+// patchNode, then polls once per second for up to 30 seconds until the node
+// reports that capacity.
+//
+// Failing to read or patch the node fails the test immediately; errors while
+// polling are retried until the timeout expires.
+func addExtendedResource(clientSet clientset.Interface, nodeName, extendedResourceName string, extendedResourceQuantity resource.Quantity) {
+	extendedResource := v1.ResourceName(extendedResourceName)
+
+	ginkgo.By("Adding a custom resource")
+	originalNode, err := clientSet.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
+	framework.ExpectNoError(err)
+
+	node := originalNode.DeepCopy()
+	// Writing to a nil map panics, so create the lists if the node has none.
+	if node.Status.Capacity == nil {
+		node.Status.Capacity = v1.ResourceList{}
+	}
+	if node.Status.Allocatable == nil {
+		node.Status.Allocatable = v1.ResourceList{}
+	}
+	node.Status.Capacity[extendedResource] = extendedResourceQuantity
+	node.Status.Allocatable[extendedResource] = extendedResourceQuantity
+	// patchNode only marshals its arguments, so originalNode needs no extra copy.
+	err = patchNode(context.Background(), clientSet, originalNode, node)
+	framework.ExpectNoError(err)
+
+	gomega.Eventually(func() error {
+		// Return errors instead of asserting so a transient API failure is
+		// retried on the next poll rather than aborting the test.
+		current, err := clientSet.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
+		if err != nil {
+			return err
+		}
+
+		fakeResourceCapacity, exists := current.Status.Capacity[extendedResource]
+		if !exists {
+			return fmt.Errorf("node %s has no %s resource capacity", nodeName, extendedResourceName)
+		}
+		// Compare against the quantity that was requested, not a fixed value.
+		if fakeResourceCapacity.Cmp(extendedResourceQuantity) != 0 {
+			return fmt.Errorf("node %s has resource capacity %s, expected: %s", nodeName, fakeResourceCapacity.String(), extendedResourceQuantity.String())
+		}
+
+		return nil
+	}).WithTimeout(30 * time.Second).WithPolling(time.Second).ShouldNot(gomega.HaveOccurred())
+}
+
+// removeExtendedResource deletes extendedResourceName from both the status
+// capacity and the status allocatable of node nodeName via patchNode, then
+// polls once per second for up to 30 seconds until the node no longer reports
+// that capacity.
+//
+// Failing to read or patch the node fails the test immediately; errors while
+// polling are retried until the timeout expires.
+func removeExtendedResource(clientSet clientset.Interface, nodeName, extendedResourceName string) {
+	extendedResource := v1.ResourceName(extendedResourceName)
+
+	ginkgo.By("Removing a custom resource")
+	originalNode, err := clientSet.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
+	framework.ExpectNoError(err)
+
+	node := originalNode.DeepCopy()
+	delete(node.Status.Capacity, extendedResource)
+	delete(node.Status.Allocatable, extendedResource)
+	// patchNode only marshals its arguments, so originalNode needs no extra copy.
+	err = patchNode(context.Background(), clientSet, originalNode, node)
+	framework.ExpectNoError(err)
+
+	gomega.Eventually(func() error {
+		// Return errors instead of asserting so a transient API failure is
+		// retried on the next poll rather than aborting the test.
+		current, err := clientSet.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
+		if err != nil {
+			return err
+		}
+
+		if _, exists := current.Status.Capacity[extendedResource]; exists {
+			return fmt.Errorf("node %s has resource capacity %s which is expected to be removed", nodeName, extendedResourceName)
+		}
+
+		return nil
+	}).WithTimeout(30 * time.Second).WithPolling(time.Second).ShouldNot(gomega.HaveOccurred())
+}
+
 func doPodResizeTests() {
 	f := framework.NewDefaultFramework("pod-resize-test")
 	var podClient *e2epod.PodClient
@@ -426,10 +547,11 @@ func doPodResizeTests() {
 	})

 	type testCase struct {
-		name        string
-		containers  []TestContainerInfo
-		patchString string
-		expected    []TestContainerInfo
+		name                string
+		containers          []TestContainerInfo
+		patchString         string
+		expected            []TestContainerInfo
+		addExtendedResource bool
 	}

 	noRestart := v1.NotRequired
@@ -1131,6 +1253,31 @@ func doPodResizeTests() {
 				},
 			},
 		},
+		{
+			name: "Guaranteed QoS pod, one container - increase CPU & memory with an extended resource",
+			containers: []TestContainerInfo{
+				{
+					Name: "c1",
+					Resources: &ContainerResources{CPUReq: "100m", CPULim: "100m", MemReq: "200Mi", MemLim: "200Mi",
+						ExtendedResourceReq: "1", ExtendedResourceLim: "1"},
+					CPUPolicy: &noRestart,
+					MemPolicy: &noRestart,
+				},
+			},
+			patchString: `{"spec":{"containers":[
+					{"name":"c1", "resources":{"requests":{"cpu":"200m","memory":"400Mi"},"limits":{"cpu":"200m","memory":"400Mi"}}}
+					]}}`,
+			expected: []TestContainerInfo{
+				{
+					Name: "c1",
+					Resources: &ContainerResources{CPUReq: "200m", CPULim: "200m", MemReq: "400Mi", MemLim: "400Mi",
+						ExtendedResourceReq: "1", ExtendedResourceLim: "1"},
+					CPUPolicy: &noRestart,
+					MemPolicy: &noRestart,
+				},
+			},
+			addExtendedResource: true,
+		},
 	}

 	timeouts := framework.NewTimeoutContext()
@@ -1153,6 +1300,20 @@ func doPodResizeTests() {
 			testPod = makeTestPod(f.Namespace.Name, "testpod", tStamp, tc.containers)
 			testPod = e2epod.MustMixinRestrictedPodSecurity(testPod)

+			if tc.addExtendedResource {
+				nodes, err := e2enode.GetReadySchedulableNodes(context.Background(), f.ClientSet)
+				framework.ExpectNoError(err)
+
+				for _, node := range nodes.Items {
+					addExtendedResource(f.ClientSet, node.Name, fakeExtendedResource, resource.MustParse("123"))
+				}
+				defer func() {
+					for _, node := range nodes.Items {
+						removeExtendedResource(f.ClientSet, node.Name, fakeExtendedResource)
+					}
+				}()
+			}
+
 			ginkgo.By("creating pod")
 			newPod := podClient.CreateSync(ctx, testPod)

@@ -1161,41 +1322,49 @@ func doPodResizeTests() {
 			ginkgo.By("verifying initial pod resize policy is as expected")
 			verifyPodResizePolicy(newPod, tc.containers)

-			err := e2epod.WaitForPodCondition(ctx, f.ClientSet, newPod.Namespace, newPod.Name, "Ready", timeouts.PodStartShort, testutils.PodRunningReady)
-			framework.ExpectNoError(err, "pod %s/%s did not go running", newPod.Namespace, newPod.Name)
-			framework.Logf("pod %s/%s running", newPod.Namespace, newPod.Name)
-
 			ginkgo.By("verifying initial pod status resources")
 			verifyPodStatusResources(newPod, tc.containers)

-			ginkgo.By("patching pod for resize")
-			patchedPod, pErr = f.ClientSet.CoreV1().Pods(newPod.Namespace).Patch(ctx, newPod.Name,
-				types.StrategicMergePatchType, []byte(tc.patchString), metav1.PatchOptions{})
-			framework.ExpectNoError(pErr, "failed to patch pod for resize")
+			ginkgo.By("verifying initial cgroup config are as expected")
+			framework.ExpectNoError(verifyPodContainersCgroupValues(ctx, f, newPod, tc.containers))

-			ginkgo.By("verifying pod patched for resize")
-			verifyPodResources(patchedPod, tc.expected)
-			gomega.Eventually(ctx, verifyPodAllocations, timeouts.PodStartShort, timeouts.Poll).
-				WithArguments(patchedPod, tc.containers).
-				Should(gomega.BeNil(), "failed to verify Pod allocations for patchedPod")
+			patchAndVerify := func(patchString string, expectedContainers []TestContainerInfo, initialContainers []TestContainerInfo, opStr string, isRollback bool) {
+				ginkgo.By(fmt.Sprintf("patching pod for %s", opStr))
+				patchedPod, pErr = f.ClientSet.CoreV1().Pods(newPod.Namespace).Patch(context.TODO(), newPod.Name,
+					types.StrategicMergePatchType, []byte(patchString), metav1.PatchOptions{})
+				framework.ExpectNoError(pErr, fmt.Sprintf("failed to patch pod for %s", opStr))

-			ginkgo.By("waiting for resize to be actuated")
-			resizedPod := waitForPodResizeActuation(ctx, f, f.ClientSet, podClient, newPod, patchedPod, tc.expected)
+				ginkgo.By(fmt.Sprintf("verifying pod patched for %s", opStr))
+				verifyPodResources(patchedPod, expectedContainers)
+				gomega.Eventually(ctx, verifyPodAllocations, timeouts.PodStartShort, timeouts.Poll).
+					WithArguments(patchedPod, initialContainers).
+					Should(gomega.BeNil(), "failed to verify Pod allocations for patchedPod")

-			ginkgo.By("verifying pod resources after resize")
-			verifyPodResources(resizedPod, tc.expected)
+				ginkgo.By(fmt.Sprintf("waiting for %s to be actuated", opStr))
+				resizedPod := waitForPodResizeActuation(ctx, f, podClient, newPod, patchedPod, expectedContainers, initialContainers, isRollback)

-			ginkgo.By("verifying pod allocations after resize")
-			gomega.Eventually(ctx, verifyPodAllocations, timeouts.PodStartShort, timeouts.Poll).
-				WithArguments(resizedPod, tc.expected).
-				Should(gomega.BeNil(), "failed to verify Pod allocations for resizedPod")
+				// Check cgroup values only for containerd versions before 1.6.9
+				ginkgo.By(fmt.Sprintf("verifying pod container's cgroup values after %s", opStr))
+				framework.ExpectNoError(verifyPodContainersCgroupValues(ctx, f, resizedPod, expectedContainers))
+
+				ginkgo.By(fmt.Sprintf("verifying pod resources after %s", opStr))
+				verifyPodResources(resizedPod, expectedContainers)
+
+				ginkgo.By(fmt.Sprintf("verifying pod allocations after %s", opStr))
+				gomega.Eventually(ctx, verifyPodAllocations, timeouts.PodStartShort, timeouts.Poll).
+					WithArguments(resizedPod, expectedContainers).
+					Should(gomega.BeNil(), "failed to verify Pod allocations for resizedPod")
+			}
+
+			patchAndVerify(tc.patchString, tc.expected, tc.containers, "resize", false)
+
+			rbPatchStr, err := genPatchString(tc.containers)
+			framework.ExpectNoError(err)
+			// Resize has been actuated, test rollback
+			patchAndVerify(rbPatchStr, tc.containers, tc.expected, "rollback", true)

 			ginkgo.By("deleting pod")
-			deletePodSyncByName(ctx, f, newPod.Name)
-			// we need to wait for all containers to really be gone so cpumanager reconcile loop will not rewrite the cpu_manager_state.
-			// this is in turn needed because we will have an unavoidable (in the current framework) race with the
-			// reconcile loop which will make our attempt to delete the state file and to restore the old config go haywire
-			waitForAllContainerRemoval(ctx, newPod.Name, newPod.Namespace)
+			podClient.DeleteSync(ctx, newPod.Name, metav1.DeleteOptions{}, timeouts.PodDelete)
 		})
 	}
 }
@@ -1286,11 +1455,8 @@ func doPodResizeErrorTests() {
 				WithArguments(patchedPod, tc.expected).
 				Should(gomega.BeNil(), "failed to verify Pod allocations for patchedPod")

-			deletePodSyncByName(ctx, f, newPod.Name)
-			// we need to wait for all containers to really be gone so cpumanager reconcile loop will not rewrite the cpu_manager_state.
-			// this is in turn needed because we will have an unavoidable (in the current framework) race with the
-			// reconcile loop which will make our attempt to delete the state file and to restore the old config go haywire
-			waitForAllContainerRemoval(ctx, newPod.Name, newPod.Namespace)
+			ginkgo.By("deleting pod")
+			podClient.DeleteSync(ctx, newPod.Name, metav1.DeleteOptions{}, timeouts.PodDelete)
 		})
 	}
 }
@@ -1301,7 +1467,7 @@ func doPodResizeErrorTests() {
 //          b) api-server in services doesn't start with --enable-admission-plugins=ResourceQuota
 //             and is not possible to start it from TEST_ARGS
 //       Above tests are performed by doSheduletTests() and doPodResizeResourceQuotaTests()
-//       in test/node/pod_resize_test.go
+//       in test/e2e/node/pod_resize.go

 var _ = SIGDescribe("Pod InPlace Resize Container", framework.WithSerial(), feature.InPlacePodVerticalScaling, "[NodeAlphaFeature:InPlacePodVerticalScaling]", func() {
 	if !podOnCgroupv2Node {