diff --git a/test/e2e_node/pod_resize_test.go b/test/e2e_node/pod_resize_test.go index 61c8a741140..56f7f35315a 100644 --- a/test/e2e_node/pod_resize_test.go +++ b/test/e2e_node/pod_resize_test.go @@ -54,6 +54,8 @@ const ( Cgroupv2CPURequest string = "/sys/fs/cgroup/cpu.weight" CPUPeriod string = "100000" MinContainerRuntimeVersion string = "1.6.9" + + fakeExtendedResource = "dummy.com/dummy" ) var ( @@ -64,18 +66,21 @@ var ( ) type ContainerResources struct { - CPUReq string - CPULim string - MemReq string - MemLim string - EphStorReq string - EphStorLim string + CPUReq string + CPULim string + MemReq string + MemLim string + EphStorReq string + EphStorLim string + ExtendedResourceReq string + ExtendedResourceLim string } type ContainerAllocations struct { - CPUAlloc string - MemAlloc string - ephStorAlloc string + CPUAlloc string + MemAlloc string + ephStorAlloc string + ExtendedResourceAlloc string } type TestContainerInfo struct { @@ -87,6 +92,28 @@ type TestContainerInfo struct { RestartCount int32 } +type containerPatch struct { + Name string `json:"name"` + Resources struct { + Requests struct { + CPU string `json:"cpu,omitempty"` + Memory string `json:"memory,omitempty"` + EphStor string `json:"ephemeral-storage,omitempty"` + } `json:"requests"` + Limits struct { + CPU string `json:"cpu,omitempty"` + Memory string `json:"memory,omitempty"` + EphStor string `json:"ephemeral-storage,omitempty"` + } `json:"limits"` + } `json:"resources"` +} + +type patchSpec struct { + Spec struct { + Containers []containerPatch `json:"containers"` + } `json:"spec"` +} + func supportsInPlacePodVerticalScaling(ctx context.Context, f *framework.Framework) bool { node := getLocalNode(ctx, f) re := regexp.MustCompile("containerd://(.*)") @@ -418,6 +445,100 @@ func waitForPodResizeActuation(ctx context.Context, f *framework.Framework, c cl return resizedPod } +func genPatchString(containers []TestContainerInfo) (string, error) { + var patch patchSpec + + for _, container := range containers { + var cPatch containerPatch + cPatch.Name = container.Name + cPatch.Resources.Requests.CPU = container.Resources.CPUReq + cPatch.Resources.Requests.Memory = container.Resources.MemReq + cPatch.Resources.Limits.CPU = container.Resources.CPULim + cPatch.Resources.Limits.Memory = container.Resources.MemLim + + patch.Spec.Containers = append(patch.Spec.Containers, cPatch) + } + + patchBytes, err := json.Marshal(patch) + if err != nil { + return "", err + } + + return string(patchBytes), nil +} + +func patchNode(ctx context.Context, client clientset.Interface, old *v1.Node, new *v1.Node) error { + oldData, err := json.Marshal(old) + if err != nil { + return err + } + + newData, err := json.Marshal(new) + if err != nil { + return err + } + patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, &v1.Node{}) + if err != nil { + return fmt.Errorf("failed to create merge patch for node %q: %w", old.Name, err) + } + _, err = client.CoreV1().Nodes().Patch(ctx, old.Name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}, "status") + return err +} + +func addExtendedResource(clientSet clientset.Interface, nodeName, extendedResourceName string, extendedResourceQuantity resource.Quantity) { + extendedResource := v1.ResourceName(extendedResourceName) + + ginkgo.By("Adding a custom resource") + OriginalNode, err := clientSet.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) + framework.ExpectNoError(err) + + node := OriginalNode.DeepCopy() + 
node.Status.Capacity[extendedResource] = extendedResourceQuantity + node.Status.Allocatable[extendedResource] = extendedResourceQuantity + err = patchNode(context.Background(), clientSet, OriginalNode.DeepCopy(), node) + framework.ExpectNoError(err) + + gomega.Eventually(func() error { + node, err = clientSet.CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{}) + framework.ExpectNoError(err) + + fakeResourceCapacity, exists := node.Status.Capacity[extendedResource] + if !exists { + return fmt.Errorf("node %s has no %s resource capacity", node.Name, extendedResourceName) + } + if expectedResource := resource.MustParse("123"); fakeResourceCapacity.Cmp(expectedResource) != 0 { + return fmt.Errorf("node %s has resource capacity %s, expected: %s", node.Name, fakeResourceCapacity.String(), expectedResource.String()) + } + + return nil + }).WithTimeout(30 * time.Second).WithPolling(time.Second).ShouldNot(gomega.HaveOccurred()) +} + +func removeExtendedResource(clientSet clientset.Interface, nodeName, extendedResourceName string) { + extendedResource := v1.ResourceName(extendedResourceName) + + ginkgo.By("Removing a custom resource") + originalNode, err := clientSet.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) + framework.ExpectNoError(err) + + node := originalNode.DeepCopy() + delete(node.Status.Capacity, extendedResource) + delete(node.Status.Allocatable, extendedResource) + err = patchNode(context.Background(), clientSet, originalNode.DeepCopy(), node) + framework.ExpectNoError(err) + + gomega.Eventually(func() error { + node, err = clientSet.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{}) + framework.ExpectNoError(err) + + if _, exists := node.Status.Capacity[extendedResource]; exists { + return fmt.Errorf("node %s has resource capacity %s which is expected to be removed", node.Name, extendedResourceName) + } + + return nil + }).WithTimeout(30 * time.Second).WithPolling(time.Second).ShouldNot(gomega.HaveOccurred()) +} + func doPodResizeTests() { f := framework.NewDefaultFramework("pod-resize-test") var podClient *e2epod.PodClient @@ -426,10 +547,11 @@ func doPodResizeTests() { }) type testCase struct { - name string - containers []TestContainerInfo - patchString string - expected []TestContainerInfo + name string + containers []TestContainerInfo + patchString string + expected []TestContainerInfo + addExtendedResource bool } noRestart := v1.NotRequired @@ -1131,6 +1253,31 @@ func doPodResizeTests() { }, }, }, + { + name: "Guaranteed QoS pod, one container - increase CPU & memory with an extended resource", + containers: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "100m", CPULim: "100m", MemReq: "200Mi", MemLim: "200Mi", + ExtendedResourceReq: "1", ExtendedResourceLim: "1"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + }, + patchString: `{"spec":{"containers":[ + {"name":"c1", "resources":{"requests":{"cpu":"200m","memory":"400Mi"},"limits":{"cpu":"200m","memory":"400Mi"}}} + ]}}`, + expected: []TestContainerInfo{ + { + Name: "c1", + Resources: &ContainerResources{CPUReq: "200m", CPULim: "200m", MemReq: "400Mi", MemLim: "400Mi", + ExtendedResourceReq: "1", ExtendedResourceLim: "1"}, + CPUPolicy: &noRestart, + MemPolicy: &noRestart, + }, + }, + addExtendedResource: true, + }, } timeouts := framework.NewTimeoutContext() @@ -1153,6 +1300,20 @@ func doPodResizeTests() { testPod = makeTestPod(f.Namespace.Name, "testpod", tStamp, tc.containers) testPod = 
e2epod.MustMixinRestrictedPodSecurity(testPod) + if tc.addExtendedResource { + nodes, err := e2enode.GetReadySchedulableNodes(context.Background(), f.ClientSet) + framework.ExpectNoError(err) + + for _, node := range nodes.Items { + addExtendedResource(f.ClientSet, node.Name, fakeExtendedResource, resource.MustParse("123")) + } + defer func() { + for _, node := range nodes.Items { + removeExtendedResource(f.ClientSet, node.Name, fakeExtendedResource) + } + }() + } + ginkgo.By("creating pod") newPod := podClient.CreateSync(ctx, testPod) @@ -1161,41 +1322,49 @@ func doPodResizeTests() { ginkgo.By("verifying initial pod resize policy is as expected") verifyPodResizePolicy(newPod, tc.containers) - err := e2epod.WaitForPodCondition(ctx, f.ClientSet, newPod.Namespace, newPod.Name, "Ready", timeouts.PodStartShort, testutils.PodRunningReady) - framework.ExpectNoError(err, "pod %s/%s did not go running", newPod.Namespace, newPod.Name) - framework.Logf("pod %s/%s running", newPod.Namespace, newPod.Name) - ginkgo.By("verifying initial pod status resources") verifyPodStatusResources(newPod, tc.containers) - ginkgo.By("patching pod for resize") - patchedPod, pErr = f.ClientSet.CoreV1().Pods(newPod.Namespace).Patch(ctx, newPod.Name, - types.StrategicMergePatchType, []byte(tc.patchString), metav1.PatchOptions{}) - framework.ExpectNoError(pErr, "failed to patch pod for resize") + ginkgo.By("verifying initial cgroup config are as expected") + framework.ExpectNoError(verifyPodContainersCgroupValues(ctx, f, newPod, tc.containers)) - ginkgo.By("verifying pod patched for resize") - verifyPodResources(patchedPod, tc.expected) - gomega.Eventually(ctx, verifyPodAllocations, timeouts.PodStartShort, timeouts.Poll). - WithArguments(patchedPod, tc.containers). - Should(gomega.BeNil(), "failed to verify Pod allocations for patchedPod") + patchAndVerify := func(patchString string, expectedContainers []TestContainerInfo, initialContainers []TestContainerInfo, opStr string, isRollback bool) { + ginkgo.By(fmt.Sprintf("patching pod for %s", opStr)) + patchedPod, pErr = f.ClientSet.CoreV1().Pods(newPod.Namespace).Patch(context.TODO(), newPod.Name, + types.StrategicMergePatchType, []byte(patchString), metav1.PatchOptions{}) + framework.ExpectNoError(pErr, fmt.Sprintf("failed to patch pod for %s", opStr)) - ginkgo.By("waiting for resize to be actuated") - resizedPod := waitForPodResizeActuation(ctx, f, f.ClientSet, podClient, newPod, patchedPod, tc.expected) + ginkgo.By(fmt.Sprintf("verifying pod patched for %s", opStr)) + verifyPodResources(patchedPod, expectedContainers) + gomega.Eventually(ctx, verifyPodAllocations, timeouts.PodStartShort, timeouts.Poll). + WithArguments(patchedPod, initialContainers). + Should(gomega.BeNil(), "failed to verify Pod allocations for patchedPod") - ginkgo.By("verifying pod resources after resize") - verifyPodResources(resizedPod, tc.expected) + ginkgo.By(fmt.Sprintf("waiting for %s to be actuated", opStr)) + resizedPod := waitForPodResizeActuation(ctx, f, podClient, newPod, patchedPod, expectedContainers, initialContainers, isRollback) - ginkgo.By("verifying pod allocations after resize") - gomega.Eventually(ctx, verifyPodAllocations, timeouts.PodStartShort, timeouts.Poll). - WithArguments(resizedPod, tc.expected). 
- Should(gomega.BeNil(), "failed to verify Pod allocations for resizedPod") + // Check cgroup values only for containerd versions before 1.6.9 + ginkgo.By(fmt.Sprintf("verifying pod container's cgroup values after %s", opStr)) + framework.ExpectNoError(verifyPodContainersCgroupValues(ctx, f, resizedPod, expectedContainers)) + + ginkgo.By(fmt.Sprintf("verifying pod resources after %s", opStr)) + verifyPodResources(resizedPod, expectedContainers) + + ginkgo.By(fmt.Sprintf("verifying pod allocations after %s", opStr)) + gomega.Eventually(ctx, verifyPodAllocations, timeouts.PodStartShort, timeouts.Poll). + WithArguments(resizedPod, expectedContainers). + Should(gomega.BeNil(), "failed to verify Pod allocations for resizedPod") + } + + patchAndVerify(tc.patchString, tc.expected, tc.containers, "resize", false) + + rbPatchStr, err := genPatchString(tc.containers) + framework.ExpectNoError(err) + // Resize has been actuated, test rollback + patchAndVerify(rbPatchStr, tc.containers, tc.expected, "rollback", true) ginkgo.By("deleting pod") - deletePodSyncByName(ctx, f, newPod.Name) - // we need to wait for all containers to really be gone so cpumanager reconcile loop will not rewrite the cpu_manager_state. - // this is in turn needed because we will have an unavoidable (in the current framework) race with the - // reconcile loop which will make our attempt to delete the state file and to restore the old config go haywire - waitForAllContainerRemoval(ctx, newPod.Name, newPod.Namespace) + podClient.DeleteSync(ctx, newPod.Name, metav1.DeleteOptions{}, timeouts.PodDelete) }) } } @@ -1286,11 +1455,8 @@ func doPodResizeErrorTests() { WithArguments(patchedPod, tc.expected). Should(gomega.BeNil(), "failed to verify Pod allocations for patchedPod") - deletePodSyncByName(ctx, f, newPod.Name) - // we need to wait for all containers to really be gone so cpumanager reconcile loop will not rewrite the cpu_manager_state. - // this is in turn needed because we will have an unavoidable (in the current framework) race with the - // reconcile loop which will make our attempt to delete the state file and to restore the old config go haywire - waitForAllContainerRemoval(ctx, newPod.Name, newPod.Namespace) + ginkgo.By("deleting pod") + podClient.DeleteSync(ctx, newPod.Name, metav1.DeleteOptions{}, timeouts.PodDelete) }) } } @@ -1301,7 +1467,7 @@ func doPodResizeErrorTests() { // b) api-server in services doesn't start with --enable-admission-plugins=ResourceQuota // and is not possible to start it from TEST_ARGS // Above tests are performed by doSheduletTests() and doPodResizeResourceQuotaTests() -// in test/node/pod_resize_test.go +// in test/e2e/node/pod_resize.go var _ = SIGDescribe("Pod InPlace Resize Container", framework.WithSerial(), feature.InPlacePodVerticalScaling, "[NodeAlphaFeature:InPlacePodVerticalScaling]", func() { if !podOnCgroupv2Node {
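
The node-status mutation done by addExtendedResource/removeExtendedResource goes through patchNode, which computes a strategic two-way merge patch and applies it to the node's "status" subresource. The following standalone sketch (not part of the patch) reproduces just the patch computation so the resulting bytes can be inspected; the node name "worker-0" is illustrative and no API server is contacted.

// Sketch of the merge-patch mechanics behind patchNode, using the same
// dummy.com/dummy extended resource and quantity ("123") as the test.
package main

import (
	"encoding/json"
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/apimachinery/pkg/util/strategicpatch"
)

func main() {
	original := &v1.Node{}
	original.Name = "worker-0" // hypothetical node name
	original.Status.Capacity = v1.ResourceList{}
	original.Status.Allocatable = v1.ResourceList{}

	// Mirror addExtendedResource: copy the node, then add the fake
	// extended resource to both capacity and allocatable.
	modified := original.DeepCopy()
	qty := resource.MustParse("123")
	modified.Status.Capacity["dummy.com/dummy"] = qty
	modified.Status.Allocatable["dummy.com/dummy"] = qty

	oldData, err := json.Marshal(original)
	if err != nil {
		panic(err)
	}
	newData, err := json.Marshal(modified)
	if err != nil {
		panic(err)
	}

	// Same call the test's patchNode helper makes before issuing
	// Nodes().Patch(..., types.StrategicMergePatchType, patchBytes, ..., "status").
	patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, &v1.Node{})
	if err != nil {
		panic(err)
	}
	fmt.Println(string(patchBytes))
}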
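For the rollback step, genPatchString re-marshals the original TestContainerInfo resources into a strategic-merge patch string; only cpu and memory are emitted, so the extended-resource request/limit are left untouched by the rollback. A minimal sketch of that marshaling, with containerPatch/patchSpec copied from the test and the values taken from the new Guaranteed-QoS test case:

// Prints the rollback patch genPatchString would produce for container c1.
// Ephemeral-storage fields are dropped by omitempty because they are never set.
package main

import (
	"encoding/json"
	"fmt"
)

type containerPatch struct {
	Name      string `json:"name"`
	Resources struct {
		Requests struct {
			CPU     string `json:"cpu,omitempty"`
			Memory  string `json:"memory,omitempty"`
			EphStor string `json:"ephemeral-storage,omitempty"`
		} `json:"requests"`
		Limits struct {
			CPU     string `json:"cpu,omitempty"`
			Memory  string `json:"memory,omitempty"`
			EphStor string `json:"ephemeral-storage,omitempty"`
		} `json:"limits"`
	} `json:"resources"`
}

type patchSpec struct {
	Spec struct {
		Containers []containerPatch `json:"containers"`
	} `json:"spec"`
}

func main() {
	var patch patchSpec
	var c containerPatch
	c.Name = "c1"
	c.Resources.Requests.CPU = "100m"
	c.Resources.Requests.Memory = "200Mi"
	c.Resources.Limits.CPU = "100m"
	c.Resources.Limits.Memory = "200Mi"
	patch.Spec.Containers = append(patch.Spec.Containers, c)

	b, err := json.Marshal(patch)
	if err != nil {
		panic(err)
	}
	// Equivalent in shape to the hand-written patchString literals used by
	// the resize test cases, e.g.
	// {"spec":{"containers":[{"name":"c1","resources":{"requests":{...},"limits":{...}}}]}}
	fmt.Println(string(b))
}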
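makeTestPod is outside this hunk, so exactly how ExtendedResourceReq/ExtendedResourceLim are wired into the container spec is an assumption here; presumably the new test case starts from a ResourceRequirements along these lines, with the extended-resource request equal to its limit and integral (as the API requires for extended resources) and cpu/memory requests equal to limits so the pod stays in the Guaranteed QoS class:

// Hedged sketch of the initial resources for the extended-resource test case.
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	const fakeExtendedResource = "dummy.com/dummy"

	res := v1.ResourceRequirements{
		Requests: v1.ResourceList{
			v1.ResourceCPU:       resource.MustParse("100m"),
			v1.ResourceMemory:    resource.MustParse("200Mi"),
			fakeExtendedResource: resource.MustParse("1"),
		},
		Limits: v1.ResourceList{
			v1.ResourceCPU:       resource.MustParse("100m"),
			v1.ResourceMemory:    resource.MustParse("200Mi"),
			fakeExtendedResource: resource.MustParse("1"),
		},
	}
	// The resize patch only changes cpu and memory; dummy.com/dummy stays at 1
	// before and after both the resize and the rollback.
	fmt.Printf("requests=%v\nlimits=%v\n", res.Requests, res.Limits)
}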