Merge pull request #129951 from parkjeongryul/add-e2e-topology-manager-for-init-ctn

Add e2e test for topology manager with restartable init containers
Kubernetes Prow Robot 2025-02-28 04:38:23 -08:00, committed by GitHub
commit 88d2355c41

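For readers new to the feature under test: a restartable init container (the "sidecar" pattern from KEP-753) is an init container whose RestartPolicy field is set to Always, so it starts before the app containers but then keeps running alongside them instead of exiting. A minimal sketch using only the standard k8s.io/api/core/v1 types; the pod, container, and image names are illustrative, not taken from this PR:

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// bound to a variable because the pointer field below needs an address
var always = v1.ContainerRestartPolicyAlways

// sidecarPod sketches a pod whose init container stays up for the whole
// pod lifetime because its restart policy is Always.
func sidecarPod() *v1.Pod {
    return &v1.Pod{
        ObjectMeta: metav1.ObjectMeta{Name: "sidecar-demo"}, // illustrative
        Spec: v1.PodSpec{
            InitContainers: []v1.Container{{
                Name:          "sidecar", // illustrative
                Image:         "busybox",
                Command:       []string{"sh", "-c", "sleep 1d"},
                RestartPolicy: &always, // this is what makes it restartable
            }},
            Containers: []v1.Container{{
                Name:    "app", // illustrative
                Image:   "busybox",
                Command: []string{"sh", "-c", "sleep 1d"},
            }},
        },
    }
}

func main() {
    fmt.Println(sidecarPod().Name)
}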

@@ -69,6 +69,7 @@ type tmCtnAttribute struct {
     deviceName    string
     deviceRequest string
     deviceLimit   string
+    restartPolicy *v1.ContainerRestartPolicy
 }

 func detectNUMANodes() int {
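The new field is a pointer for the same reason v1.Container.RestartPolicy is one: nil marks a regular init container, while a non-nil Always marks a restartable one. Go does not allow taking the address of a constant, so the test file binds v1.ContainerRestartPolicyAlways to a package-level variable (the later hunks take &containerRestartPolicyAlways). A minimal sketch of both kinds of attribute, reusing this file's tmCtnAttribute type; the declaration itself is assumed, since the diff only shows its uses:

// a constant's address cannot be taken, hence the package-level variable
var containerRestartPolicyAlways = v1.ContainerRestartPolicyAlways

// sidecar-style: restartPolicy explicitly set to Always
var sidecarAttr = tmCtnAttribute{
    ctnName:       "restartable-init-container",
    cpuRequest:    "1000m",
    cpuLimit:      "1000m",
    restartPolicy: &containerRestartPolicyAlways,
}

// regular init container: restartPolicy left nil
var regularAttr = tmCtnAttribute{
    ctnName:    "init-container",
    cpuRequest: "1000m",
    cpuLimit:   "1000m",
}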
@@ -159,6 +160,7 @@ func makeContainers(ctnCmd string, ctnAttributes []tmCtnAttribute) (ctns []v1.Co
             },
         },
         Command:       []string{"sh", "-c", ctnCmd},
+        RestartPolicy: ctnAttr.restartPolicy,
     }
     if ctnAttr.deviceName != "" {
         ctn.Resources.Requests[v1.ResourceName(ctnAttr.deviceName)] = resource.MustParse(ctnAttr.deviceRequest)
@@ -171,8 +173,12 @@ func makeContainers(ctnCmd string, ctnAttributes []tmCtnAttribute) (ctns []v1.Co
 func makeTopologyManagerTestPod(podName string, tmCtnAttributes, tmInitCtnAttributes []tmCtnAttribute) *v1.Pod {
     var containers, initContainers []v1.Container
-    if len(tmInitCtnAttributes) > 0 {
-        initContainers = makeContainers(numaAlignmentCommand, tmInitCtnAttributes)
+    for _, attr := range tmInitCtnAttributes {
+        cmd := numaAlignmentCommand
+        if attr.restartPolicy != nil && *attr.restartPolicy == v1.ContainerRestartPolicyAlways {
+            cmd = numaAlignmentSleepCommand
+        }
+        initContainers = append(initContainers, makeContainers(cmd, []tmCtnAttribute{attr})...)
     }

     containers = makeContainers(numaAlignmentSleepCommand, tmCtnAttributes)
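The command switch above is the heart of the change: the validation code reads container logs, and a restartable init container must also keep running for the pod to become Ready, so it gets the long-running command that the app containers already use, while regular init containers keep the short-lived one. The two constants are defined earlier in this file and are not shown in this diff; a hedged sketch of the shape they plausibly have (names suffixed Sketch to flag the assumption):

// Assumed shape, not the verbatim constants: both export the allowed-CPU
// list and dump the environment to the log (the validation parses it back
// out of the logs); the sleep variant then stays alive so the pod can
// reach Ready and the log remains retrievable while it runs.
const (
    numaAlignmentCommandSketch      = `export CPULIST_ALLOWED=$(grep Cpus_allowed_list /proc/self/status | cut -f2); env`
    numaAlignmentSleepCommandSketch = numaAlignmentCommandSketch + `; sleep 1d`
)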
@@ -346,6 +352,25 @@ func findSRIOVResource(node *v1.Node) (string, int64) {
 }

 func validatePodAlignment(ctx context.Context, f *framework.Framework, pod *v1.Pod, envInfo *testEnvInfo) {
+    for _, cnt := range pod.Spec.InitContainers {
+        // only check restartable init containers, skip regular init containers
+        if cnt.RestartPolicy == nil || *cnt.RestartPolicy != v1.ContainerRestartPolicyAlways {
+            continue
+        }
+
+        ginkgo.By(fmt.Sprintf("validating the init container %s on Gu pod %s", cnt.Name, pod.Name))
+
+        logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, pod.Name, cnt.Name)
+        framework.ExpectNoError(err, "expected log not found in init container [%s] of pod [%s]", cnt.Name, pod.Name)
+
+        framework.Logf("got init container logs: %v", logs)
+
+        numaRes, err := checkNUMAAlignment(f, pod, &cnt, logs, envInfo)
+        framework.ExpectNoError(err, "NUMA Alignment check failed for init container [%s] of pod [%s]", cnt.Name, pod.Name)
+        if numaRes != nil {
+            framework.Logf("NUMA resources for init container %s/%s: %s", pod.Name, cnt.Name, numaRes.String())
+        }
+    }
+
     for _, cnt := range pod.Spec.Containers {
         ginkgo.By(fmt.Sprintf("validating the container %s on Gu pod %s", cnt.Name, pod.Name))
@@ -367,6 +392,23 @@ func validatePodAlignmentWithPodScope(ctx context.Context, f *framework.Framewor
     podsNUMA := make(map[int]int)

     ginkgo.By(fmt.Sprintf("validate pod scope alignment for %s pod", pod.Name))
+    for _, cnt := range pod.Spec.InitContainers {
+        // only check restartable init containers, skip regular init containers
+        if cnt.RestartPolicy == nil || *cnt.RestartPolicy != v1.ContainerRestartPolicyAlways {
+            continue
+        }
+
+        logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, pod.Name, cnt.Name)
+        framework.ExpectNoError(err, "NUMA alignment failed for init container [%s] of pod [%s]", cnt.Name, pod.Name)
+        envMap, err := makeEnvMap(logs)
+        framework.ExpectNoError(err, "NUMA alignment failed for init container [%s] of pod [%s]", cnt.Name, pod.Name)
+        cpuToNUMA, err := getCPUToNUMANodeMapFromEnv(f, pod, &cnt, envMap, envInfo.numaNodes)
+        framework.ExpectNoError(err, "NUMA alignment failed for init container [%s] of pod [%s]", cnt.Name, pod.Name)
+        for cpuID, numaID := range cpuToNUMA {
+            podsNUMA[cpuID] = numaID
+        }
+    }
+
     for _, cnt := range pod.Spec.Containers {
         logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, pod.Name, cnt.Name)
         framework.ExpectNoError(err, "NUMA alignment failed for container [%s] of pod [%s]", cnt.Name, pod.Name)
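For pod scope, the loop above only pools each sidecar's CPU-to-NUMA-node pairs into podsNUMA; the actual assertion lives further down in this function, outside the hunk, and must see the CPUs of app containers and sidecars together land on a single NUMA node. A sketch of that final check, assuming only what the loop establishes (podsNUMA maps cpuID to numaID):

// every CPU handed to the pod must sit on one NUMA node; count the
// distinct nodes and fail the test if there is more than one.
nodes := make(map[int]struct{})
for _, numaID := range podsNUMA {
    nodes[numaID] = struct{}{}
}
if len(nodes) > 1 {
    framework.Failf("pod %s is not NUMA aligned at pod scope: its CPUs span %d NUMA nodes", pod.Name, len(nodes))
}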
@@ -440,7 +482,7 @@ func runTopologyManagerPositiveTest(ctx context.Context, f *framework.Framework,
     }

     // per https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/693-topology-manager/README.md#multi-numa-systems-tests
-    // we can do a menaingful validation only when using the single-numa node policy
+    // we can do a meaningful validation only when using the single-numa node policy
     if envInfo.policy == topologymanager.PolicySingleNumaNode {
         for _, pod := range podMap {
             validatePodAlignment(ctx, f, pod, envInfo)
@@ -733,6 +775,94 @@ func runTMScopeResourceAlignmentTestSuite(ctx context.Context, f *framework.Fram
     }
     runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)
ginkgo.By(fmt.Sprintf("Admit one guaranteed pod with restartable init container, 1 core and 1 %s device", sd.resourceName))
initCtnAttrs = []tmCtnAttribute{
{
ctnName: "restartable-init-container",
cpuRequest: "1000m",
cpuLimit: "1000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
restartPolicy: &containerRestartPolicyAlways,
},
}
ctnAttrs = []tmCtnAttribute{
{
ctnName: "gu-container",
cpuRequest: "1000m",
cpuLimit: "1000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
},
}
runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)
ginkgo.By(fmt.Sprintf("Admit one guaranteed pod with multiple restartable init containers, each container with 1 CPU core. Use 1 %s device", sd.resourceName))
initCtnAttrs = []tmCtnAttribute{
{
ctnName: "restartable-init-container-1",
cpuRequest: "1000m",
cpuLimit: "1000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
restartPolicy: &containerRestartPolicyAlways,
},
{
ctnName: "restartable-init-container-2",
cpuRequest: "1000m",
cpuLimit: "1000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
restartPolicy: &containerRestartPolicyAlways,
},
}
ctnAttrs = []tmCtnAttribute{
{
ctnName: "gu-container",
cpuRequest: "1000m",
cpuLimit: "1000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
},
}
runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)
+
+    coresReq = fmt.Sprintf("%dm", (numCores/2+1)*1000)
+    ginkgo.By(fmt.Sprintf("Trying to admit a guaranteed pod with two restartable init containers where the sum of their CPU requests (%d cores) exceeds NUMA capacity. The request should be rejected", (numCores/2+1)*2))
+    initCtnAttrs = []tmCtnAttribute{
+        {
+            ctnName:       "restartable-init-container-1",
+            cpuRequest:    coresReq,
+            cpuLimit:      coresReq,
+            deviceRequest: "1",
+            deviceLimit:   "1",
+            restartPolicy: &containerRestartPolicyAlways,
+        },
+        {
+            ctnName:       "restartable-init-container-2",
+            cpuRequest:    coresReq,
+            cpuLimit:      coresReq,
+            deviceRequest: "1",
+            deviceLimit:   "1",
+            restartPolicy: &containerRestartPolicyAlways,
+        },
+    }
+    ctnAttrs = []tmCtnAttribute{
+        {
+            ctnName:       "gu-container",
+            cpuRequest:    "1000m",
+            cpuLimit:      "1000m",
+            deviceRequest: "1",
+            deviceLimit:   "1",
+        },
+    }
+    runTopologyManagerNegativeTest(ctx, f, ctnAttrs, initCtnAttrs, envInfo)

     teardownSRIOVConfigOrFail(ctx, f, sd)
 }
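The negative case above works because sidecars change how a pod's effective request is computed: regular init containers run one at a time, so only their maximum counts, but restartable init containers hold their CPUs for the pod's whole lifetime, so their requests accumulate on top of everything that starts after them. Each sidecar in the test asks for numCores/2+1 cores, so together they need numCores+2 and cannot be NUMA-aligned. A CPU-only sketch of the accumulation rule from the sidecar KEP (the helper name is illustrative, not an upstream API):

// effectiveCPURequestMilli sketches (CPU-only, simplified) the KEP-753
// accounting: each regular init container runs alongside the sidecars
// started before it, and the app containers run alongside all sidecars.
func effectiveCPURequestMilli(pod *v1.Pod) int64 {
    var sidecarSum, peak int64
    for _, c := range pod.Spec.InitContainers {
        req := c.Resources.Requests.Cpu().MilliValue()
        if c.RestartPolicy != nil && *c.RestartPolicy == v1.ContainerRestartPolicyAlways {
            sidecarSum += req // sidecars keep their CPUs from start-up onward
            continue
        }
        if v := sidecarSum + req; v > peak { // regular init container phase
            peak = v
        }
    }
    var appSum int64
    for _, c := range pod.Spec.Containers {
        appSum += c.Resources.Requests.Cpu().MilliValue()
    }
    if v := sidecarSum + appSum; v > peak { // steady-state phase
        peak = v
    }
    return peak
}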
@@ -825,6 +955,30 @@ func runTopologyManagerNodeAlignmentSuiteTests(ctx context.Context, f *framework
     }
     runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)
ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with restartable init container - each with 1 core, 1 %s device", sd.resourceName))
initCtnAttrs = []tmCtnAttribute{
{
ctnName: "restartable-init-container",
cpuRequest: "1000m",
cpuLimit: "1000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
restartPolicy: &containerRestartPolicyAlways,
},
}
ctnAttrs = []tmCtnAttribute{
{
ctnName: "gu-container",
cpuRequest: "1000m",
cpuLimit: "1000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
},
}
runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)
         // testing more complex conditions require knowledge about the system cpu+bus topology
     }
@@ -889,6 +1043,39 @@ func runTopologyManagerNodeAlignmentSuiteTests(ctx context.Context, f *framework
             },
         }
         runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)
ginkgo.By(fmt.Sprintf("Successfully admit pod with multiple restartable init containers, each with 1 core, 1 %s device", sd.resourceName))
initCtnAttrs = []tmCtnAttribute{
{
ctnName: "restartable-init-container-1",
cpuRequest: "1000m",
cpuLimit: "1000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
restartPolicy: &containerRestartPolicyAlways,
},
{
ctnName: "restartable-init-container-2",
cpuRequest: "1000m",
cpuLimit: "1000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
restartPolicy: &containerRestartPolicyAlways,
},
}
ctnAttrs = []tmCtnAttribute{
{
ctnName: "gu-container",
cpuRequest: "1000m",
cpuLimit: "1000m",
deviceName: sd.resourceName,
deviceRequest: "1",
deviceLimit: "1",
},
}
runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)
} }
     // this is the only policy that can guarantee reliable rejects

@@ -908,6 +1095,65 @@ func runTopologyManagerNodeAlignmentSuiteTests(ctx context.Context, f *framework
             },
         }
         runTopologyManagerNegativeTest(ctx, f, ctnAttrs, initCtnAttrs, envInfo)
+
+        if sd.resourceAmount >= 3 {
+            ginkgo.By(fmt.Sprintf("Trying to admit a guaranteed pod with a restartable init container demanding %d cores, 1 %s device - and it should be rejected", numCores, sd.resourceName))
+            initCtnAttrs = []tmCtnAttribute{
+                {
+                    ctnName:       "restartable-init-container",
+                    cpuRequest:    excessCoresReq,
+                    cpuLimit:      excessCoresReq,
+                    deviceName:    sd.resourceName,
+                    deviceRequest: "1",
+                    deviceLimit:   "1",
+                    restartPolicy: &containerRestartPolicyAlways,
+                },
+            }
+            ctnAttrs = []tmCtnAttribute{
+                {
+                    ctnName:       "gu-container",
+                    cpuRequest:    "1000m",
+                    cpuLimit:      "1000m",
+                    deviceName:    sd.resourceName,
+                    deviceRequest: "1",
+                    deviceLimit:   "1",
+                },
+            }
+            runTopologyManagerNegativeTest(ctx, f, ctnAttrs, initCtnAttrs, envInfo)
+
+            ginkgo.By("Trying to admit a guaranteed pod with two restartable init containers where the second one cannot achieve NUMA alignment - and it should be rejected")
+            initCtnAttrs = []tmCtnAttribute{
+                {
+                    ctnName:       "restartable-init-container-1",
+                    cpuRequest:    "1000m",
+                    cpuLimit:      "1000m",
+                    deviceName:    sd.resourceName,
+                    deviceRequest: "1",
+                    deviceLimit:   "1",
+                    restartPolicy: &containerRestartPolicyAlways,
+                },
+                {
+                    ctnName:       "restartable-init-container-2",
+                    cpuRequest:    excessCoresReq,
+                    cpuLimit:      excessCoresReq,
+                    deviceName:    sd.resourceName,
+                    deviceRequest: "1",
+                    deviceLimit:   "1",
+                    restartPolicy: &containerRestartPolicyAlways,
+                },
+            }
+            ctnAttrs = []tmCtnAttribute{
+                {
+                    ctnName:       "gu-container",
+                    cpuRequest:    "1000m",
+                    cpuLimit:      "1000m",
+                    deviceName:    sd.resourceName,
+                    deviceRequest: "1",
+                    deviceLimit:   "1",
+                },
+            }
+            runTopologyManagerNegativeTest(ctx, f, ctnAttrs, initCtnAttrs, envInfo)
+        }
     }
 }