test: Add E2E for job completions with cpu reservation

Add an E2E test that creates a job whose pods should all succeed. The
job reserves a fixed amount of CPU per pod and uses a large number of
completions and a high parallelism. Used to reproduce
github.com/kubernetes/kubernetes/issues/106884.

Signed-off-by: David Porter <david@porter.me>
Authored by David Porter on 2022-02-25 00:33:03 -08:00, committed by Clayton Coleman
parent ca98714ec0
commit c70f1955c4
2 changed files with 60 additions and 0 deletions

View File

@@ -326,6 +326,14 @@ func findContainerStatus(status *v1.PodStatus, containerID string) (containerSta
}
// TerminatePod ensures that the status of containers is properly defaulted at the end of the pod
// lifecycle. As the Kubelet must reconcile with the container runtime to observe container status
// there is always the possibility we are unable to retrieve one or more container statuses due to
// garbage collection, admin action, or loss of temporary data on a restart. This method ensures
// that any absent container status is treated as a failure so that we do not incorrectly describe
// the pod as successful. If we have not yet initialized the pod in the presence of init containers,
// the init container failure status is sufficient to describe the pod as failing, and we do not need
// to override waiting containers (unless there is evidence the pod previously started those containers).
func (m *manager) TerminatePod(pod *v1.Pod) {
m.podStatusesLock.Lock()
defer m.podStatusesLock.Unlock()

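The defaulting rule described in the comment above can be illustrated with a small standalone sketch. This is not the kubelet's implementation; the helper name and the exact reason and exit-code values are assumptions chosen for illustration, using the k8s.io/api/core/v1 types.

// Sketch only: a hypothetical helper applying the rule from the TerminatePod
// comment. Any container status that is not already terminated is defaulted
// to a terminated-with-failure state, so a status we could not observe can
// never make the pod look successful.
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

func defaultUnknownStatusesToFailed(statuses []v1.ContainerStatus) []v1.ContainerStatus {
	for i := range statuses {
		if statuses[i].State.Terminated != nil {
			// The runtime reported a real terminal state; keep it as-is.
			continue
		}
		// Waiting or running at pod termination: record a failure.
		// Reason and exit code here are illustrative values, not the kubelet's.
		statuses[i].State = v1.ContainerState{
			Terminated: &v1.ContainerStateTerminated{
				ExitCode: 137,
				Reason:   "ContainerStatusUnknown",
				Message:  "The container could not be located when the pod was terminated",
			},
		}
	}
	return statuses
}

func main() {
	statuses := []v1.ContainerStatus{
		{Name: "ran-to-completion", State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ExitCode: 0}}},
		{Name: "status-lost", State: v1.ContainerState{Waiting: &v1.ContainerStateWaiting{}}},
	}
	for _, s := range defaultUnknownStatusesToFailed(statuses) {
		fmt.Printf("%s: exit %d %s\n", s.Name, s.State.Terminated.ExitCode, s.State.Terminated.Reason)
	}
}
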
View File

@@ -25,6 +25,7 @@ import (
batchv1 "k8s.io/api/batch/v1"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
@@ -35,6 +36,7 @@ import (
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
e2eresource "k8s.io/kubernetes/test/e2e/framework/resource"
"k8s.io/kubernetes/test/e2e/scheduling"
"k8s.io/utils/pointer"
"github.com/onsi/ginkgo"
@@ -45,6 +47,10 @@ var _ = SIGDescribe("Job", func() {
f := framework.NewDefaultFramework("job")
parallelism := int32(2)
completions := int32(4)
largeParallelism := int32(90)
largeCompletions := int32(90)
backoffLimit := int32(6) // default value
// Simplest case: N pods succeed
@@ -361,6 +367,52 @@ var _ = SIGDescribe("Job", func() {
framework.ExpectEqual(pod.Status.Phase, v1.PodFailed)
}
})
ginkgo.It("should run a job to completion with CPU requests [Serial]", func() {
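// [Serial]: this spec is not run in parallel with other e2e specs because it
// deliberately saturates a single node's CPU with job pods.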
ginkgo.By("Creating a job with CPU requests")
testNodeName := scheduling.GetNodeThatCanRunPod(f)
targetNode, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), testNodeName, metav1.GetOptions{})
framework.ExpectNoError(err, "unable to get node object for node %v", testNodeName)
cpu, ok := targetNode.Status.Allocatable[v1.ResourceCPU]
if !ok {
framework.Failf("Unable to get node's %q cpu", targetNode.Name)
}
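// Request roughly 20% of the node's allocatable CPU per pod and (below) pin
// every pod to the chosen node, so only a few of the 90 parallel pods can be
// admitted at once and the job must churn through many scheduling waves on a
// single kubelet.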
cpuRequest := fmt.Sprint(int64(0.2 * float64(cpu.Value())))
backoff := 0
ginkgo.By("Creating a job")
job := e2ejob.NewTestJob("succeed", "all-succeed", v1.RestartPolicyNever, largeParallelism, largeCompletions, nil, int32(backoff))
for i := range job.Spec.Template.Spec.Containers {
job.Spec.Template.Spec.Containers[i].Resources = v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceCPU: resource.MustParse(cpuRequest),
},
}
job.Spec.Template.Spec.NodeSelector = map[string]string{"kubernetes.io/hostname": testNodeName}
}
framework.Logf("Creating job %q with a node hostname selector %q with cpu request %q", job.Name, testNodeName, cpuRequest)
job, err = e2ejob.CreateJob(f.ClientSet, f.Namespace.Name, job)
framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
ginkgo.By("Ensuring job reaches completions")
err = e2ejob.WaitForJobComplete(f.ClientSet, f.Namespace.Name, job.Name, largeCompletions)
framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
ginkgo.By("Ensuring pods for job exist")
pods, err := e2ejob.GetJobPods(f.ClientSet, f.Namespace.Name, job.Name)
framework.ExpectNoError(err, "failed to get pod list for job in namespace: %s", f.Namespace.Name)
successes := int32(0)
for _, pod := range pods.Items {
if pod.Status.Phase == v1.PodSucceeded {
successes++
}
}
framework.ExpectEqual(successes, largeCompletions, "expected %d successful job pods, but got %d", largeCompletions, successes)
})
})
// waitForJobFailure uses c to wait for up to timeout for the Job named jobName in namespace ns to fail.