Merge pull request #129845 from bitoku/fix-flake

Reduce the number of processes used in e2e to prevent unexpected OOM
Kubernetes Prow Robot 2025-02-12 13:16:20 -08:00 committed by GitHub
commit 5d57d0c110


@@ -473,13 +473,16 @@ var _ = SIGDescribe("PriorityPidEvictionOrdering", framework.WithSlow(), framewo
 	highPriorityClassName := f.BaseName + "-high-priority"
 	highPriority := int32(999999999)
-	processes := 30000
+	// Apparently there is a threshold at around 10,000+. If it's over the threshold,
+	// the processes are likely to be oom-killed instead of evicted.
+	// One test can have at most two pidConsumingPods at a time not to cause oom-kill.
+	processes := 5000
 	// if criStats is true, PodAndContainerStatsFromCRI will use data from cri instead of cadvisor for kubelet to get pid count of pods
 	for _, criStats := range []bool{true, false} {
 		ginkgo.Context(fmt.Sprintf("when we run containers with PodAndContainerStatsFromCRI=%v that should cause %s", criStats, expectedNodeCondition), func() {
 			tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
-				pidsConsumed := int64(10000)
+				pidsConsumed := int64(4000)
 				summary := eventuallyGetSummary(ctx)
 				availablePids := *(summary.Node.Rlimit.MaxPID) - *(summary.Node.Rlimit.NumOfRunningProcesses)
 				initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalPIDAvailable): fmt.Sprintf("%d", availablePids-pidsConsumed)}
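For reference, the kubelet configuration in the hunk above derives the pid.available hard-eviction threshold from the node summary. A minimal sketch of that arithmetic follows; the helper name is hypothetical and only mirrors the calculation visible in the diff:

package main

import "fmt"

// pidAvailableThreshold mirrors the calculation above: the hard-eviction
// threshold is set pidsConsumed below the node's currently free PID count,
// so a pod only has to spawn roughly pidsConsumed processes to trip eviction,
// well short of the real PID ceiling where the kernel would start oom-killing.
func pidAvailableThreshold(maxPID, runningProcesses, pidsConsumed int64) string {
	availablePids := maxPID - runningProcesses
	return fmt.Sprintf("%d", availablePids-pidsConsumed)
}

With pidsConsumed lowered from 10000 to 4000 and each pod now spawning 5000 processes, a single pidConsumingPod still crosses the threshold, while two pods running at once stay below the ~10,000 range where, per the comment above, oom-kill tends to win the race against eviction.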
@@ -524,7 +527,7 @@ var _ = SIGDescribe("PriorityPidEvictionOrdering", framework.WithSlow(), framewo
 	f.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition)+"; baseline scenario to verify DisruptionTarget is added", func() {
 		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
-			pidsConsumed := int64(10000)
+			pidsConsumed := int64(4000)
 			summary := eventuallyGetSummary(ctx)
 			availablePids := *(summary.Node.Rlimit.MaxPID) - *(summary.Node.Rlimit.NumOfRunningProcesses)
 			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalPIDAvailable): fmt.Sprintf("%d", availablePids-pidsConsumed)}
@@ -1049,8 +1052,9 @@ func diskConsumingPod(name string, diskConsumedMB int, volumeSource *v1.VolumeSo
 }
 
 func pidConsumingPod(name string, numProcesses int) *v1.Pod {
-	// Each iteration forks once, but creates two processes
-	return podWithCommand(nil, v1.ResourceRequirements{}, numProcesses/2, name, "(while true; do /bin/sleep 5; done)&")
+	// Slowing down the iteration speed to prevent a race condition where eviction may occur
+	// before the correct number of processes is captured in the stats during a sudden surge in processes.
+	return podWithCommand(nil, v1.ResourceRequirements{}, numProcesses, name, "/bin/sleep 0.01; (/bin/sleep 3600)&")
 }
 
 // podWithCommand returns a pod with the provided volumeSource and resourceRequirements.
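The pidConsumingPod change adjusts both the iteration count and the per-iteration command. Below is a rough sketch of the resulting shell workload, assuming podWithCommand wraps the command in a fixed-count loop; the actual wrapper lives elsewhere in this file and is not shown in the diff:

package main

import "fmt"

// approxPidConsumerScript is illustrative only. Before the change, numProcesses/2
// iterations each ran "(while true; do /bin/sleep 5; done)&", leaving a subshell
// plus a sleep (two processes) behind. After the change, numProcesses iterations
// each run "/bin/sleep 0.01; (/bin/sleep 3600)&": the 10ms pause throttles the
// fork rate so the stats pipeline can report the PID count before eviction fires,
// and each iteration leaves roughly one long-lived process.
func approxPidConsumerScript(iterations int, perIteration string) string {
	return fmt.Sprintf("i=0; while [ $i -lt %d ]; do %s; i=$((i+1)); done", iterations, perIteration)
}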