e2e: node: add code to track the pod creator code

e2e_node test depend on very specific shared state (node state). Pod leakages between tests oftentimes cause the test preconditions to be silently corrupted, causing hard to debug CI failures. We add the option to add an annotation to pods which records the code line (source code:line) which triggered the pod creation, so it becomes easier to track which test needs better cleanup. The relevant e2e framework code is used in all the e2e suites, so to minimize any unwanted consequences we make the feature opt-in, planning to enable it initially (and likely only) in the e2e_node tests. Signed-off-by: Francesco Romani <fromani@redhat.com>
2025-09-14 21:53:52 +00:00 · 2024-06-11 09:07:21 +02:00
parent 9d63e575f8
commit 7160ef4fbc
1 changed files with 52 additions and 7 deletions
--- a/test/e2e/framework/pod/pod_client.go
+++ b/test/e2e/framework/pod/pod_client.go
@@ -36,6 +36,7 @@ import (
 	"k8s.io/kubectl/pkg/util/podutils"

 	"github.com/onsi/ginkgo/v2"
+	ginkgotypes "github.com/onsi/ginkgo/v2/types"
 	"github.com/onsi/gomega"

 	"k8s.io/kubernetes/test/e2e/framework"
@@ -56,6 +57,19 @@ const (

 	// it is copied from k8s.io/kubernetes/pkg/kubelet/sysctl
 	forbiddenReason = "SysctlForbidden"
+
+	// which test created this pod?
+	AnnotationTestOwner = "owner.test"
+)
+
+// global flags so we can enable features per-suite instead of per-client.
+var (
+	// GlobalOwnerTracking controls if newly created PodClients should automatically annotate
+	// the pod with the owner test. The owner test is identified by "sourcecodepath:linenumber".
+	// Annotating the pods this way is useful to troubleshoot tests which do insufficient cleanup.
+	// Default is false to maximize backward compatibility.
+	// See also: WithOwnerTracking, AnnotationTestOwner
+	GlobalOwnerTracking bool
 )

 // ImagePrePullList is the images used in the current test suite. It should be initialized in test suite and
@@ -68,9 +82,10 @@ var ImagePrePullList sets.String
 // node e2e pod scheduling.
 func NewPodClient(f *framework.Framework) *PodClient {
 	return &PodClient{
-		f:            f,
-		PodInterface: f.ClientSet.CoreV1().Pods(f.Namespace.Name),
-		namespace:    f.Namespace.Name,
+		f:             f,
+		PodInterface:  f.ClientSet.CoreV1().Pods(f.Namespace.Name),
+		namespace:     f.Namespace.Name,
+		ownerTracking: GlobalOwnerTracking,
 	}
 }

@@ -79,9 +94,10 @@ func NewPodClient(f *framework.Framework) *PodClient {
 // node e2e pod scheduling.
 func PodClientNS(f *framework.Framework, namespace string) *PodClient {
 	return &PodClient{
-		f:            f,
-		PodInterface: f.ClientSet.CoreV1().Pods(namespace),
-		namespace:    namespace,
+		f:             f,
+		PodInterface:  f.ClientSet.CoreV1().Pods(namespace),
+		namespace:     namespace,
+		ownerTracking: GlobalOwnerTracking,
 	}
 }

@@ -89,19 +105,34 @@ func PodClientNS(f *framework.Framework, namespace string) *PodClient {
 type PodClient struct {
 	f *framework.Framework
 	v1core.PodInterface
-	namespace string
+	namespace     string
+	ownerTracking bool
+}
+
+// WithOwnerTracking controls automatic add of annotations recording the code location
+// which created a pod. This is helpful when troubleshooting e2e tests (like e2e_node)
+// which leak pods because insufficient cleanup.
+// Note we want a shallow clone to avoid mutating the receiver.
+// The default is the value of GlobalOwnerTracking *when the client was created*.
+func (c PodClient) WithOwnerTracking(value bool) *PodClient {
+	c.ownerTracking = value
+	return &c
 }

 // Create creates a new pod according to the framework specifications (don't wait for it to start).
 func (c *PodClient) Create(ctx context.Context, pod *v1.Pod) *v1.Pod {
+	ginkgo.GinkgoHelper()
 	c.mungeSpec(pod)
+	c.setOwnerAnnotation(pod)
 	p, err := c.PodInterface.Create(ctx, pod, metav1.CreateOptions{})
 	framework.ExpectNoError(err, "Error creating Pod")
 	return p
+
 }

 // CreateSync creates a new pod according to the framework specifications, and wait for it to start and be running and ready.
 func (c *PodClient) CreateSync(ctx context.Context, pod *v1.Pod) *v1.Pod {
+	ginkgo.GinkgoHelper()
 	p := c.Create(ctx, pod)
 	framework.ExpectNoError(WaitTimeoutForPodReadyInNamespace(ctx, c.f.ClientSet, p.Name, c.namespace, framework.PodStartTimeout))
 	// Get the newest pod after it becomes running and ready, some status may change after pod created, such as pod ip.
@@ -112,6 +143,7 @@ func (c *PodClient) CreateSync(ctx context.Context, pod *v1.Pod) *v1.Pod {

 // CreateBatch create a batch of pods. All pods are created before waiting.
 func (c *PodClient) CreateBatch(ctx context.Context, pods []*v1.Pod) []*v1.Pod {
+	ginkgo.GinkgoHelper()
 	ps := make([]*v1.Pod, len(pods))
 	var wg sync.WaitGroup
 	for i, pod := range pods {
@@ -192,6 +224,19 @@ func (c *PodClient) DeleteSync(ctx context.Context, name string, options metav1.
 	framework.ExpectNoError(WaitForPodNotFoundInNamespace(ctx, c.f.ClientSet, name, c.namespace, timeout), "wait for pod %q to disappear", name)
 }

+// addTestOrigin adds annotations to help identifying tests which incorrectly leak pods because insufficient cleanup
+func (c *PodClient) setOwnerAnnotation(pod *v1.Pod) {
+	if !c.ownerTracking {
+		return
+	}
+	ginkgo.GinkgoHelper()
+	location := ginkgotypes.NewCodeLocation(0)
+	if pod.Annotations == nil {
+		pod.Annotations = make(map[string]string)
+	}
+	pod.Annotations[AnnotationTestOwner] = fmt.Sprintf("%s:%d", location.FileName, location.LineNumber)
+}
+
 // mungeSpec apply test-suite specific transformations to the pod spec.
 func (c *PodClient) mungeSpec(pod *v1.Pod) {
 	if !framework.TestContext.NodeE2E {