From cefd95c51a3bae93a57fc1045fc5c8d0071501ee Mon Sep 17 00:00:00 2001
From: Patrick Ohly
Date: Thu, 18 Feb 2021 16:44:36 +0100
Subject: [PATCH] test: err more towards ignoring logs from terminated pods

Because log capturing can end due to an error or because a pod has
terminated, it is uncertain whether all log output has been captured.

So far, the code leaned towards restarting log capturing, based on the
rationale that duplicate logs are better than no logs. But this is
confusing and potentially makes logs much larger, so now an additional
heuristic is used: log capturing is skipped when logging was started
already and the pod itself is marked for deletion. The deletion mark
appears before the individual containers shut down and get marked as
terminated.
---
 test/e2e/storage/podlogs/podlogs.go | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/test/e2e/storage/podlogs/podlogs.go b/test/e2e/storage/podlogs/podlogs.go
index 1c69231a547..6e343979698 100644
--- a/test/e2e/storage/podlogs/podlogs.go
+++ b/test/e2e/storage/podlogs/podlogs.go
@@ -68,6 +68,17 @@ var expectedErrors = regexp.MustCompile(`container .* in pod .* is (terminated|w
 // would be a blocking function with collects logs from all currently
 // running pods, but that then would have the disadvantage that
 // already deleted pods aren't covered.
+//
+// Another race occurs when a pod shuts down. Logging stops, but if the
+// pod is then not removed from the apiserver quickly enough, logging
+// resumes and dumps the old log again. Previously, this was allowed based
+// on the assumption that it is better to log twice than to miss log messages
+// of pods that started and immediately terminated or when logging temporarily
+// stopped.
+//
+// But that turned out to be rather confusing, so now a heuristic is used: if
+// log output of a container was already captured, then capturing does not
+// resume once the pod is marked for deletion.
 func CopyAllLogs(ctx context.Context, cs clientset.Interface, ns string, to LogOutput) error {
 	watcher, err := cs.CoreV1().Pods(ns).Watch(context.TODO(), meta.ListOptions{})
 	if err != nil {
@@ -106,6 +117,9 @@ func CopyAllLogs(ctx context.Context, cs clientset.Interface, ns string, to LogO
 				// there cannot be any new output and we can ignore it.
 				(pod.Status.ContainerStatuses[i].State.Terminated != nil &&
 					started[id]) ||
+				// State.Terminated might not have been updated although the container already
+				// stopped running. Also check whether the pod is deleted.
+				(pod.DeletionTimestamp != nil && started[id]) ||
 				// Don't attempt to get logs for a container unless it is running or has terminated.
 				// Trying to get a log would just end up with an error that we would have to suppress.
 				(pod.Status.ContainerStatuses[i].State.Running == nil &&
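
Illustrative note (not part of the patch): below is a minimal, self-contained
Go sketch of the skip decision that the patch adds. The helper name
shouldCaptureLogs and the started map keyed by "namespace/pod/container" are
hypothetical names introduced here for illustration; only the checks
themselves, in particular (pod.DeletionTimestamp != nil && started[id]),
are taken from the patched condition.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	meta "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// shouldCaptureLogs decides whether log capture should run (or resume) for
// one container. started records, per container ID, whether capture ran
// before. This mirrors the skip conditions in CopyAllLogs; the function
// itself is a hypothetical extraction for illustration.
func shouldCaptureLogs(pod *v1.Pod, status v1.ContainerStatus, id string, started map[string]bool) bool {
	// The container terminated and its logs were captured already:
	// there cannot be any new output.
	if status.State.Terminated != nil && started[id] {
		return false
	}
	// State.Terminated might not have been updated yet although the
	// container already stopped. The pod's deletion timestamp is set before
	// the individual containers get marked as terminated, so use it as an
	// additional signal and do not restart capture for a doomed pod.
	if pod.DeletionTimestamp != nil && started[id] {
		return false
	}
	// Only a running or terminated container can serve logs; asking for
	// logs in any other state just produces an error that would need
	// suppressing.
	if status.State.Running == nil && status.State.Terminated == nil {
		return false
	}
	return true
}

func main() {
	deleted := meta.Now()
	pod := &v1.Pod{ObjectMeta: meta.ObjectMeta{
		Namespace:         "default",
		Name:              "demo",
		DeletionTimestamp: &deleted,
	}}
	status := v1.ContainerStatus{
		Name:  "app",
		State: v1.ContainerState{Running: &v1.ContainerStateRunning{}},
	}
	started := map[string]bool{"default/demo/app": true}

	// Still "running" according to the status, but the pod is marked for
	// deletion and was captured before: capture is not restarted.
	fmt.Println(shouldCaptureLogs(pod, status, "default/demo/app", started)) // false
}

Note that both deletion-related checks require started[id], so a pod that is
deleted before its first capture ever ran is still picked up; the heuristic
only errs towards not restarting capture that already happened.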