From 1715fc0ca06d4d5b9d33b9cef0816c7d6ac8e700 Mon Sep 17 00:00:00 2001 From: Sai Ramesh Vanka Date: Tue, 17 Oct 2023 13:07:01 +0530 Subject: [PATCH] Fix issue in enabling evented pleg feature gate Fixes https://github.com/kubernetes/kubernetes/issues/120941 GetNewerThan() call isn't blocking until the pod status/cache is updated and returning the empty pod status. Hence, whenever the `SyncLoop ADD/UPDATE/RECONCILE` functions are called multiple times in a very less time interval, Kubelet calls multiple `CreateContainer` CRI api that results in the creation of duplicate containers within a given pod. The initially created conainer keeps `Running` and the later container keeps `Exiting` and hence resulting the pod in `CrashLoopBackOff` state forever Signed-off-by: Sai Ramesh Vanka --- pkg/kubelet/container/cache.go | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/pkg/kubelet/container/cache.go b/pkg/kubelet/container/cache.go index a43f07b31ab..9b4138634d3 100644 --- a/pkg/kubelet/container/cache.go +++ b/pkg/kubelet/container/cache.go @@ -157,29 +157,6 @@ func (c *cache) get(id types.UID) *data { // Otherwise, it returns nil. The caller should acquire the lock. func (c *cache) getIfNewerThan(id types.UID, minTime time.Time) *data { d, ok := c.pods[id] - if utilfeature.DefaultFeatureGate.Enabled(features.EventedPLEG) { - // Evented PLEG has CREATED, STARTED, STOPPED and DELETED events - // However if the container creation fails for some reason there is no - // CRI event received by the kubelet and that pod will get stuck a - // GetNewerThan call in the pod workers. This is reproducible with - // the node e2e test, - // https://github.com/kubernetes/kubernetes/blob/83415e5c9e6e59a3d60a148160490560af2178a1/test/e2e_node/pod_hostnamefqdn_test.go#L161 - // which forces failure during pod creation. This issue also exists in - // Generic PLEG but since it updates global timestamp periodically - // the GetNewerThan call gets unstuck. - - // During node e2e tests, it was observed this change does not have any - // adverse impact on the behaviour of the Generic PLEG as well. - switch { - case !ok: - return makeDefaultData(id) - case ok && (d.modified.After(minTime) || (c.timestamp != nil && c.timestamp.After(minTime))): - return d - default: - return nil - } - } - globalTimestampIsNewer := (c.timestamp != nil && c.timestamp.After(minTime)) if !ok && globalTimestampIsNewer { // Status is not cached, but the global timestamp is newer than