From b296f82c69feaab07e93a6db62e52ce8b0991cd0 Mon Sep 17 00:00:00 2001
From: Sascha Grunert
Date: Thu, 13 Oct 2022 12:21:30 +0200
Subject: [PATCH] Sort kubelet pods by their creation time

There is a corner case when Pod termination is blocked via a lifecycle
preStop hook, for example by using this StatefulSet:

```yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: web
spec:
  selector:
    matchLabels:
      app: ubi
  serviceName: "ubi"
  replicas: 1
  template:
    metadata:
      labels:
        app: ubi
    spec:
      terminationGracePeriodSeconds: 1000
      containers:
      - name: ubi
        image: ubuntu:22.04
        command: ['sh', '-c', 'echo The app is running! && sleep 360000']
        ports:
        - containerPort: 80
          name: web
        lifecycle:
          preStop:
            exec:
              command:
              - /bin/sh
              - -c
              - 'echo aaa; trap : TERM INT; sleep infinity & wait'
```

After creation, downscaling, forced deletion, and upscaling of the
replica like this:

```
> kubectl apply -f sts.yml
> kubectl scale sts web --replicas=0
> kubectl delete pod web-0 --grace-period=0 --force
> kubectl scale sts web --replicas=1
```

we end up with two pods being run by the container runtime, while the
API reports only one:

```
> kubectl get pods
NAME    READY   STATUS    RESTARTS   AGE
web-0   1/1     Running   0          92s
```

```
> sudo crictl pods
POD ID          CREATED          STATE   NAME    NAMESPACE   ATTEMPT   RUNTIME
e05bb7dbb7e44   12 minutes ago   Ready   web-0   default     0         (default)
d90088614c73b   12 minutes ago   Ready   web-0   default     0         (default)
```

When now running `kubectl exec -it web-0 -- ps -ef`, there is a random
chance that we hit the wrong container, which reports the lifecycle
command `/bin/sh -c echo aaa; trap : TERM INT; sleep infinity & wait`.

This is caused by the container lookup via its name (without a podUID)
at:
https://github.com/kubernetes/kubernetes/blob/02109414e8816ceefce775e8b627d67cad6ced85/pkg/kubelet/kubelet_pods.go#L1905-L1914

and, more specifically, by the conversion of the pod result map to a
slice in `GetPods`:
https://github.com/kubernetes/kubernetes/blob/02109414e8816ceefce775e8b627d67cad6ced85/pkg/kubelet/kuberuntime/kuberuntime_manager.go#L407-L411
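To make the root cause concrete, here is a minimal, self-contained Go
sketch (not part of this patch; the string map below is just a stand-in
for the UID-keyed pod map that `GetPods` converts to a slice). Go
deliberately randomizes map iteration order, so two pods sharing the
same name can swap positions between calls:

```go
package main

import "fmt"

func main() {
	// Stand-in for the map[types.UID]*kubecontainer.Pod built in GetPods:
	// two sandboxes with the same pod name, but different UIDs.
	pods := map[string]string{
		"uid-terminating": "web-0 (old, preStop hook still running)",
		"uid-new":         "web-0 (newly created)",
	}

	for run := 0; run < 3; run++ {
		result := []string{}
		// Map iteration order is unspecified and randomized, so the
		// resulting slice order is not stable across iterations or runs.
		for _, pod := range pods {
			result = append(result, pod)
		}
		// A name-based lookup that takes the first match may therefore
		// hit either pod.
		fmt.Println(result)
	}
}
```

Running this a few times can print both orderings, which is exactly the
nondeterminism a name-based first-match lookup is exposed to.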
We now solve that unexpected behavior by tracking the creation time of
the pod and sorting the result based on it. This ensures that we always
match the most recently created pod.

Signed-off-by: Sascha Grunert
---
 pkg/kubelet/container/runtime.go              |  2 ++
 .../kuberuntime/kuberuntime_manager.go        | 11 +++++++
 .../kuberuntime/kuberuntime_manager_test.go   | 31 +++++++++++++++++++
 3 files changed, 44 insertions(+)

diff --git a/pkg/kubelet/container/runtime.go b/pkg/kubelet/container/runtime.go
index ad59de4df1c..6f7b76783ab 100644
--- a/pkg/kubelet/container/runtime.go
+++ b/pkg/kubelet/container/runtime.go
@@ -169,6 +169,8 @@ type Pod struct {
 	// The name and namespace of the pod, which is readable by human.
 	Name      string
 	Namespace string
+	// Creation timestamp of the Pod in nanoseconds.
+	CreatedAt uint64
 	// List of containers that belongs to this pod. It may contain only
 	// running containers, or mixed with dead ones (when GetPods(true)).
 	Containers []*Container
diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go
index 89eaf8b4fcd..5c1e565c444 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go
@@ -21,6 +21,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"sort"
 	"time"
 
 	cadvisorapi "github.com/google/cadvisor/info/v1"
@@ -371,6 +372,7 @@ func (m *kubeGenericRuntimeManager) GetPods(all bool) ([]*kubecontainer.Pod, err
 			continue
 		}
 		p.Sandboxes = append(p.Sandboxes, converted)
+		p.CreatedAt = uint64(s.GetCreatedAt())
 	}
 
 	containers, err := m.getKubeletContainers(all)
@@ -410,6 +412,15 @@ func (m *kubeGenericRuntimeManager) GetPods(all bool) ([]*kubecontainer.Pod, err
 		result = append(result, pod)
 	}
 
+	// There are scenarios where multiple pods are running in parallel having
+	// the same name, because one of them has not been fully terminated yet.
+	// To avoid unexpected behavior on container-name-based searches (for
+	// example by calling *Kubelet.findContainer() without specifying a pod
+	// ID), we now return the list of pods ordered by their creation time.
+	sort.SliceStable(result, func(i, j int) bool {
+		return result[i].CreatedAt > result[j].CreatedAt
+	})
+
 	return result, nil
 }
diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go b/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go
index 8e2daef2810..5343d80dd41 100644
--- a/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go
@@ -17,6 +17,7 @@ limitations under the License.
 package kuberuntime
 
 import (
+	"fmt"
 	"path/filepath"
 	"reflect"
 	"sort"
@@ -473,6 +474,7 @@ func TestGetPods(t *testing.T) {
 			ID:        types.UID("12345678"),
 			Name:      "foo",
 			Namespace: "new",
+			CreatedAt: uint64(fakeSandbox.CreatedAt),
 			Containers: []*kubecontainer.Container{containers[0], containers[1]},
 			Sandboxes:  []*kubecontainer.Container{sandbox},
 		},
@@ -486,6 +488,35 @@ func TestGetPods(t *testing.T) {
 	}
 }
 
+func TestGetPodsSorted(t *testing.T) {
+	fakeRuntime, _, m, err := createTestRuntimeManager()
+	assert.NoError(t, err)
+
+	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "bar"}}
+
+	createdTimestamps := []uint64{10, 5, 20}
+	fakeSandboxes := []*apitest.FakePodSandbox{}
+	for i, createdAt := range createdTimestamps {
+		pod.UID = types.UID(fmt.Sprint(i))
+		fakeSandboxes = append(fakeSandboxes, makeFakePodSandbox(t, m, sandboxTemplate{
+			pod:       pod,
+			createdAt: int64(createdAt),
+			state:     runtimeapi.PodSandboxState_SANDBOX_READY,
+		}))
+	}
+	fakeRuntime.SetFakeSandboxes(fakeSandboxes)
+
+	actual, err := m.GetPods(false)
+	assert.NoError(t, err)
+
+	assert.Len(t, actual, 3)
+
+	// Verify that the pods are sorted by their creation time (newest/largest timestamp first)
+	assert.Equal(t, uint64(createdTimestamps[2]), actual[0].CreatedAt)
+	assert.Equal(t, uint64(createdTimestamps[0]), actual[1].CreatedAt)
+	assert.Equal(t, uint64(createdTimestamps[1]), actual[2].CreatedAt)
+}
+
 func TestKillPod(t *testing.T) {
 	fakeRuntime, _, m, err := createTestRuntimeManager()
 	assert.NoError(t, err)
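For reference, here is a minimal, standalone Go sketch of the guarantee
the patch establishes. The `Pod` type and `findByName` helper are
simplified, hypothetical stand-ins for `kubecontainer.Pod` and a
name-based lookup like `*Kubelet.findContainer()`, not the kubelet's
actual code:

```go
package main

import (
	"fmt"
	"sort"
)

// Pod is a stripped-down stand-in for kubecontainer.Pod.
type Pod struct {
	Name      string
	CreatedAt uint64
}

// findByName mimics a name-based lookup without a pod UID: it returns
// the first match in the slice, so the slice order decides the result.
func findByName(pods []*Pod, name string) *Pod {
	for _, p := range pods {
		if p.Name == name {
			return p
		}
	}
	return nil
}

func main() {
	pods := []*Pod{
		{Name: "web-0", CreatedAt: 10}, // older pod, preStop hook still blocking
		{Name: "web-0", CreatedAt: 20}, // newer replacement pod
	}

	// The ordering introduced by the patch: newest creation time first.
	sort.SliceStable(pods, func(i, j int) bool {
		return pods[i].CreatedAt > pods[j].CreatedAt
	})

	// Prints 20: the lookup now deterministically hits the newest pod.
	fmt.Println(findByName(pods, "web-0").CreatedAt)
}
```

Using `sort.SliceStable` rather than `sort.Slice` additionally keeps
pods with equal creation times in their previous relative order.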