Sort kubelet pods by their creation time

There is a corner case when blocking Pod termination via a lifecycle
preStop hook, for example by using this StatefulSet, whose preStop hook
traps TERM/INT and sleeps forever so the container never terminates on
its own:

```yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: web
spec:
  selector:
    matchLabels:
      app: ubi
  serviceName: "ubi"
  replicas: 1
  template:
    metadata:
      labels:
        app: ubi
    spec:
      terminationGracePeriodSeconds: 1000
      containers:
      - name: ubi
        image: ubuntu:22.04
        command: ['sh', '-c', 'echo The app is running! && sleep 360000']
        ports:
        - containerPort: 80
          name: web
        lifecycle:
          preStop:
            exec:
              command:
              - /bin/sh
              - -c
              - 'echo aaa; trap : TERM INT; sleep infinity & wait'
```

After creating, downscaling, force-deleting, and upscaling the replica
like this:

```
> kubectl apply -f sts.yml
> kubectl scale sts web --replicas=0
> kubectl delete pod web-0 --grace-period=0 --force
> kubectl scale sts web --replicas=1
```

We end up with two pods running in the container runtime, while the API
server reports only one:

```
> kubectl get pods
NAME    READY   STATUS    RESTARTS   AGE
web-0   1/1     Running   0          92s
```

```
> sudo crictl pods
POD ID              CREATED              STATE     NAME     NAMESPACE     ATTEMPT     RUNTIME
e05bb7dbb7e44       12 minutes ago       Ready     web-0    default       0           (default)
d90088614c73b       12 minutes ago       Ready     web-0    default       0           (default)
```

When we now run `kubectl exec -it web-0 -- ps -ef`, there is a random chance of hitting the wrong
container, namely the one reporting the lifecycle command `/bin/sh -c echo aaa; trap : TERM INT; sleep infinity & wait`.

This is caused by the container lookup, which uses only the container name (and no pod UID), at:
02109414e8/pkg/kubelet/kubelet_pods.go (L1905-L1914)

And more specifically by the conversion of the pod result map to a slice in `GetPods`:
02109414e8/pkg/kubelet/kuberuntime/kuberuntime_manager.go (L407-L411)
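
To illustrate the root cause, here is a minimal, self-contained sketch. It is
not the kubelet code; the `pod` type and `findByName` helper are made up for
this example, but the pattern mirrors converting a UID-keyed map to a slice
and then searching by name alone:

```go
package main

import "fmt"

// pod is a simplified stand-in for kubecontainer.Pod.
type pod struct {
    uid  string
    name string
}

// findByName mimics a name-only lookup (no pod UID): it returns the first
// pod in the slice whose name matches.
func findByName(pods []*pod, name string) *pod {
    for _, p := range pods {
        if p.name == name {
            return p
        }
    }
    return nil
}

func main() {
    // Two sandboxes for the same pod name, keyed by UID, as can happen while
    // an old instance has not been fully terminated yet.
    byUID := map[string]*pod{
        "uid-old": {uid: "uid-old", name: "web-0"},
        "uid-new": {uid: "uid-new", name: "web-0"},
    }

    // Converting the map to a slice gives no ordering guarantee: Go
    // randomizes map iteration order, so the slice order changes run to run.
    var result []*pod
    for _, p := range byUID {
        result = append(result, p)
    }

    // A name-only lookup therefore hits either instance at random.
    fmt.Println(findByName(result, "web-0").uid)
}
```

Because the slice order is effectively random, a name-based search like the
one behind `kubectl exec` can land in either the old or the new instance.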

We now fix that unexpected behavior by tracking the creation time of
each pod and sorting the result of `GetPods` accordingly. This ensures
that we always match the most recently created pod.
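
For illustration only (the actual change is in the diff below): once the
slice is ordered newest-first, the same kind of first-match, name-only lookup
becomes deterministic. The `pod` type and `createdAt` field are again
simplified stand-ins for `kubecontainer.Pod` and the new `CreatedAt` field:

```go
package main

import (
    "fmt"
    "sort"
)

// pod is a simplified stand-in for kubecontainer.Pod with a creation time.
type pod struct {
    name      string
    createdAt uint64
}

func main() {
    // Two pods sharing the same name, as in the web-0 scenario above.
    pods := []*pod{
        {name: "web-0", createdAt: 5},  // old, still-terminating instance
        {name: "web-0", createdAt: 20}, // freshly created instance
    }

    // Newest first, mirroring the comparator added to GetPods in this commit.
    sort.SliceStable(pods, func(i, j int) bool {
        return pods[i].createdAt > pods[j].createdAt
    })

    // A first-match lookup by name now always finds the most recently
    // created pod.
    for _, p := range pods {
        if p.name == "web-0" {
            fmt.Println("matched pod created at", p.createdAt) // prints 20
            break
        }
    }
}
```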

Signed-off-by: Sascha Grunert <sgrunert@redhat.com>

3 changed files with 44 additions and 0 deletions

```diff
@@ -169,6 +169,8 @@ type Pod struct {
     // The name and namespace of the pod, which is readable by human.
     Name      string
     Namespace string
+    // Creation timestamps of the Pod in nanoseconds.
+    CreatedAt uint64
     // List of containers that belongs to this pod. It may contain only
     // running containers, or mixed with dead ones (when GetPods(true)).
     Containers []*Container
```

```diff
@@ -21,6 +21,7 @@ import (
     "fmt"
     "os"
     "path/filepath"
+    "sort"
     "time"

     cadvisorapi "github.com/google/cadvisor/info/v1"
@@ -371,6 +372,7 @@ func (m *kubeGenericRuntimeManager) GetPods(all bool) ([]*kubecontainer.Pod, error) {
             continue
         }
         p.Sandboxes = append(p.Sandboxes, converted)
+        p.CreatedAt = uint64(s.GetCreatedAt())
     }

     containers, err := m.getKubeletContainers(all)
@@ -410,6 +412,15 @@ func (m *kubeGenericRuntimeManager) GetPods(all bool) ([]*kubecontainer.Pod, error) {
         result = append(result, pod)
     }

+    // There are scenarios where multiple pods are running in parallel having
+    // the same name, because one of them have not been fully terminated yet.
+    // To avoid unexpected behavior on container name based search (for example
+    // by calling *Kubelet.findContainer() without specifying a pod ID), we now
+    // return the list of pods ordered by their creation time.
+    sort.SliceStable(result, func(i, j int) bool {
+        return result[i].CreatedAt > result[j].CreatedAt
+    })
+
     return result, nil
 }
```

```diff
@@ -17,6 +17,7 @@ limitations under the License.
 package kuberuntime

 import (
+    "fmt"
     "path/filepath"
     "reflect"
     "sort"
@@ -473,6 +474,7 @@ func TestGetPods(t *testing.T) {
             ID:         types.UID("12345678"),
             Name:       "foo",
             Namespace:  "new",
+            CreatedAt:  uint64(fakeSandbox.CreatedAt),
             Containers: []*kubecontainer.Container{containers[0], containers[1]},
             Sandboxes:  []*kubecontainer.Container{sandbox},
         },
@@ -486,6 +488,35 @@ func TestGetPods(t *testing.T) {
     }
 }

+func TestGetPodsSorted(t *testing.T) {
+    fakeRuntime, _, m, err := createTestRuntimeManager()
+    assert.NoError(t, err)
+
+    pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "bar"}}
+
+    createdTimestamps := []uint64{10, 5, 20}
+    fakeSandboxes := []*apitest.FakePodSandbox{}
+    for i, createdAt := range createdTimestamps {
+        pod.UID = types.UID(fmt.Sprint(i))
+        fakeSandboxes = append(fakeSandboxes, makeFakePodSandbox(t, m, sandboxTemplate{
+            pod:       pod,
+            createdAt: int64(createdAt),
+            state:     runtimeapi.PodSandboxState_SANDBOX_READY,
+        }))
+    }
+    fakeRuntime.SetFakeSandboxes(fakeSandboxes)
+
+    actual, err := m.GetPods(false)
+    assert.NoError(t, err)
+
+    assert.Len(t, actual, 3)
+
+    // Verify that the pods are sorted by their creation time (newest/biggest timestamp first)
+    assert.Equal(t, uint64(createdTimestamps[2]), actual[0].CreatedAt)
+    assert.Equal(t, uint64(createdTimestamps[0]), actual[1].CreatedAt)
+    assert.Equal(t, uint64(createdTimestamps[1]), actual[2].CreatedAt)
+}
+
 func TestKillPod(t *testing.T) {
     fakeRuntime, _, m, err := createTestRuntimeManager()
     assert.NoError(t, err)
```