mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-21 02:41:25 +00:00
Sort kubelet pods by their creation time
There is a corner case when blocking Pod termination via a lifecycle preStop hook, for example by using this StatefulSet: ```yaml apiVersion: apps/v1 kind: StatefulSet metadata: name: web spec: selector: matchLabels: app: ubi serviceName: "ubi" replicas: 1 template: metadata: labels: app: ubi spec: terminationGracePeriodSeconds: 1000 containers: - name: ubi image: ubuntu:22.04 command: ['sh', '-c', 'echo The app is running! && sleep 360000'] ports: - containerPort: 80 name: web lifecycle: preStop: exec: command: - /bin/sh - -c - 'echo aaa; trap : TERM INT; sleep infinity & wait' ``` After creation, downscaling, forced deletion and upscaling of the replica like this: ``` > kubectl apply -f sts.yml > kubectl scale sts web --replicas=0 > kubectl delete pod web-0 --grace-period=0 --force > kubectl scale sts web --replicas=1 ``` We end up with two pods being run by the container runtime, while the API only reports one: ``` > kubectl get pods NAME READY STATUS RESTARTS AGE web-0 1/1 Running 0 92s ``` ``` > sudo crictl pods POD ID CREATED STATE NAME NAMESPACE ATTEMPT RUNTIME e05bb7dbb7e44 12 minutes ago Ready web-0 default 0 (default) d90088614c73b 12 minutes ago Ready web-0 default 0 (default) ``` When now running `kubectl exec -it web-0 -- ps -ef`, there is a random chance that we hit the wrong container, which reports the lifecycle command `/bin/sh -c echo aaa; trap : TERM INT; sleep infinity & wait`. This is caused by the container lookup via its name (and no podUID) at: 02109414e8/pkg/kubelet/kubelet_pods.go (L1905-L1914)
And more specifically by the conversion of the pod result map to a slice in `GetPods`: 02109414e8/pkg/kubelet/kuberuntime/kuberuntime_manager.go (L407-L411)
We now solve that unexpected behavior by tracking the creation time of the pod and sorting the result based on that. As a result, a name-based lookup will always match the most recently created pod. Signed-off-by: Sascha Grunert <sgrunert@redhat.com>
This commit is contained in:
parent
9b4b1c0e79
commit
b296f82c69
@ -169,6 +169,8 @@ type Pod struct {
|
||||
// The name and namespace of the pod, which is readable by human.
|
||||
Name string
|
||||
Namespace string
|
||||
// Creation timestamp of the Pod in nanoseconds.
|
||||
CreatedAt uint64
|
||||
// List of containers that belongs to this pod. It may contain only
|
||||
// running containers, or mixed with dead ones (when GetPods(true)).
|
||||
Containers []*Container
|
||||
|
@ -21,6 +21,7 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
cadvisorapi "github.com/google/cadvisor/info/v1"
|
||||
@ -371,6 +372,7 @@ func (m *kubeGenericRuntimeManager) GetPods(all bool) ([]*kubecontainer.Pod, err
|
||||
continue
|
||||
}
|
||||
p.Sandboxes = append(p.Sandboxes, converted)
|
||||
p.CreatedAt = uint64(s.GetCreatedAt())
|
||||
}
|
||||
|
||||
containers, err := m.getKubeletContainers(all)
|
||||
@ -410,6 +412,15 @@ func (m *kubeGenericRuntimeManager) GetPods(all bool) ([]*kubecontainer.Pod, err
|
||||
result = append(result, pod)
|
||||
}
|
||||
|
||||
// There are scenarios where multiple pods are running in parallel having
|
||||
// the same name, because one of them has not been fully terminated yet.
|
||||
// To avoid unexpected behavior on container name based search (for example
|
||||
// by calling *Kubelet.findContainer() without specifying a pod ID), we now
|
||||
// return the list of pods ordered by their creation time.
|
||||
sort.SliceStable(result, func(i, j int) bool {
|
||||
return result[i].CreatedAt > result[j].CreatedAt
|
||||
})
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
|
@ -17,6 +17,7 @@ limitations under the License.
|
||||
package kuberuntime
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"sort"
|
||||
@ -473,6 +474,7 @@ func TestGetPods(t *testing.T) {
|
||||
ID: types.UID("12345678"),
|
||||
Name: "foo",
|
||||
Namespace: "new",
|
||||
CreatedAt: uint64(fakeSandbox.CreatedAt),
|
||||
Containers: []*kubecontainer.Container{containers[0], containers[1]},
|
||||
Sandboxes: []*kubecontainer.Container{sandbox},
|
||||
},
|
||||
@ -486,6 +488,35 @@ func TestGetPods(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetPodsSorted(t *testing.T) {
|
||||
fakeRuntime, _, m, err := createTestRuntimeManager()
|
||||
assert.NoError(t, err)
|
||||
|
||||
pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "bar"}}
|
||||
|
||||
createdTimestamps := []uint64{10, 5, 20}
|
||||
fakeSandboxes := []*apitest.FakePodSandbox{}
|
||||
for i, createdAt := range createdTimestamps {
|
||||
pod.UID = types.UID(fmt.Sprint(i))
|
||||
fakeSandboxes = append(fakeSandboxes, makeFakePodSandbox(t, m, sandboxTemplate{
|
||||
pod: pod,
|
||||
createdAt: int64(createdAt),
|
||||
state: runtimeapi.PodSandboxState_SANDBOX_READY,
|
||||
}))
|
||||
}
|
||||
fakeRuntime.SetFakeSandboxes(fakeSandboxes)
|
||||
|
||||
actual, err := m.GetPods(false)
|
||||
assert.NoError(t, err)
|
||||
|
||||
assert.Len(t, actual, 3)
|
||||
|
||||
// Verify that the pods are sorted by their creation time (newest/biggest timestamp first)
|
||||
assert.Equal(t, uint64(createdTimestamps[2]), actual[0].CreatedAt)
|
||||
assert.Equal(t, uint64(createdTimestamps[0]), actual[1].CreatedAt)
|
||||
assert.Equal(t, uint64(createdTimestamps[1]), actual[2].CreatedAt)
|
||||
}
|
||||
|
||||
func TestKillPod(t *testing.T) {
|
||||
fakeRuntime, _, m, err := createTestRuntimeManager()
|
||||
assert.NoError(t, err)
|
||||
|
Loading…
Reference in New Issue
Block a user