From 3946d99904fe37ea04b231a8d101085b9b80b221 Mon Sep 17 00:00:00 2001 From: KeZhang Date: Thu, 20 Jan 2022 18:35:31 +0800 Subject: [PATCH] Ignore container notfound error while getPodstatuses --- .../kuberuntime/kuberuntime_container.go | 25 ++++-- .../kuberuntime/kuberuntime_manager.go | 15 +++- .../kuberuntime/kuberuntime_manager_test.go | 79 +++++++++++++++++++ vendor/modules.txt | 1 + 4 files changed, 108 insertions(+), 12 deletions(-) diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container.go b/pkg/kubelet/kuberuntime/kuberuntime_container.go index ddeb6a07dcd..680eab7a56e 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container.go @@ -34,6 +34,8 @@ import ( "sync" "time" + crierror "k8s.io/cri-api/pkg/errors" + "github.com/opencontainers/selinux/go-selinux" grpcstatus "google.golang.org/grpc/status" @@ -501,10 +503,17 @@ func (m *kubeGenericRuntimeManager) getPodContainerStatuses(uid kubetypes.UID, n return nil, err } - statuses := make([]*kubecontainer.Status, len(containers)) + statuses := []*kubecontainer.Status{} // TODO: optimization: set maximum number of containers per container name to examine. - for i, c := range containers { + for _, c := range containers { resp, err := m.runtimeService.ContainerStatus(c.Id, false) + // Between List (ListContainers) and check (ContainerStatus) another thread might remove a container, and that is normal. + // The previous call (ListContainers) never fails due to a pod container not existing. + // Therefore, this method should not either, but instead act as if the previous call failed, + // which means the error should be ignored. + if crierror.IsNotFound(err) { + continue + } if err != nil { // Merely log this here; GetPodStatus will actually report the error out. klog.V(4).InfoS("ContainerStatus return error", "containerID", c.Id, "err", err) @@ -533,7 +542,7 @@ func (m *kubeGenericRuntimeManager) getPodContainerStatuses(uid kubetypes.UID, n cStatus.Message += tMessage } } - statuses[i] = cStatus + statuses = append(statuses, cStatus) } sort.Sort(containerStatusByCreated(statuses)) @@ -714,15 +723,15 @@ func (m *kubeGenericRuntimeManager) killContainer(pod *v1.Pod, containerID kubec "containerName", containerName, "containerID", containerID.String(), "gracePeriod", gracePeriod) err := m.runtimeService.StopContainer(containerID.ID, gracePeriod) - if err != nil { + if err != nil && !crierror.IsNotFound(err) { klog.ErrorS(err, "Container termination failed with gracePeriod", "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", containerName, "containerID", containerID.String(), "gracePeriod", gracePeriod) - } else { - klog.V(3).InfoS("Container exited normally", "pod", klog.KObj(pod), "podUID", pod.UID, - "containerName", containerName, "containerID", containerID.String()) + return err } + klog.V(3).InfoS("Container exited normally", "pod", klog.KObj(pod), "podUID", pod.UID, + "containerName", containerName, "containerID", containerID.String()) - return err + return nil } // killContainersWithSyncResult kills all pod's containers with sync results. diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go index 81467571a02..5a1df82b2dd 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go @@ -24,6 +24,7 @@ import ( "time" cadvisorapi "github.com/google/cadvisor/info/v1" + crierror "k8s.io/cri-api/pkg/errors" "k8s.io/klog/v2" v1 "k8s.io/api/core/v1" @@ -991,7 +992,7 @@ func (m *kubeGenericRuntimeManager) killPodWithSyncResult(pod *v1.Pod, runningPo result.AddSyncResult(killSandboxResult) // Stop all sandboxes belongs to same pod for _, podSandbox := range runningPod.Sandboxes { - if err := m.runtimeService.StopPodSandbox(podSandbox.ID.ID); err != nil { + if err := m.runtimeService.StopPodSandbox(podSandbox.ID.ID); err != nil && !crierror.IsNotFound(err) { killSandboxResult.Fail(kubecontainer.ErrKillPodSandbox, err.Error()) klog.ErrorS(nil, "Failed to stop sandbox", "podSandboxID", podSandbox.ID) } @@ -1033,10 +1034,17 @@ func (m *kubeGenericRuntimeManager) GetPodStatus(uid kubetypes.UID, name, namesp klog.V(4).InfoS("getSandboxIDByPodUID got sandbox IDs for pod", "podSandboxID", podSandboxIDs, "pod", klog.KObj(pod)) - sandboxStatuses := make([]*runtimeapi.PodSandboxStatus, len(podSandboxIDs)) + sandboxStatuses := []*runtimeapi.PodSandboxStatus{} podIPs := []string{} for idx, podSandboxID := range podSandboxIDs { resp, err := m.runtimeService.PodSandboxStatus(podSandboxID, false) + // Between List (getSandboxIDByPodUID) and check (PodSandboxStatus) another thread might remove a container, and that is normal. + // The previous call (getSandboxIDByPodUID) never fails due to a pod sandbox not existing. + // Therefore, this method should not either, but instead act as if the previous call failed, + // which means the error should be ignored. + if crierror.IsNotFound(err) { + continue + } if err != nil { klog.ErrorS(err, "PodSandboxStatus of sandbox for pod", "podSandboxID", podSandboxID, "pod", klog.KObj(pod)) return nil, err @@ -1045,8 +1053,7 @@ func (m *kubeGenericRuntimeManager) GetPodStatus(uid kubetypes.UID, name, namesp return nil, errors.New("pod sandbox status is nil") } - sandboxStatuses[idx] = resp.Status - + sandboxStatuses = append(sandboxStatuses, resp.Status) // Only get pod IP from latest sandbox if idx == 0 && resp.Status.State == runtimeapi.PodSandboxState_SANDBOX_READY { podIPs = m.determinePodSandboxIPs(namespace, name, resp.Status) diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go b/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go index 49bfeeeb132..8bec90c8f84 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager_test.go @@ -23,6 +23,9 @@ import ( "testing" "time" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + cadvisorapi "github.com/google/cadvisor/info/v1" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -334,6 +337,82 @@ func TestGetPodStatus(t *testing.T) { assert.Equal(t, apitest.FakePodSandboxIPs, podStatus.IPs) } +func TestStopContainerWithNotFoundError(t *testing.T) { + fakeRuntime, _, m, err := createTestRuntimeManager() + assert.NoError(t, err) + + containers := []v1.Container{ + { + Name: "foo1", + Image: "busybox", + ImagePullPolicy: v1.PullIfNotPresent, + }, + { + Name: "foo2", + Image: "busybox", + ImagePullPolicy: v1.PullIfNotPresent, + }, + } + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "12345678", + Name: "foo", + Namespace: "new", + }, + Spec: v1.PodSpec{ + Containers: containers, + }, + } + + // Set fake sandbox and faked containers to fakeRuntime. + makeAndSetFakePod(t, m, fakeRuntime, pod) + fakeRuntime.InjectError("StopContainer", status.Error(codes.NotFound, "No such container")) + podStatus, err := m.GetPodStatus(pod.UID, pod.Name, pod.Namespace) + require.NoError(t, err) + p := kubecontainer.ConvertPodStatusToRunningPod("", podStatus) + gracePeriod := int64(1) + err = m.KillPod(pod, p, &gracePeriod) + require.NoError(t, err) +} + +func TestGetPodStatusWithNotFoundError(t *testing.T) { + fakeRuntime, _, m, err := createTestRuntimeManager() + assert.NoError(t, err) + + containers := []v1.Container{ + { + Name: "foo1", + Image: "busybox", + ImagePullPolicy: v1.PullIfNotPresent, + }, + { + Name: "foo2", + Image: "busybox", + ImagePullPolicy: v1.PullIfNotPresent, + }, + } + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: "12345678", + Name: "foo", + Namespace: "new", + }, + Spec: v1.PodSpec{ + Containers: containers, + }, + } + + // Set fake sandbox and faked containers to fakeRuntime. + makeAndSetFakePod(t, m, fakeRuntime, pod) + fakeRuntime.InjectError("ContainerStatus", status.Error(codes.NotFound, "No such container")) + podStatus, err := m.GetPodStatus(pod.UID, pod.Name, pod.Namespace) + require.NoError(t, err) + require.Equal(t, pod.UID, podStatus.ID) + require.Equal(t, pod.Name, podStatus.Name) + require.Equal(t, pod.Namespace, podStatus.Namespace) + require.Equal(t, apitest.FakePodSandboxIPs, podStatus.IPs) +} + func TestGetPods(t *testing.T) { fakeRuntime, _, m, err := createTestRuntimeManager() assert.NoError(t, err) diff --git a/vendor/modules.txt b/vendor/modules.txt index a106722a6cb..2e1468762f0 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1998,6 +1998,7 @@ k8s.io/cri-api/pkg/apis k8s.io/cri-api/pkg/apis/runtime/v1 k8s.io/cri-api/pkg/apis/runtime/v1alpha2 k8s.io/cri-api/pkg/apis/testing +k8s.io/cri-api/pkg/errors # k8s.io/csi-translation-lib v0.0.0 => ./staging/src/k8s.io/csi-translation-lib ## explicit k8s.io/csi-translation-lib