From 5e1f5ca735d71670563970d97f89e4486c34fe48 Mon Sep 17 00:00:00 2001 From: lifupan Date: Tue, 21 May 2019 16:36:29 +0800 Subject: [PATCH 1/3] shimv2: fix the issue of passing the wrong container id It should pass the container id instead of sandbox id. Fixes:#1672 Signed-off-by: lifupan --- containerd-shim-v2/service.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/containerd-shim-v2/service.go b/containerd-shim-v2/service.go index 383d7d1238..d479790e15 100644 --- a/containerd-shim-v2/service.go +++ b/containerd-shim-v2/service.go @@ -453,7 +453,7 @@ func (s *service) Delete(ctx context.Context, r *taskAPI.DeleteRequest) (_ *task } s.send(&eventstypes.TaskDelete{ - ContainerID: s.id, + ContainerID: c.id, Pid: s.pid, ExitStatus: c.exit, ExitedAt: c.exitTime, From 0d535f56e514b37a21aa5e70e2c1becba732f8f1 Mon Sep 17 00:00:00 2001 From: lifupan Date: Tue, 21 May 2019 16:53:43 +0800 Subject: [PATCH 2/3] shimv2: kill a container return directly once the container termianted According to CRI specs, kubelet will call StopPodSandbox() at least once before calling RemovePodSandbox, and this call is idempotent, and must not return an error if all relevant resources have already been reclaimed. And in that call it will send a SIGKILL signal first to try to stop the container, thus once the container has terminated, here should ignore this signal and return directly. Fixes:#1672 Signed-off-by: lifupan --- containerd-shim-v2/service.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/containerd-shim-v2/service.go b/containerd-shim-v2/service.go index d479790e15..782c4a87a9 100644 --- a/containerd-shim-v2/service.go +++ b/containerd-shim-v2/service.go @@ -677,6 +677,20 @@ func (s *service) Kill(ctx context.Context, r *taskAPI.KillRequest) (_ *ptypes.E return nil, err } + // According to CRI specs, kubelet will call StopPodSandbox() + // at least once before calling RemovePodSandbox, and this call + // is idempotent, and must not return an error if all relevant + // resources have already been reclaimed. And in that call it will + // send a SIGKILL signal first to try to stop the container, thus + // once the container has terminated, here should ignore this signal + // and return directly. + if signum == syscall.SIGKILL || signum == syscall.SIGTERM { + if c.status == task.StatusStopped { + logrus.WithField("sandbox", s.sandbox.ID()).WithField("Container", c.id).Debug("Container has already been stopped") + return empty, nil + } + } + processID := c.id if r.ExecID != "" { execs, err := c.getExec(r.ExecID) From f301c957f6316ed82c16025e9795e39c4aa4f9b0 Mon Sep 17 00:00:00 2001 From: lifupan Date: Wed, 22 May 2019 10:03:24 +0800 Subject: [PATCH 3/3] shimv2: shutdown the sandbox when sandbox container exited Kubelet would cleanup the pod cgroup resources and kill the processes in the pod cgroups when it detected all of the containers in a pod exited, thus shimv2 should close the hypervisor process once the podsandbox container exited, otherwise, the hypervisor process would be killed by kubelet and made shimv2 failed to shutdown the sandbox. Fixes:#1672 Signed-off-by: lifupan --- containerd-shim-v2/container.go | 2 -- containerd-shim-v2/delete.go | 22 +++++++++++----------- containerd-shim-v2/service.go | 19 +------------------ containerd-shim-v2/wait.go | 30 ++++++++++++++++++++++++------ 4 files changed, 36 insertions(+), 37 deletions(-) diff --git a/containerd-shim-v2/container.go b/containerd-shim-v2/container.go index 5fc252be69..9dffa2cfcf 100644 --- a/containerd-shim-v2/container.go +++ b/containerd-shim-v2/container.go @@ -6,7 +6,6 @@ package containerdshim import ( - "sync" "time" "github.com/containerd/containerd/api/types/task" @@ -31,7 +30,6 @@ type container struct { stderr string bundle string cType vc.ContainerType - mu sync.Mutex exit uint32 status task.Status terminal bool diff --git a/containerd-shim-v2/delete.go b/containerd-shim-v2/delete.go index 462749215e..59e7faad41 100644 --- a/containerd-shim-v2/delete.go +++ b/containerd-shim-v2/delete.go @@ -17,21 +17,21 @@ import ( ) func deleteContainer(ctx context.Context, s *service, c *container) error { - - status, err := s.sandbox.StatusContainer(c.id) - if err != nil { - return err - } - if status.State.State != types.StateStopped { - _, err = s.sandbox.StopContainer(c.id) + if !c.cType.IsSandbox() { + status, err := s.sandbox.StatusContainer(c.id) if err != nil { return err } - } + if status.State.State != types.StateStopped { + _, err = s.sandbox.StopContainer(c.id) + if err != nil { + return err + } + } - _, err = s.sandbox.DeleteContainer(c.id) - if err != nil { - return err + if _, err = s.sandbox.DeleteContainer(c.id); err != nil { + return err + } } // Run post-stop OCI hooks. diff --git a/containerd-shim-v2/service.go b/containerd-shim-v2/service.go index 782c4a87a9..912252ae4c 100644 --- a/containerd-shim-v2/service.go +++ b/containerd-shim-v2/service.go @@ -431,27 +431,10 @@ func (s *service) Delete(ctx context.Context, r *taskAPI.DeleteRequest) (_ *task } if r.ExecID == "" { - err = deleteContainer(ctx, s, c) - if err != nil { + if err = deleteContainer(ctx, s, c); err != nil { return nil, err } - // Take care of the use case where it is a sandbox. - // Right after the container representing the sandbox has - // been deleted, let's make sure we stop and delete the - // sandbox. - if c.cType.IsSandbox() { - if err = s.sandbox.Stop(); err != nil { - logrus.WithField("sandbox", s.sandbox.ID()).Error("failed to stop sandbox") - return nil, err - } - - if err = s.sandbox.Delete(); err != nil { - logrus.WithField("sandbox", s.sandbox.ID()).Error("failed to delete sandbox") - return nil, err - } - } - s.send(&eventstypes.TaskDelete{ ContainerID: c.id, Pid: s.pid, diff --git a/containerd-shim-v2/wait.go b/containerd-shim-v2/wait.go index 121cad47b5..d36408dd5b 100644 --- a/containerd-shim-v2/wait.go +++ b/containerd-shim-v2/wait.go @@ -42,23 +42,41 @@ func wait(s *service, c *container, execID string) (int32, error) { } timeStamp := time.Now() - c.mu.Lock() + + s.mu.Lock() if execID == "" { + // Take care of the use case where it is a sandbox. + // Right after the container representing the sandbox has + // been deleted, let's make sure we stop and delete the + // sandbox. + + if c.cType.IsSandbox() { + if err = s.sandbox.Stop(); err != nil { + logrus.WithField("sandbox", s.sandbox.ID()).Error("failed to stop sandbox") + } + + if err = s.sandbox.Delete(); err != nil { + logrus.WithField("sandbox", s.sandbox.ID()).Error("failed to delete sandbox") + } + } else { + if _, err = s.sandbox.StopContainer(c.id); err != nil { + logrus.WithError(err).WithField("container", c.id).Warn("stop container failed") + } + } c.status = task.StatusStopped c.exit = uint32(ret) c.exitTime = timeStamp + + c.exitCh <- uint32(ret) + } else { execs.status = task.StatusStopped execs.exitCode = ret execs.exitTime = timeStamp - } - c.mu.Unlock() - if execID == "" { - c.exitCh <- uint32(ret) - } else { execs.exitCh <- uint32(ret) } + s.mu.Unlock() go cReap(s, int(ret), c.id, execID, timeStamp)