From f886c0bf3512383d7d4466a0c4e2bda89c174539 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 18 Jul 2019 21:04:29 -0700 Subject: [PATCH 1/9] vc: drop container SetPid API It is not used by anyone. Signed-off-by: Peng Tao --- virtcontainers/container.go | 10 ---------- virtcontainers/documentation/api/1.0/api.md | 1 - virtcontainers/interfaces.go | 1 - virtcontainers/pkg/vcmock/container.go | 6 ------ 4 files changed, 18 deletions(-) diff --git a/virtcontainers/container.go b/virtcontainers/container.go index 4ae6754333..e41db846da 100644 --- a/virtcontainers/container.go +++ b/virtcontainers/container.go @@ -368,16 +368,6 @@ func (c *Container) GetPid() int { return c.process.Pid } -// SetPid sets and stores the given pid as the pid of container's process. -func (c *Container) SetPid(pid int) error { - c.process.Pid = pid - - if !c.sandbox.supportNewStore() { - return c.storeProcess() - } - return nil -} - func (c *Container) setStateFstype(fstype string) error { c.state.Fstype = fstype diff --git a/virtcontainers/documentation/api/1.0/api.md b/virtcontainers/documentation/api/1.0/api.md index c0051e56a9..4753f5c9d1 100644 --- a/virtcontainers/documentation/api/1.0/api.md +++ b/virtcontainers/documentation/api/1.0/api.md @@ -773,7 +773,6 @@ type VCContainer interface { ID() string Sandbox() VCSandbox Process() Process - SetPid(pid int) error } ``` diff --git a/virtcontainers/interfaces.go b/virtcontainers/interfaces.go index d62389d7f0..8ce0e22fa3 100644 --- a/virtcontainers/interfaces.go +++ b/virtcontainers/interfaces.go @@ -110,5 +110,4 @@ type VCContainer interface { ID() string Sandbox() VCSandbox Process() Process - SetPid(pid int) error } diff --git a/virtcontainers/pkg/vcmock/container.go b/virtcontainers/pkg/vcmock/container.go index d93c271811..63fcc8a02b 100644 --- a/virtcontainers/pkg/vcmock/container.go +++ b/virtcontainers/pkg/vcmock/container.go @@ -38,12 +38,6 @@ func (c *Container) GetPid() int { return c.MockPid } -// SetPid implements the VCContainer function of the same name. -func (c *Container) SetPid(pid int) error { - c.MockPid = pid - return nil -} - // GetAnnotations implements the VCContainer function of the same name. func (c *Container) GetAnnotations() map[string]string { return c.MockAnnotations From c472a01006553ea4b314b7f71b194e17297d3dd6 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 18 Jul 2019 21:05:21 -0700 Subject: [PATCH 2/9] container: allow to stop a paused container When a container is paused and something goes terribly wrong, we still need to be able to clean thing up. A paused container should be able to transit to stopped state as well so that we can delete it properly. Signed-off-by: Peng Tao --- cli/delete_test.go | 6 +++--- cli/kill_test.go | 2 +- virtcontainers/container.go | 4 ---- virtcontainers/pkg/vcmock/mock.go | 4 ++-- virtcontainers/pkg/vcmock/mock_test.go | 8 ++++---- virtcontainers/pkg/vcmock/types.go | 2 +- 6 files changed, 11 insertions(+), 15 deletions(-) diff --git a/cli/delete_test.go b/cli/delete_test.go index 97768e1b44..77ded07f5e 100644 --- a/cli/delete_test.go +++ b/cli/delete_test.go @@ -200,7 +200,7 @@ func TestDeleteSandbox(t *testing.T) { assert.Error(err) assert.True(vcmock.IsMockError(err)) - testingImpl.StopSandboxFunc = func(ctx context.Context, sandboxID string) (vc.VCSandbox, error) { + testingImpl.StopSandboxFunc = func(ctx context.Context, sandboxID string, force bool) (vc.VCSandbox, error) { return sandbox, nil } @@ -310,7 +310,7 @@ func TestDeleteSandboxRunning(t *testing.T) { }, nil } - testingImpl.StopSandboxFunc = func(ctx context.Context, sandboxID string) (vc.VCSandbox, error) { + testingImpl.StopSandboxFunc = func(ctx context.Context, sandboxID string, force bool) (vc.VCSandbox, error) { return sandbox, nil } @@ -564,7 +564,7 @@ func TestDeleteCLIFunctionSuccess(t *testing.T) { }, nil } - testingImpl.StopSandboxFunc = func(ctx context.Context, sandboxID string) (vc.VCSandbox, error) { + testingImpl.StopSandboxFunc = func(ctx context.Context, sandboxID string, force bool) (vc.VCSandbox, error) { return sandbox, nil } diff --git a/cli/kill_test.go b/cli/kill_test.go index cd8910cf99..a80bc2673d 100644 --- a/cli/kill_test.go +++ b/cli/kill_test.go @@ -29,7 +29,7 @@ var ( return &vcmock.Container{}, nil } - testStopSandboxFuncReturnNil = func(ctx context.Context, sandboxID string) (vc.VCSandbox, error) { + testStopSandboxFuncReturnNil = func(ctx context.Context, sandboxID string, force bool) (vc.VCSandbox, error) { return &vcmock.Sandbox{}, nil } ) diff --git a/virtcontainers/container.go b/virtcontainers/container.go index e41db846da..a8e7dc91c4 100644 --- a/virtcontainers/container.go +++ b/virtcontainers/container.go @@ -971,10 +971,6 @@ func (c *Container) stop() error { return nil } - if c.sandbox.state.State != types.StateReady && c.sandbox.state.State != types.StateRunning { - return fmt.Errorf("Sandbox not ready or running, impossible to stop the container") - } - if err := c.state.ValidTransition(c.state.State, types.StateStopped); err != nil { return err } diff --git a/virtcontainers/pkg/vcmock/mock.go b/virtcontainers/pkg/vcmock/mock.go index b2455850fb..4b60575b9e 100644 --- a/virtcontainers/pkg/vcmock/mock.go +++ b/virtcontainers/pkg/vcmock/mock.go @@ -84,9 +84,9 @@ func (m *VCMock) StartSandbox(ctx context.Context, sandboxID string) (vc.VCSandb } // StopSandbox implements the VC function of the same name. -func (m *VCMock) StopSandbox(ctx context.Context, sandboxID string) (vc.VCSandbox, error) { +func (m *VCMock) StopSandbox(ctx context.Context, sandboxID string, force bool) (vc.VCSandbox, error) { if m.StopSandboxFunc != nil { - return m.StopSandboxFunc(ctx, sandboxID) + return m.StopSandboxFunc(ctx, sandboxID, force) } return nil, fmt.Errorf("%s: %s (%+v): sandboxID: %v", mockErrorPrefix, getSelf(), m, sandboxID) diff --git a/virtcontainers/pkg/vcmock/mock_test.go b/virtcontainers/pkg/vcmock/mock_test.go index d369e6574a..dc94d740a3 100644 --- a/virtcontainers/pkg/vcmock/mock_test.go +++ b/virtcontainers/pkg/vcmock/mock_test.go @@ -339,22 +339,22 @@ func TestVCMockStopSandbox(t *testing.T) { assert.Nil(m.StopSandboxFunc) ctx := context.Background() - _, err := m.StopSandbox(ctx, testSandboxID) + _, err := m.StopSandbox(ctx, testSandboxID, false) assert.Error(err) assert.True(IsMockError(err)) - m.StopSandboxFunc = func(ctx context.Context, sandboxID string) (vc.VCSandbox, error) { + m.StopSandboxFunc = func(ctx context.Context, sandboxID string, force bool) (vc.VCSandbox, error) { return &Sandbox{}, nil } - sandbox, err := m.StopSandbox(ctx, testSandboxID) + sandbox, err := m.StopSandbox(ctx, testSandboxID, false) assert.NoError(err) assert.Equal(sandbox, &Sandbox{}) // reset m.StopSandboxFunc = nil - _, err = m.StopSandbox(ctx, testSandboxID) + _, err = m.StopSandbox(ctx, testSandboxID, false) assert.Error(err) assert.True(IsMockError(err)) } diff --git a/virtcontainers/pkg/vcmock/types.go b/virtcontainers/pkg/vcmock/types.go index 6f7df5a3d8..9a65389cd3 100644 --- a/virtcontainers/pkg/vcmock/types.go +++ b/virtcontainers/pkg/vcmock/types.go @@ -54,7 +54,7 @@ type VCMock struct { StartSandboxFunc func(ctx context.Context, sandboxID string) (vc.VCSandbox, error) StatusSandboxFunc func(ctx context.Context, sandboxID string) (vc.SandboxStatus, error) StatsContainerFunc func(ctx context.Context, sandboxID, containerID string) (vc.ContainerStats, error) - StopSandboxFunc func(ctx context.Context, sandboxID string) (vc.VCSandbox, error) + StopSandboxFunc func(ctx context.Context, sandboxID string, force bool) (vc.VCSandbox, error) CreateContainerFunc func(ctx context.Context, sandboxID string, containerConfig vc.ContainerConfig) (vc.VCSandbox, vc.VCContainer, error) DeleteContainerFunc func(ctx context.Context, sandboxID, containerID string) (vc.VCContainer, error) From 4130913ed7e44ba527d8a6f58cbb63798bc860c5 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 18 Jul 2019 21:17:48 -0700 Subject: [PATCH 3/9] agent: mark agent dead when failing to connect Whenever we fail to connect, do not make any more attempts. More attempts are possible during cleanup phase but we should not try to connect any more there. Signed-off-by: Peng Tao --- virtcontainers/kata_agent.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/virtcontainers/kata_agent.go b/virtcontainers/kata_agent.go index d11f65c217..06b505dbcd 100644 --- a/virtcontainers/kata_agent.go +++ b/virtcontainers/kata_agent.go @@ -136,6 +136,7 @@ type kataAgent struct { keepConn bool proxyBuiltIn bool dynamicTracing bool + dead bool vmSocket interface{} ctx context.Context @@ -1581,6 +1582,9 @@ func (k *kataAgent) statsContainer(sandbox *Sandbox, c Container) (*ContainerSta } func (k *kataAgent) connect() error { + if k.dead { + return errors.New("Dead agent") + } // lockless quick pass if k.client != nil { return nil @@ -1599,6 +1603,7 @@ func (k *kataAgent) connect() error { k.Logger().WithField("url", k.state.URL).Info("New client") client, err := kataclient.NewAgentClient(k.ctx, k.state.URL, k.proxyBuiltIn) if err != nil { + k.dead = true return err } From bc4460e12fa90a67e8fb6801d99ac8d5aebe8f59 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Fri, 19 Jul 2019 00:40:34 -0700 Subject: [PATCH 4/9] sandbox: support force stop When force is true, ignore any guest related errors. This can be used to stop a sandbox when hypervisor process is dead. Signed-off-by: Peng Tao --- cli/delete.go | 6 +++--- cli/kill.go | 2 +- containerd-shim-v2/utils.go | 2 +- containerd-shim-v2/wait.go | 2 +- virtcontainers/api.go | 6 +++--- virtcontainers/api_test.go | 8 ++++---- virtcontainers/container.go | 14 +++++++------- virtcontainers/implementation.go | 4 ++-- virtcontainers/interfaces.go | 4 ++-- virtcontainers/pkg/vcmock/sandbox.go | 2 +- virtcontainers/sandbox.go | 11 ++++++----- virtcontainers/sandbox_test.go | 2 +- 12 files changed, 32 insertions(+), 31 deletions(-) diff --git a/cli/delete.go b/cli/delete.go index 19dc4fb7a2..c2ce52a468 100644 --- a/cli/delete.go +++ b/cli/delete.go @@ -112,7 +112,7 @@ func delete(ctx context.Context, containerID string, force bool) error { switch containerType { case vc.PodSandbox: - if err := deleteSandbox(ctx, sandboxID); err != nil { + if err := deleteSandbox(ctx, sandboxID, force); err != nil { return err } case vc.PodContainer: @@ -131,7 +131,7 @@ func delete(ctx context.Context, containerID string, force bool) error { return katautils.DelContainerIDMapping(ctx, containerID) } -func deleteSandbox(ctx context.Context, sandboxID string) error { +func deleteSandbox(ctx context.Context, sandboxID string, force bool) error { span, _ := katautils.Trace(ctx, "deleteSandbox") defer span.Finish() @@ -141,7 +141,7 @@ func deleteSandbox(ctx context.Context, sandboxID string) error { } if oci.StateToOCIState(status.State.State) != oci.StateStopped { - if _, err := vci.StopSandbox(ctx, sandboxID); err != nil { + if _, err := vci.StopSandbox(ctx, sandboxID, force); err != nil { return err } } diff --git a/cli/kill.go b/cli/kill.go index 4c93ecf02f..60fa41e09e 100644 --- a/cli/kill.go +++ b/cli/kill.go @@ -152,7 +152,7 @@ func kill(ctx context.Context, containerID, signal string, all bool) error { switch containerType { case vc.PodSandbox: - _, err = vci.StopSandbox(ctx, sandboxID) + _, err = vci.StopSandbox(ctx, sandboxID, signum == syscall.SIGKILL) case vc.PodContainer: _, err = vci.StopContainer(ctx, sandboxID, containerID) default: diff --git a/containerd-shim-v2/utils.go b/containerd-shim-v2/utils.go index b704b17454..c9c7a15b5e 100644 --- a/containerd-shim-v2/utils.go +++ b/containerd-shim-v2/utils.go @@ -70,7 +70,7 @@ func cleanupContainer(ctx context.Context, sid, cid, bundlePath string) error { } if len(sandbox.GetAllContainers()) == 0 { - err = sandbox.Stop() + err = sandbox.Stop(true) if err != nil { logrus.WithError(err).WithField("sandbox", sid).Warn("failed to stop sandbox") return err diff --git a/containerd-shim-v2/wait.go b/containerd-shim-v2/wait.go index d36408dd5b..92cf23963b 100644 --- a/containerd-shim-v2/wait.go +++ b/containerd-shim-v2/wait.go @@ -51,7 +51,7 @@ func wait(s *service, c *container, execID string) (int32, error) { // sandbox. if c.cType.IsSandbox() { - if err = s.sandbox.Stop(); err != nil { + if err = s.sandbox.Stop(true); err != nil { logrus.WithField("sandbox", s.sandbox.ID()).Error("failed to stop sandbox") } diff --git a/virtcontainers/api.go b/virtcontainers/api.go index 8605b978f4..76c2a96190 100644 --- a/virtcontainers/api.go +++ b/virtcontainers/api.go @@ -235,7 +235,7 @@ func StartSandbox(ctx context.Context, sandboxID string) (VCSandbox, error) { // StopSandbox is the virtcontainers sandbox stopping entry point. // StopSandbox will talk to the given agent to stop an existing sandbox and destroy all containers within that sandbox. -func StopSandbox(ctx context.Context, sandboxID string) (VCSandbox, error) { +func StopSandbox(ctx context.Context, sandboxID string, force bool) (VCSandbox, error) { span, ctx := trace(ctx, "StopSandbox") defer span.Finish() @@ -257,7 +257,7 @@ func StopSandbox(ctx context.Context, sandboxID string) (VCSandbox, error) { defer s.releaseStatelessSandbox() // Stop it. - err = s.Stop() + err = s.Stop(force) if err != nil { return nil, err } @@ -596,7 +596,7 @@ func statusContainer(sandbox *Sandbox, containerID string) (ContainerStatus, err "state": container.state.State, "pid": container.process.Pid}). Info("container isn't running") - if err := container.stop(); err != nil { + if err := container.stop(true); err != nil { return ContainerStatus{}, err } } diff --git a/virtcontainers/api_test.go b/virtcontainers/api_test.go index 371900800d..68a2bc8803 100644 --- a/virtcontainers/api_test.go +++ b/virtcontainers/api_test.go @@ -332,7 +332,7 @@ func TestStopSandboxNoopAgentSuccessful(t *testing.T) { assert.NoError(err) assert.NotNil(p) - vp, err := StopSandbox(ctx, p.ID()) + vp, err := StopSandbox(ctx, p.ID(), false) assert.NoError(err) assert.NotNil(vp) } @@ -427,7 +427,7 @@ func TestStopSandboxKataAgentSuccessful(t *testing.T) { assert.NoError(err) assert.NotNil(p) - p, err = StopSandbox(ctx, p.ID()) + p, err = StopSandbox(ctx, p.ID(), false) assert.NoError(err) assert.NotNil(p) } @@ -438,7 +438,7 @@ func TestStopSandboxFailing(t *testing.T) { sandboxDir := store.SandboxConfigurationRootPath(testSandboxID) os.Remove(sandboxDir) - p, err := StopSandbox(context.Background(), testSandboxID) + p, err := StopSandbox(context.Background(), testSandboxID, false) assert.Error(t, err) assert.Nil(t, p) } @@ -1461,7 +1461,7 @@ func createStartStopDeleteSandbox(b *testing.B, sandboxConfig SandboxConfig) { } // Stop sandbox - _, err = StopSandbox(ctx, p.ID()) + _, err = StopSandbox(ctx, p.ID(), false) if err != nil { b.Fatalf("Could not stop sandbox: %s", err) } diff --git a/virtcontainers/container.go b/virtcontainers/container.go index a8e7dc91c4..4b25a0a472 100644 --- a/virtcontainers/container.go +++ b/virtcontainers/container.go @@ -946,7 +946,7 @@ func (c *Container) start() error { if err := c.sandbox.agent.startContainer(c.sandbox, c); err != nil { c.Logger().WithError(err).Error("Failed to start container") - if err := c.stop(); err != nil { + if err := c.stop(true); err != nil { c.Logger().WithError(err).Warn("Failed to stop container") } return err @@ -955,7 +955,7 @@ func (c *Container) start() error { return c.setContainerState(types.StateRunning) } -func (c *Container) stop() error { +func (c *Container) stop(force bool) error { span, _ := c.trace("stop") defer span.Finish() @@ -999,7 +999,7 @@ func (c *Container) stop() error { // return an error, but instead try to kill it forcefully. if err := waitForShim(c.process.Pid); err != nil { // Force the container to be killed. - if err := c.kill(syscall.SIGKILL, true); err != nil { + if err := c.kill(syscall.SIGKILL, true); err != nil && !force { return err } @@ -1007,7 +1007,7 @@ func (c *Container) stop() error { // to succeed. Indeed, we have already given a second chance // to the container by trying to kill it with SIGKILL, there // is no reason to try to go further if we got an error. - if err := waitForShim(c.process.Pid); err != nil { + if err := waitForShim(c.process.Pid); err != nil && !force { return err } } @@ -1045,15 +1045,15 @@ func (c *Container) stop() error { } }() - if err := c.sandbox.agent.stopContainer(c.sandbox, *c); err != nil { + if err := c.sandbox.agent.stopContainer(c.sandbox, *c); err != nil && !force { return err } - if err := c.detachDevices(); err != nil { + if err := c.detachDevices(); err != nil && !force { return err } - if err := c.removeDrive(); err != nil { + if err := c.removeDrive(); err != nil && !force { return err } diff --git a/virtcontainers/implementation.go b/virtcontainers/implementation.go index 865a84271d..fd91535444 100644 --- a/virtcontainers/implementation.go +++ b/virtcontainers/implementation.go @@ -52,8 +52,8 @@ func (impl *VCImpl) StartSandbox(ctx context.Context, sandboxID string) (VCSandb } // StopSandbox implements the VC function of the same name. -func (impl *VCImpl) StopSandbox(ctx context.Context, sandboxID string) (VCSandbox, error) { - return StopSandbox(ctx, sandboxID) +func (impl *VCImpl) StopSandbox(ctx context.Context, sandboxID string, force bool) (VCSandbox, error) { + return StopSandbox(ctx, sandboxID, force) } // RunSandbox implements the VC function of the same name. diff --git a/virtcontainers/interfaces.go b/virtcontainers/interfaces.go index 8ce0e22fa3..38dfc5f4ed 100644 --- a/virtcontainers/interfaces.go +++ b/virtcontainers/interfaces.go @@ -32,7 +32,7 @@ type VC interface { RunSandbox(ctx context.Context, sandboxConfig SandboxConfig) (VCSandbox, error) StartSandbox(ctx context.Context, sandboxID string) (VCSandbox, error) StatusSandbox(ctx context.Context, sandboxID string) (SandboxStatus, error) - StopSandbox(ctx context.Context, sandboxID string) (VCSandbox, error) + StopSandbox(ctx context.Context, sandboxID string, force bool) (VCSandbox, error) CreateContainer(ctx context.Context, sandboxID string, containerConfig ContainerConfig) (VCSandbox, VCContainer, error) DeleteContainer(ctx context.Context, sandboxID, containerID string) (VCContainer, error) @@ -68,7 +68,7 @@ type VCSandbox interface { SetAnnotations(annotations map[string]string) error Start() error - Stop() error + Stop(force bool) error Pause() error Resume() error Release() error diff --git a/virtcontainers/pkg/vcmock/sandbox.go b/virtcontainers/pkg/vcmock/sandbox.go index 2c00d23f82..3d770db8fd 100644 --- a/virtcontainers/pkg/vcmock/sandbox.go +++ b/virtcontainers/pkg/vcmock/sandbox.go @@ -74,7 +74,7 @@ func (s *Sandbox) Start() error { } // Stop implements the VCSandbox function of the same name. -func (s *Sandbox) Stop() error { +func (s *Sandbox) Stop(force bool) error { return nil } diff --git a/virtcontainers/sandbox.go b/virtcontainers/sandbox.go index 0d43a279d7..fc60c6ab94 100644 --- a/virtcontainers/sandbox.go +++ b/virtcontainers/sandbox.go @@ -1161,7 +1161,7 @@ func (s *Sandbox) StopContainer(containerID string) (VCContainer, error) { } // Stop it. - if err := c.stop(); err != nil { + if err := c.stop(false); err != nil { return nil, err } @@ -1427,7 +1427,8 @@ func (s *Sandbox) Start() error { // Stop stops a sandbox. The containers that are making the sandbox // will be destroyed. -func (s *Sandbox) Stop() error { +// When force is true, ignore guest related stop failures. +func (s *Sandbox) Stop(force bool) error { span, _ := s.trace("stop") defer span.Finish() @@ -1441,12 +1442,12 @@ func (s *Sandbox) Stop() error { } for _, c := range s.containers { - if err := c.stop(); err != nil { + if err := c.stop(force); err != nil { return err } } - if err := s.stopVM(); err != nil { + if err := s.stopVM(); err != nil && !force { return err } @@ -1455,7 +1456,7 @@ func (s *Sandbox) Stop() error { } // Remove the network. - if err := s.removeNetwork(); err != nil { + if err := s.removeNetwork(); err != nil && !force { return err } diff --git a/virtcontainers/sandbox_test.go b/virtcontainers/sandbox_test.go index df731dcc9b..2450900023 100644 --- a/virtcontainers/sandbox_test.go +++ b/virtcontainers/sandbox_test.go @@ -1350,7 +1350,7 @@ func TestSandboxStopStopped(t *testing.T) { ctx: context.Background(), state: types.SandboxState{State: types.StateStopped}, } - err := s.Stop() + err := s.Stop(false) assert.Nil(t, err) } From 835b6e9e1b4bfbc7a5926d915babad0d74516923 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Fri, 19 Jul 2019 01:07:49 -0700 Subject: [PATCH 5/9] sandbox: do not fail SIGKILL Once we have found the container, we should never fail SIGKILL. It is possible to fail to send SIGKILL because hypervisor might be gone already. If we fail SIGKILL, upper layer cannot really proceed to clean things up. Also there is no need to save sandbox here as we did not change any state. Signed-off-by: Peng Tao --- virtcontainers/sandbox.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/virtcontainers/sandbox.go b/virtcontainers/sandbox.go index fc60c6ab94..1356bfcc82 100644 --- a/virtcontainers/sandbox.go +++ b/virtcontainers/sandbox.go @@ -1180,14 +1180,15 @@ func (s *Sandbox) KillContainer(containerID string, signal syscall.Signal, all b } // Send a signal to the process. - if err := c.kill(signal, all); err != nil { - return err + err = c.kill(signal, all) + + // SIGKILL should never fail otherwise it is + // impossible to clean things up. + if signal == syscall.SIGKILL { + return nil } - if err = s.storeSandbox(); err != nil { - return err - } - return nil + return err } // DeleteContainer deletes a container from the sandbox From 67c401c0591eef4015fe915cd39decc652c3dc5a Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Fri, 19 Jul 2019 03:40:10 -0700 Subject: [PATCH 6/9] agent: use hypervisor pid as backup proxy pid for non-kata proxy cases Then we can check hypervisor liveness in those cases to avoid long timeout when connecting to the agent when hypervisor is dead. For kata-agent, we still use the kata-proxy pid for the same purpose. Signed-off-by: Peng Tao --- virtcontainers/kata_agent.go | 17 +++++++++-------- virtcontainers/kata_builtin_proxy.go | 2 +- virtcontainers/no_proxy.go | 2 +- virtcontainers/noop_proxy.go | 2 +- virtcontainers/proxy.go | 1 + 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/virtcontainers/kata_agent.go b/virtcontainers/kata_agent.go index 06b505dbcd..2260bb64ad 100644 --- a/virtcontainers/kata_agent.go +++ b/virtcontainers/kata_agent.go @@ -616,6 +616,7 @@ func (k *kataAgent) startProxy(sandbox *Sandbox) error { proxyParams := proxyParams{ id: sandbox.id, + hid: sandbox.hypervisor.pid(), path: sandbox.config.ProxyConfig.Path, agentURL: agentURL, consoleURL: consoleURL, @@ -1600,7 +1601,14 @@ func (k *kataAgent) connect() error { return nil } - k.Logger().WithField("url", k.state.URL).Info("New client") + if k.state.ProxyPid > 0 { + // check that proxy is running before talk with it avoiding long timeouts + if err := syscall.Kill(k.state.ProxyPid, syscall.Signal(0)); err != nil { + return errors.New("Proxy is not running") + } + } + + k.Logger().WithField("url", k.state.URL).WithField("proxy", k.state.ProxyPid).Info("New client") client, err := kataclient.NewAgentClient(k.ctx, k.state.URL, k.proxyBuiltIn) if err != nil { k.dead = true @@ -1792,13 +1800,6 @@ func (k *kataAgent) sendReq(request interface{}) (interface{}, error) { span.SetTag("request", request) defer span.Finish() - if k.state.ProxyPid > 0 { - // check that proxy is running before talk with it avoiding long timeouts - if err := syscall.Kill(k.state.ProxyPid, syscall.Signal(0)); err != nil { - return nil, fmt.Errorf("Proxy is not running: %v", err) - } - } - if err := k.connect(); err != nil { return nil, err } diff --git a/virtcontainers/kata_builtin_proxy.go b/virtcontainers/kata_builtin_proxy.go index 8ac90d84de..a3717dfd32 100644 --- a/virtcontainers/kata_builtin_proxy.go +++ b/virtcontainers/kata_builtin_proxy.go @@ -64,7 +64,7 @@ func (p *kataBuiltInProxy) start(params proxyParams) (int, string, error) { } } - return -1, params.agentURL, nil + return params.hid, params.agentURL, nil } // stop is the proxy stop implementation for kata builtin proxy. diff --git a/virtcontainers/no_proxy.go b/virtcontainers/no_proxy.go index a97270c3d2..8fb54d9f2d 100644 --- a/virtcontainers/no_proxy.go +++ b/virtcontainers/no_proxy.go @@ -34,7 +34,7 @@ func (p *noProxy) start(params proxyParams) (int, string, error) { return -1, "", fmt.Errorf("AgentURL cannot be empty") } - return 0, params.agentURL, nil + return params.hid, params.agentURL, nil } // stop is noProxy stop implementation for proxy interface. diff --git a/virtcontainers/noop_proxy.go b/virtcontainers/noop_proxy.go index 5a80f7b60d..d72f71e574 100644 --- a/virtcontainers/noop_proxy.go +++ b/virtcontainers/noop_proxy.go @@ -14,7 +14,7 @@ var noopProxyURL = "noopProxyURL" // register is the proxy start implementation for testing purpose. // It does nothing. func (p *noopProxy) start(params proxyParams) (int, string, error) { - return 0, noopProxyURL, nil + return params.hid, noopProxyURL, nil } // stop is the proxy stop implementation for testing purpose. diff --git a/virtcontainers/proxy.go b/virtcontainers/proxy.go index 46c266d30a..8c0ec4a49c 100644 --- a/virtcontainers/proxy.go +++ b/virtcontainers/proxy.go @@ -28,6 +28,7 @@ type proxyParams struct { agentURL string consoleURL string logger *logrus.Entry + hid int debug bool } From 262484de688539dd2ef815f6f101b60b7c6ac94e Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Fri, 19 Jul 2019 04:01:56 -0700 Subject: [PATCH 7/9] monitor: watch hypervisor When hypervisor process is dead, notify watchers and mark agent dead. Signed-off-by: Peng Tao --- virtcontainers/agent.go | 3 +++ virtcontainers/kata_agent.go | 6 ++++++ virtcontainers/monitor.go | 19 +++++++++++++++++-- virtcontainers/noop_agent.go | 3 +++ 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/virtcontainers/agent.go b/virtcontainers/agent.go index bd4dd90676..c1e685c168 100644 --- a/virtcontainers/agent.go +++ b/virtcontainers/agent.go @@ -250,6 +250,9 @@ type agent interface { // copyFile copies file from host to container's rootfs copyFile(src, dst string) error + // markDead tell agent that the guest is dead + markDead() + // cleanup removes all on disk information generated by the agent cleanup(id string) } diff --git a/virtcontainers/kata_agent.go b/virtcontainers/kata_agent.go index 2260bb64ad..d90ca12f1a 100644 --- a/virtcontainers/kata_agent.go +++ b/virtcontainers/kata_agent.go @@ -2069,6 +2069,12 @@ func (k *kataAgent) copyFile(src, dst string) error { return nil } +func (k *kataAgent) markDead() { + k.Logger().Infof("mark agent dead") + k.dead = true + k.disconnect() +} + func (k *kataAgent) cleanup(id string) { path := k.getSharePath(id) k.Logger().WithField("path", path).Infof("cleanup agent") diff --git a/virtcontainers/monitor.go b/virtcontainers/monitor.go index 8e2a879746..25c55f4250 100644 --- a/virtcontainers/monitor.go +++ b/virtcontainers/monitor.go @@ -7,10 +7,13 @@ package virtcontainers import ( "sync" + "syscall" "time" + + "github.com/pkg/errors" ) -const defaultCheckInterval = 10 * time.Second +const defaultCheckInterval = 1 * time.Second type monitor struct { sync.Mutex @@ -52,6 +55,7 @@ func (m *monitor) newWatcher() (chan error, error) { m.wg.Done() return case <-tick.C: + m.watchHypervisor() m.watchAgent() } } @@ -62,6 +66,8 @@ func (m *monitor) newWatcher() (chan error, error) { } func (m *monitor) notify(err error) { + m.sandbox.agent.markDead() + m.Lock() defer m.Unlock() @@ -115,6 +121,15 @@ func (m *monitor) stop() { func (m *monitor) watchAgent() { err := m.sandbox.agent.check() if err != nil { - m.notify(err) + // TODO: define and export error types + m.notify(errors.Wrapf(err, "failed to ping agent")) } } + +func (m *monitor) watchHypervisor() error { + if err := syscall.Kill(m.sandbox.hypervisor.pid(), syscall.Signal(0)); err != nil { + m.notify(errors.Wrapf(err, "failed to ping hypervisor process")) + return err + } + return nil +} diff --git a/virtcontainers/noop_agent.go b/virtcontainers/noop_agent.go index 592634a70e..5ebfe5faee 100644 --- a/virtcontainers/noop_agent.go +++ b/virtcontainers/noop_agent.go @@ -228,5 +228,8 @@ func (n *noopAgent) copyFile(src, dst string) error { return nil } +func (n *noopAgent) markDead() { +} + func (n *noopAgent) cleanup(id string) { } From e02f6dc067494935f6079069018c4e6c9eb4373c Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Fri, 19 Jul 2019 04:22:43 -0700 Subject: [PATCH 8/9] shimv2: monitor sandbox liveness When sandbox quits unexpected, clean things up as much as we can. Fixes: #1896 Signed-off-by: Peng Tao --- containerd-shim-v2/service.go | 1 + containerd-shim-v2/start.go | 6 ++++++ containerd-shim-v2/wait.go | 40 +++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+) diff --git a/containerd-shim-v2/service.go b/containerd-shim-v2/service.go index 9103885080..af3dd47da5 100644 --- a/containerd-shim-v2/service.go +++ b/containerd-shim-v2/service.go @@ -114,6 +114,7 @@ type service struct { containers map[string]*container config *oci.RuntimeConfig events chan interface{} + monitor chan error cancel func() diff --git a/containerd-shim-v2/start.go b/containerd-shim-v2/start.go index 71a90ed131..173ca7c769 100644 --- a/containerd-shim-v2/start.go +++ b/containerd-shim-v2/start.go @@ -30,6 +30,12 @@ func startContainer(ctx context.Context, s *service, c *container) error { if err != nil { return err } + // Start monitor after starting sandbox + s.monitor, err = s.sandbox.Monitor() + if err != nil { + return err + } + go watchSandbox(s) } else { _, err := s.sandbox.StartContainer(c.id) if err != nil { diff --git a/containerd-shim-v2/wait.go b/containerd-shim-v2/wait.go index 92cf23963b..f99bca47f2 100644 --- a/containerd-shim-v2/wait.go +++ b/containerd-shim-v2/wait.go @@ -6,9 +6,11 @@ package containerdshim import ( + "path" "time" "github.com/containerd/containerd/api/types/task" + "github.com/containerd/containerd/mount" "github.com/sirupsen/logrus" ) @@ -82,3 +84,41 @@ func wait(s *service, c *container, execID string) (int32, error) { return ret, nil } + +func watchSandbox(s *service) { + if s.monitor == nil { + return + } + err := <-s.monitor + if err == nil { + return + } + s.monitor = nil + + s.mu.Lock() + defer s.mu.Unlock() + // sandbox malfunctioning, cleanup as much as we can + logrus.WithError(err).Warn("sandbox stopped unexpectedly") + err = s.sandbox.Stop(true) + if err != nil { + logrus.WithError(err).Warn("stop sandbox failed") + } + err = s.sandbox.Delete() + if err != nil { + logrus.WithError(err).Warn("delete sandbox failed") + } + + if s.mount { + for _, c := range s.containers { + rootfs := path.Join(c.bundle, "rootfs") + logrus.WithField("rootfs", rootfs).WithField("id", c.id).Debug("container umount rootfs") + if err := mount.UnmountAll(rootfs, 0); err != nil { + logrus.WithError(err).Warn("failed to cleanup rootfs mount") + } + } + } + s.containers = make(map[string]*container) + + // Existing container/exec will be cleaned up by its waiters. + // No need to send async events here. +} From d5d7d82eeb519dd3585d6b54a1e6fa28e7556af9 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Sun, 21 Jul 2019 20:01:04 -0700 Subject: [PATCH 9/9] vc: move container mount cleanup to container.go For one thing, it is container specific resource so it should not be cleaned up by the agent. For another thing, we can make container stop to force cleanup these host mountpoints regardless of hypervisor and agent liveness. Signed-off-by: Peng Tao --- virtcontainers/agent.go | 2 +- virtcontainers/api_test.go | 12 ++++++++++++ virtcontainers/container.go | 8 ++++++++ virtcontainers/kata_agent.go | 22 +++++++--------------- virtcontainers/kata_agent_test.go | 4 ++-- virtcontainers/noop_agent.go | 2 +- virtcontainers/sandbox.go | 2 +- 7 files changed, 32 insertions(+), 20 deletions(-) diff --git a/virtcontainers/agent.go b/virtcontainers/agent.go index c1e685c168..050de36aa9 100644 --- a/virtcontainers/agent.go +++ b/virtcontainers/agent.go @@ -254,5 +254,5 @@ type agent interface { markDead() // cleanup removes all on disk information generated by the agent - cleanup(id string) + cleanup(s *Sandbox) } diff --git a/virtcontainers/api_test.go b/virtcontainers/api_test.go index 68a2bc8803..dff982d534 100644 --- a/virtcontainers/api_test.go +++ b/virtcontainers/api_test.go @@ -322,6 +322,9 @@ func TestStartSandboxFailing(t *testing.T) { } func TestStopSandboxNoopAgentSuccessful(t *testing.T) { + if tc.NotValid(ktu.NeedRoot()) { + t.Skip(testDisabledAsNonRoot) + } defer cleanUp() assert := assert.New(t) @@ -910,6 +913,9 @@ func TestStartContainerFailingSandboxNotStarted(t *testing.T) { } func TestStopContainerNoopAgentSuccessful(t *testing.T) { + if tc.NotValid(ktu.NeedRoot()) { + t.Skip(testDisabledAsNonRoot) + } defer cleanUp() assert := assert.New(t) @@ -942,6 +948,9 @@ func TestStopContainerNoopAgentSuccessful(t *testing.T) { } func TestStopContainerFailingNoSandbox(t *testing.T) { + if tc.NotValid(ktu.NeedRoot()) { + t.Skip(testDisabledAsNonRoot) + } defer cleanUp() contID := "100" @@ -951,6 +960,9 @@ func TestStopContainerFailingNoSandbox(t *testing.T) { } func TestStopContainerFailingNoContainer(t *testing.T) { + if tc.NotValid(ktu.NeedRoot()) { + t.Skip(testDisabledAsNonRoot) + } defer cleanUp() assert := assert.New(t) diff --git a/virtcontainers/container.go b/virtcontainers/container.go index 4b25a0a472..fd52493fb2 100644 --- a/virtcontainers/container.go +++ b/virtcontainers/container.go @@ -1049,6 +1049,14 @@ func (c *Container) stop(force bool) error { return err } + if err := c.unmountHostMounts(); err != nil && !force { + return err + } + + if err := bindUnmountContainerRootfs(c.ctx, kataHostSharedDir, c.sandbox.id, c.id); err != nil && !force { + return err + } + if err := c.detachDevices(); err != nil && !force { return err } diff --git a/virtcontainers/kata_agent.go b/virtcontainers/kata_agent.go index d90ca12f1a..6e1579ee24 100644 --- a/virtcontainers/kata_agent.go +++ b/virtcontainers/kata_agent.go @@ -1419,19 +1419,8 @@ func (k *kataAgent) stopContainer(sandbox *Sandbox, c Container) error { span, _ := k.trace("stopContainer") defer span.Finish() - req := &grpc.RemoveContainerRequest{ - ContainerId: c.id, - } - - if _, err := k.sendReq(req); err != nil { - return err - } - - if err := c.unmountHostMounts(); err != nil { - return err - } - - return bindUnmountContainerRootfs(k.ctx, kataHostSharedDir, sandbox.id, c.id) + _, err := k.sendReq(&grpc.RemoveContainerRequest{ContainerId: c.id}) + return err } func (k *kataAgent) signalProcess(c *Container, processID string, signal syscall.Signal, all bool) error { @@ -2075,9 +2064,12 @@ func (k *kataAgent) markDead() { k.disconnect() } -func (k *kataAgent) cleanup(id string) { - path := k.getSharePath(id) +func (k *kataAgent) cleanup(s *Sandbox) { + path := k.getSharePath(s.id) k.Logger().WithField("path", path).Infof("cleanup agent") + if err := bindUnmountAllRootfs(k.ctx, path, s); err != nil { + k.Logger().WithError(err).Errorf("failed to unmount container share path %s", path) + } if err := os.RemoveAll(path); err != nil { k.Logger().WithError(err).Errorf("failed to cleanup vm share path %s", path) } diff --git a/virtcontainers/kata_agent_test.go b/virtcontainers/kata_agent_test.go index e33524f879..3c4576ad0c 100644 --- a/virtcontainers/kata_agent_test.go +++ b/virtcontainers/kata_agent_test.go @@ -933,8 +933,8 @@ func TestKataCleanupSandbox(t *testing.T) { err := os.MkdirAll(dir, 0777) assert.Nil(err) - k := &kataAgent{} - k.cleanup(s.id) + k := &kataAgent{ctx: context.Background()} + k.cleanup(&s) _, err = os.Stat(dir) assert.False(os.IsExist(err)) diff --git a/virtcontainers/noop_agent.go b/virtcontainers/noop_agent.go index 5ebfe5faee..307caa7f2a 100644 --- a/virtcontainers/noop_agent.go +++ b/virtcontainers/noop_agent.go @@ -231,5 +231,5 @@ func (n *noopAgent) copyFile(src, dst string) error { func (n *noopAgent) markDead() { } -func (n *noopAgent) cleanup(id string) { +func (n *noopAgent) cleanup(s *Sandbox) { } diff --git a/virtcontainers/sandbox.go b/virtcontainers/sandbox.go index 1356bfcc82..3095891a92 100644 --- a/virtcontainers/sandbox.go +++ b/virtcontainers/sandbox.go @@ -764,7 +764,7 @@ func (s *Sandbox) Delete() error { s.Logger().WithError(err).Error("failed to cleanup hypervisor") } - s.agent.cleanup(s.id) + s.agent.cleanup(s) return s.store.Delete() }