From 9256e590dce9b38869946dfa47a3d596942fc956 Mon Sep 17 00:00:00 2001 From: "James O. D. Hunt" Date: Thu, 15 Apr 2021 12:01:19 +0100 Subject: [PATCH] shutdown: Don't sever console watcher too early Fixed logic used to handle static agent tracing. For a standard (untraced) hypervisor shutdown, the runtime kills the VM process once the workload has finished. But if static agent tracing is enabled, the agent running inside the VM is responsible for the shutdown. The existing code handled this scenario but did not wait for the hypervisor process to end. The outcome of this being that the console watcher thread was killed too early. Although not a problem for an untraced system, if static agent tracing was enabled, the logs from the hypervisor would be truncated, missing the crucial final stages of the agents shutdown sequence. The fix necessitated adding a new parameter to the `stopSandbox()` API, which if true requests the runtime hypervisor logic simply to wait for the hypervisor process to exit rather than killing it. Fixes: #1696. Signed-off-by: James O. D. Hunt --- src/runtime/virtcontainers/acrn.go | 7 ++++- src/runtime/virtcontainers/clh.go | 8 +++--- src/runtime/virtcontainers/fc.go | 13 ++++++--- src/runtime/virtcontainers/hypervisor.go | 4 ++- src/runtime/virtcontainers/mock_hypervisor.go | 2 +- .../virtcontainers/mock_hypervisor_test.go | 2 +- src/runtime/virtcontainers/qemu.go | 28 +++++++++++++++---- src/runtime/virtcontainers/sandbox.go | 11 ++------ src/runtime/virtcontainers/vm.go | 4 +-- 9 files changed, 51 insertions(+), 28 deletions(-) diff --git a/src/runtime/virtcontainers/acrn.go b/src/runtime/virtcontainers/acrn.go index 3f99350438..93359e16ad 100644 --- a/src/runtime/virtcontainers/acrn.go +++ b/src/runtime/virtcontainers/acrn.go @@ -479,7 +479,7 @@ func (a *Acrn) waitSandbox(ctx context.Context, timeoutSecs int) error { } // stopSandbox will stop the Sandbox's VM. -func (a *Acrn) stopSandbox(ctx context.Context) (err error) { +func (a *Acrn) stopSandbox(ctx context.Context, waitOnly bool) (err error) { span, _ := a.trace(ctx, "stopSandbox") defer span.End() @@ -513,6 +513,11 @@ func (a *Acrn) stopSandbox(ctx context.Context) (err error) { shutdownSignal := syscall.SIGINT + if waitOnly { + // NOP + shutdownSignal = syscall.Signal(0) + } + return utils.WaitLocalProcess(pid, acrnStopSandboxTimeoutSecs, shutdownSignal, a.Logger()) } diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index 2fc74c0760..3f9a364d6d 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -652,11 +652,11 @@ func (clh *cloudHypervisor) resumeSandbox(ctx context.Context) error { } // stopSandbox will stop the Sandbox's VM. -func (clh *cloudHypervisor) stopSandbox(ctx context.Context) (err error) { +func (clh *cloudHypervisor) stopSandbox(ctx context.Context, waitOnly bool) (err error) { span, _ := clh.trace(ctx, "stopSandbox") defer span.End() clh.Logger().WithField("function", "stopSandbox").Info("Stop Sandbox") - return clh.terminate(ctx) + return clh.terminate(ctx, waitOnly) } func (clh *cloudHypervisor) fromGrpc(ctx context.Context, hypervisorConfig *HypervisorConfig, j []byte) error { @@ -756,7 +756,7 @@ func (clh *cloudHypervisor) trace(parent context.Context, name string) (otelTrac return span, ctx } -func (clh *cloudHypervisor) terminate(ctx context.Context) (err error) { +func (clh *cloudHypervisor) terminate(ctx context.Context, waitOnly bool) (err error) { span, _ := clh.trace(ctx, "terminate") defer span.End() @@ -775,7 +775,7 @@ func (clh *cloudHypervisor) terminate(ctx context.Context) (err error) { clh.Logger().Debug("Stopping Cloud Hypervisor") - if pidRunning { + if pidRunning && !waitOnly { clhRunning, _ := clh.isClhRunning(clhStopSandboxTimeout) if clhRunning { ctx, cancel := context.WithTimeout(context.Background(), clhStopSandboxTimeout*time.Second) diff --git a/src/runtime/virtcontainers/fc.go b/src/runtime/virtcontainers/fc.go index c0644ed718..7dac0e78a0 100644 --- a/src/runtime/virtcontainers/fc.go +++ b/src/runtime/virtcontainers/fc.go @@ -409,7 +409,7 @@ func (fc *firecracker) fcInit(ctx context.Context, timeout int) error { return nil } -func (fc *firecracker) fcEnd(ctx context.Context) (err error) { +func (fc *firecracker) fcEnd(ctx context.Context, waitOnly bool) (err error) { span, _ := fc.trace(ctx, "fcEnd") defer span.End() @@ -427,6 +427,11 @@ func (fc *firecracker) fcEnd(ctx context.Context) (err error) { shutdownSignal := syscall.SIGTERM + if waitOnly { + // NOP + shutdownSignal = syscall.Signal(0) + } + // Wait for the VM process to terminate return utils.WaitLocalProcess(pid, fcStopSandboxTimeout, shutdownSignal, fc.Logger()) } @@ -760,7 +765,7 @@ func (fc *firecracker) startSandbox(ctx context.Context, timeout int) error { var err error defer func() { if err != nil { - fc.fcEnd(ctx) + fc.fcEnd(ctx, false) } }() @@ -853,11 +858,11 @@ func (fc *firecracker) cleanupJail(ctx context.Context) { } // stopSandbox will stop the Sandbox's VM. -func (fc *firecracker) stopSandbox(ctx context.Context) (err error) { +func (fc *firecracker) stopSandbox(ctx context.Context, waitOnly bool) (err error) { span, _ := fc.trace(ctx, "stopSandbox") defer span.End() - return fc.fcEnd(ctx) + return fc.fcEnd(ctx, waitOnly) } func (fc *firecracker) pauseSandbox(ctx context.Context) error { diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index a7fb093cd9..dc6fedf063 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -787,7 +787,9 @@ func generateVMSocket(id string, vmStogarePath string) (interface{}, error) { type hypervisor interface { createSandbox(ctx context.Context, id string, networkNS NetworkNamespace, hypervisorConfig *HypervisorConfig) error startSandbox(ctx context.Context, timeout int) error - stopSandbox(ctx context.Context) error + // If wait is set, don't actively stop the sandbox: + // just perform cleanup. + stopSandbox(ctx context.Context, waitOnly bool) error pauseSandbox(ctx context.Context) error saveSandbox() error resumeSandbox(ctx context.Context) error diff --git a/src/runtime/virtcontainers/mock_hypervisor.go b/src/runtime/virtcontainers/mock_hypervisor.go index fd2a6161b7..af19130f40 100644 --- a/src/runtime/virtcontainers/mock_hypervisor.go +++ b/src/runtime/virtcontainers/mock_hypervisor.go @@ -41,7 +41,7 @@ func (m *mockHypervisor) startSandbox(ctx context.Context, timeout int) error { return nil } -func (m *mockHypervisor) stopSandbox(ctx context.Context) error { +func (m *mockHypervisor) stopSandbox(ctx context.Context, waitOnly bool) error { return nil } diff --git a/src/runtime/virtcontainers/mock_hypervisor_test.go b/src/runtime/virtcontainers/mock_hypervisor_test.go index dece251652..c557f86580 100644 --- a/src/runtime/virtcontainers/mock_hypervisor_test.go +++ b/src/runtime/virtcontainers/mock_hypervisor_test.go @@ -53,7 +53,7 @@ func TestMockHypervisorStartSandbox(t *testing.T) { func TestMockHypervisorStopSandbox(t *testing.T) { var m *mockHypervisor - assert.NoError(t, m.stopSandbox(context.Background())) + assert.NoError(t, m.stopSandbox(context.Background(), false)) } func TestMockHypervisorAddDevice(t *testing.T) { diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 98f9a844ee..e44d16faa9 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -122,6 +122,8 @@ const ( scsiControllerID = "scsi0" rngID = "rng0" fallbackFileBackedMemDir = "/dev/shm" + + qemuStopSandboxTimeoutSecs = 15 ) // agnostic list of kernel parameters @@ -703,7 +705,7 @@ func (q *qemu) setupVirtiofsd(ctx context.Context) (err error) { q.Logger().Info("virtiofsd quits") // Wait to release resources of virtiofsd process cmd.Process.Wait() - q.stopSandbox(ctx) + q.stopSandbox(ctx, false) }() return err } @@ -933,7 +935,7 @@ func (q *qemu) waitSandbox(ctx context.Context, timeout int) error { } // stopSandbox will stop the Sandbox's VM. -func (q *qemu) stopSandbox(ctx context.Context) error { +func (q *qemu) stopSandbox(ctx context.Context, waitOnly bool) error { span, _ := q.trace(ctx, "stopSandbox") defer span.End() @@ -965,10 +967,24 @@ func (q *qemu) stopSandbox(ctx context.Context) error { return err } - err := q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx) - if err != nil { - q.Logger().WithError(err).Error("Fail to execute qmp QUIT") - return err + if waitOnly { + pids := q.getPids() + if len(pids) == 0 { + return errors.New("cannot determine QEMU PID") + } + + pid := pids[0] + + err := utils.WaitLocalProcess(pid, qemuStopSandboxTimeoutSecs, syscall.Signal(0), q.Logger()) + if err != nil { + return err + } + } else { + err := q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx) + if err != nil { + q.Logger().WithError(err).Error("Fail to execute qmp QUIT") + return err + } } return nil diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index e758071fdb..a9370e0d58 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -1027,7 +1027,7 @@ func (s *Sandbox) startVM(ctx context.Context) (err error) { defer func() { if err != nil { - s.hypervisor.stopSandbox(ctx) + s.hypervisor.stopSandbox(ctx, false) } }() @@ -1081,14 +1081,9 @@ func (s *Sandbox) stopVM(ctx context.Context) error { s.Logger().WithError(err).WithField("sandboxid", s.id).Warning("Agent did not stop sandbox") } - if s.disableVMShutdown { - // Do not kill the VM - allow the agent to shut it down - // (only used to support static agent tracing). - return nil - } - s.Logger().Info("Stopping VM") - return s.hypervisor.stopSandbox(ctx) + + return s.hypervisor.stopSandbox(ctx, s.disableVMShutdown) } func (s *Sandbox) addContainer(c *Container) error { diff --git a/src/runtime/virtcontainers/vm.go b/src/runtime/virtcontainers/vm.go index c94a26fc5b..e6f02b6e07 100644 --- a/src/runtime/virtcontainers/vm.go +++ b/src/runtime/virtcontainers/vm.go @@ -137,7 +137,7 @@ func NewVM(ctx context.Context, config VMConfig) (*VM, error) { defer func() { if err != nil { virtLog.WithField("vm", id).WithError(err).Info("clean up vm") - hypervisor.stopSandbox(ctx) + hypervisor.stopSandbox(ctx, false) } }() @@ -251,7 +251,7 @@ func (v *VM) Disconnect(ctx context.Context) error { func (v *VM) Stop(ctx context.Context) error { v.logger().Info("stop vm") - if err := v.hypervisor.stopSandbox(ctx); err != nil { + if err := v.hypervisor.stopSandbox(ctx, false); err != nil { return err }