shutdown: Don't sever console watcher too early

Fixed logic used to handle static agent tracing.

For a standard (untraced) hypervisor shutdown, the runtime kills the VM
process once the workload has finished. But if static agent tracing is
enabled, the agent running inside the VM is responsible for the
shutdown. The existing code handled this scenario but did not wait for
the hypervisor process to end. The outcome of this being that the
console watcher thread was killed too early.

Although not a problem for an untraced system, if static agent tracing
was enabled, the logs from the hypervisor would be truncated, missing the
crucial final stages of the agents shutdown sequence.

The fix necessitated adding a new parameter to the `stopSandbox()` API,
which if true requests the runtime hypervisor logic simply to wait for
the hypervisor process to exit rather than killing it.

Fixes: #1696.

Signed-off-by: James O. D. Hunt <james.o.hunt@intel.com>
This commit is contained in:
James O. D. Hunt 2021-04-15 12:01:19 +01:00
parent 51ab870091
commit 9256e590dc
9 changed files with 51 additions and 28 deletions

View File

@ -479,7 +479,7 @@ func (a *Acrn) waitSandbox(ctx context.Context, timeoutSecs int) error {
} }
// stopSandbox will stop the Sandbox's VM. // stopSandbox will stop the Sandbox's VM.
func (a *Acrn) stopSandbox(ctx context.Context) (err error) { func (a *Acrn) stopSandbox(ctx context.Context, waitOnly bool) (err error) {
span, _ := a.trace(ctx, "stopSandbox") span, _ := a.trace(ctx, "stopSandbox")
defer span.End() defer span.End()
@ -513,6 +513,11 @@ func (a *Acrn) stopSandbox(ctx context.Context) (err error) {
shutdownSignal := syscall.SIGINT shutdownSignal := syscall.SIGINT
if waitOnly {
// NOP
shutdownSignal = syscall.Signal(0)
}
return utils.WaitLocalProcess(pid, acrnStopSandboxTimeoutSecs, shutdownSignal, a.Logger()) return utils.WaitLocalProcess(pid, acrnStopSandboxTimeoutSecs, shutdownSignal, a.Logger())
} }

View File

@ -652,11 +652,11 @@ func (clh *cloudHypervisor) resumeSandbox(ctx context.Context) error {
} }
// stopSandbox will stop the Sandbox's VM. // stopSandbox will stop the Sandbox's VM.
func (clh *cloudHypervisor) stopSandbox(ctx context.Context) (err error) { func (clh *cloudHypervisor) stopSandbox(ctx context.Context, waitOnly bool) (err error) {
span, _ := clh.trace(ctx, "stopSandbox") span, _ := clh.trace(ctx, "stopSandbox")
defer span.End() defer span.End()
clh.Logger().WithField("function", "stopSandbox").Info("Stop Sandbox") clh.Logger().WithField("function", "stopSandbox").Info("Stop Sandbox")
return clh.terminate(ctx) return clh.terminate(ctx, waitOnly)
} }
func (clh *cloudHypervisor) fromGrpc(ctx context.Context, hypervisorConfig *HypervisorConfig, j []byte) error { func (clh *cloudHypervisor) fromGrpc(ctx context.Context, hypervisorConfig *HypervisorConfig, j []byte) error {
@ -756,7 +756,7 @@ func (clh *cloudHypervisor) trace(parent context.Context, name string) (otelTrac
return span, ctx return span, ctx
} }
func (clh *cloudHypervisor) terminate(ctx context.Context) (err error) { func (clh *cloudHypervisor) terminate(ctx context.Context, waitOnly bool) (err error) {
span, _ := clh.trace(ctx, "terminate") span, _ := clh.trace(ctx, "terminate")
defer span.End() defer span.End()
@ -775,7 +775,7 @@ func (clh *cloudHypervisor) terminate(ctx context.Context) (err error) {
clh.Logger().Debug("Stopping Cloud Hypervisor") clh.Logger().Debug("Stopping Cloud Hypervisor")
if pidRunning { if pidRunning && !waitOnly {
clhRunning, _ := clh.isClhRunning(clhStopSandboxTimeout) clhRunning, _ := clh.isClhRunning(clhStopSandboxTimeout)
if clhRunning { if clhRunning {
ctx, cancel := context.WithTimeout(context.Background(), clhStopSandboxTimeout*time.Second) ctx, cancel := context.WithTimeout(context.Background(), clhStopSandboxTimeout*time.Second)

View File

@ -409,7 +409,7 @@ func (fc *firecracker) fcInit(ctx context.Context, timeout int) error {
return nil return nil
} }
func (fc *firecracker) fcEnd(ctx context.Context) (err error) { func (fc *firecracker) fcEnd(ctx context.Context, waitOnly bool) (err error) {
span, _ := fc.trace(ctx, "fcEnd") span, _ := fc.trace(ctx, "fcEnd")
defer span.End() defer span.End()
@ -427,6 +427,11 @@ func (fc *firecracker) fcEnd(ctx context.Context) (err error) {
shutdownSignal := syscall.SIGTERM shutdownSignal := syscall.SIGTERM
if waitOnly {
// NOP
shutdownSignal = syscall.Signal(0)
}
// Wait for the VM process to terminate // Wait for the VM process to terminate
return utils.WaitLocalProcess(pid, fcStopSandboxTimeout, shutdownSignal, fc.Logger()) return utils.WaitLocalProcess(pid, fcStopSandboxTimeout, shutdownSignal, fc.Logger())
} }
@ -760,7 +765,7 @@ func (fc *firecracker) startSandbox(ctx context.Context, timeout int) error {
var err error var err error
defer func() { defer func() {
if err != nil { if err != nil {
fc.fcEnd(ctx) fc.fcEnd(ctx, false)
} }
}() }()
@ -853,11 +858,11 @@ func (fc *firecracker) cleanupJail(ctx context.Context) {
} }
// stopSandbox will stop the Sandbox's VM. // stopSandbox will stop the Sandbox's VM.
func (fc *firecracker) stopSandbox(ctx context.Context) (err error) { func (fc *firecracker) stopSandbox(ctx context.Context, waitOnly bool) (err error) {
span, _ := fc.trace(ctx, "stopSandbox") span, _ := fc.trace(ctx, "stopSandbox")
defer span.End() defer span.End()
return fc.fcEnd(ctx) return fc.fcEnd(ctx, waitOnly)
} }
func (fc *firecracker) pauseSandbox(ctx context.Context) error { func (fc *firecracker) pauseSandbox(ctx context.Context) error {

View File

@ -787,7 +787,9 @@ func generateVMSocket(id string, vmStogarePath string) (interface{}, error) {
type hypervisor interface { type hypervisor interface {
createSandbox(ctx context.Context, id string, networkNS NetworkNamespace, hypervisorConfig *HypervisorConfig) error createSandbox(ctx context.Context, id string, networkNS NetworkNamespace, hypervisorConfig *HypervisorConfig) error
startSandbox(ctx context.Context, timeout int) error startSandbox(ctx context.Context, timeout int) error
stopSandbox(ctx context.Context) error // If wait is set, don't actively stop the sandbox:
// just perform cleanup.
stopSandbox(ctx context.Context, waitOnly bool) error
pauseSandbox(ctx context.Context) error pauseSandbox(ctx context.Context) error
saveSandbox() error saveSandbox() error
resumeSandbox(ctx context.Context) error resumeSandbox(ctx context.Context) error

View File

@ -41,7 +41,7 @@ func (m *mockHypervisor) startSandbox(ctx context.Context, timeout int) error {
return nil return nil
} }
func (m *mockHypervisor) stopSandbox(ctx context.Context) error { func (m *mockHypervisor) stopSandbox(ctx context.Context, waitOnly bool) error {
return nil return nil
} }

View File

@ -53,7 +53,7 @@ func TestMockHypervisorStartSandbox(t *testing.T) {
func TestMockHypervisorStopSandbox(t *testing.T) { func TestMockHypervisorStopSandbox(t *testing.T) {
var m *mockHypervisor var m *mockHypervisor
assert.NoError(t, m.stopSandbox(context.Background())) assert.NoError(t, m.stopSandbox(context.Background(), false))
} }
func TestMockHypervisorAddDevice(t *testing.T) { func TestMockHypervisorAddDevice(t *testing.T) {

View File

@ -122,6 +122,8 @@ const (
scsiControllerID = "scsi0" scsiControllerID = "scsi0"
rngID = "rng0" rngID = "rng0"
fallbackFileBackedMemDir = "/dev/shm" fallbackFileBackedMemDir = "/dev/shm"
qemuStopSandboxTimeoutSecs = 15
) )
// agnostic list of kernel parameters // agnostic list of kernel parameters
@ -703,7 +705,7 @@ func (q *qemu) setupVirtiofsd(ctx context.Context) (err error) {
q.Logger().Info("virtiofsd quits") q.Logger().Info("virtiofsd quits")
// Wait to release resources of virtiofsd process // Wait to release resources of virtiofsd process
cmd.Process.Wait() cmd.Process.Wait()
q.stopSandbox(ctx) q.stopSandbox(ctx, false)
}() }()
return err return err
} }
@ -933,7 +935,7 @@ func (q *qemu) waitSandbox(ctx context.Context, timeout int) error {
} }
// stopSandbox will stop the Sandbox's VM. // stopSandbox will stop the Sandbox's VM.
func (q *qemu) stopSandbox(ctx context.Context) error { func (q *qemu) stopSandbox(ctx context.Context, waitOnly bool) error {
span, _ := q.trace(ctx, "stopSandbox") span, _ := q.trace(ctx, "stopSandbox")
defer span.End() defer span.End()
@ -965,10 +967,24 @@ func (q *qemu) stopSandbox(ctx context.Context) error {
return err return err
} }
err := q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx) if waitOnly {
if err != nil { pids := q.getPids()
q.Logger().WithError(err).Error("Fail to execute qmp QUIT") if len(pids) == 0 {
return err return errors.New("cannot determine QEMU PID")
}
pid := pids[0]
err := utils.WaitLocalProcess(pid, qemuStopSandboxTimeoutSecs, syscall.Signal(0), q.Logger())
if err != nil {
return err
}
} else {
err := q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx)
if err != nil {
q.Logger().WithError(err).Error("Fail to execute qmp QUIT")
return err
}
} }
return nil return nil

View File

@ -1027,7 +1027,7 @@ func (s *Sandbox) startVM(ctx context.Context) (err error) {
defer func() { defer func() {
if err != nil { if err != nil {
s.hypervisor.stopSandbox(ctx) s.hypervisor.stopSandbox(ctx, false)
} }
}() }()
@ -1081,14 +1081,9 @@ func (s *Sandbox) stopVM(ctx context.Context) error {
s.Logger().WithError(err).WithField("sandboxid", s.id).Warning("Agent did not stop sandbox") s.Logger().WithError(err).WithField("sandboxid", s.id).Warning("Agent did not stop sandbox")
} }
if s.disableVMShutdown {
// Do not kill the VM - allow the agent to shut it down
// (only used to support static agent tracing).
return nil
}
s.Logger().Info("Stopping VM") s.Logger().Info("Stopping VM")
return s.hypervisor.stopSandbox(ctx)
return s.hypervisor.stopSandbox(ctx, s.disableVMShutdown)
} }
func (s *Sandbox) addContainer(c *Container) error { func (s *Sandbox) addContainer(c *Container) error {

View File

@ -137,7 +137,7 @@ func NewVM(ctx context.Context, config VMConfig) (*VM, error) {
defer func() { defer func() {
if err != nil { if err != nil {
virtLog.WithField("vm", id).WithError(err).Info("clean up vm") virtLog.WithField("vm", id).WithError(err).Info("clean up vm")
hypervisor.stopSandbox(ctx) hypervisor.stopSandbox(ctx, false)
} }
}() }()
@ -251,7 +251,7 @@ func (v *VM) Disconnect(ctx context.Context) error {
func (v *VM) Stop(ctx context.Context) error { func (v *VM) Stop(ctx context.Context) error {
v.logger().Info("stop vm") v.logger().Info("stop vm")
if err := v.hypervisor.stopSandbox(ctx); err != nil { if err := v.hypervisor.stopSandbox(ctx, false); err != nil {
return err return err
} }