shutdown: Don't sever console watcher too early

Fixed logic used to handle static agent tracing.

For a standard (untraced) hypervisor shutdown, the runtime kills the VM
process once the workload has finished. But if static agent tracing is
enabled, the agent running inside the VM is responsible for the
shutdown. The existing code handled this scenario but did not wait for
the hypervisor process to end. The outcome of this being that the
console watcher thread was killed too early.

Although not a problem for an untraced system, if static agent tracing
was enabled, the logs from the hypervisor would be truncated, missing the
crucial final stages of the agents shutdown sequence.

The fix necessitated adding a new parameter to the `stopSandbox()` API,
which if true requests the runtime hypervisor logic simply to wait for
the hypervisor process to exit rather than killing it.

Fixes: #1696.

Signed-off-by: James O. D. Hunt <james.o.hunt@intel.com>
This commit is contained in:
James O. D. Hunt 2021-04-15 12:01:19 +01:00
parent 51ab870091
commit 9256e590dc
9 changed files with 51 additions and 28 deletions

View File

@ -479,7 +479,7 @@ func (a *Acrn) waitSandbox(ctx context.Context, timeoutSecs int) error {
}
// stopSandbox will stop the Sandbox's VM.
func (a *Acrn) stopSandbox(ctx context.Context) (err error) {
func (a *Acrn) stopSandbox(ctx context.Context, waitOnly bool) (err error) {
span, _ := a.trace(ctx, "stopSandbox")
defer span.End()
@ -513,6 +513,11 @@ func (a *Acrn) stopSandbox(ctx context.Context) (err error) {
shutdownSignal := syscall.SIGINT
if waitOnly {
// NOP
shutdownSignal = syscall.Signal(0)
}
return utils.WaitLocalProcess(pid, acrnStopSandboxTimeoutSecs, shutdownSignal, a.Logger())
}

View File

@ -652,11 +652,11 @@ func (clh *cloudHypervisor) resumeSandbox(ctx context.Context) error {
}
// stopSandbox will stop the Sandbox's VM.
func (clh *cloudHypervisor) stopSandbox(ctx context.Context) (err error) {
func (clh *cloudHypervisor) stopSandbox(ctx context.Context, waitOnly bool) (err error) {
span, _ := clh.trace(ctx, "stopSandbox")
defer span.End()
clh.Logger().WithField("function", "stopSandbox").Info("Stop Sandbox")
return clh.terminate(ctx)
return clh.terminate(ctx, waitOnly)
}
func (clh *cloudHypervisor) fromGrpc(ctx context.Context, hypervisorConfig *HypervisorConfig, j []byte) error {
@ -756,7 +756,7 @@ func (clh *cloudHypervisor) trace(parent context.Context, name string) (otelTrac
return span, ctx
}
func (clh *cloudHypervisor) terminate(ctx context.Context) (err error) {
func (clh *cloudHypervisor) terminate(ctx context.Context, waitOnly bool) (err error) {
span, _ := clh.trace(ctx, "terminate")
defer span.End()
@ -775,7 +775,7 @@ func (clh *cloudHypervisor) terminate(ctx context.Context) (err error) {
clh.Logger().Debug("Stopping Cloud Hypervisor")
if pidRunning {
if pidRunning && !waitOnly {
clhRunning, _ := clh.isClhRunning(clhStopSandboxTimeout)
if clhRunning {
ctx, cancel := context.WithTimeout(context.Background(), clhStopSandboxTimeout*time.Second)

View File

@ -409,7 +409,7 @@ func (fc *firecracker) fcInit(ctx context.Context, timeout int) error {
return nil
}
func (fc *firecracker) fcEnd(ctx context.Context) (err error) {
func (fc *firecracker) fcEnd(ctx context.Context, waitOnly bool) (err error) {
span, _ := fc.trace(ctx, "fcEnd")
defer span.End()
@ -427,6 +427,11 @@ func (fc *firecracker) fcEnd(ctx context.Context) (err error) {
shutdownSignal := syscall.SIGTERM
if waitOnly {
// NOP
shutdownSignal = syscall.Signal(0)
}
// Wait for the VM process to terminate
return utils.WaitLocalProcess(pid, fcStopSandboxTimeout, shutdownSignal, fc.Logger())
}
@ -760,7 +765,7 @@ func (fc *firecracker) startSandbox(ctx context.Context, timeout int) error {
var err error
defer func() {
if err != nil {
fc.fcEnd(ctx)
fc.fcEnd(ctx, false)
}
}()
@ -853,11 +858,11 @@ func (fc *firecracker) cleanupJail(ctx context.Context) {
}
// stopSandbox will stop the Sandbox's VM.
func (fc *firecracker) stopSandbox(ctx context.Context) (err error) {
func (fc *firecracker) stopSandbox(ctx context.Context, waitOnly bool) (err error) {
span, _ := fc.trace(ctx, "stopSandbox")
defer span.End()
return fc.fcEnd(ctx)
return fc.fcEnd(ctx, waitOnly)
}
func (fc *firecracker) pauseSandbox(ctx context.Context) error {

View File

@ -787,7 +787,9 @@ func generateVMSocket(id string, vmStogarePath string) (interface{}, error) {
type hypervisor interface {
createSandbox(ctx context.Context, id string, networkNS NetworkNamespace, hypervisorConfig *HypervisorConfig) error
startSandbox(ctx context.Context, timeout int) error
stopSandbox(ctx context.Context) error
// If wait is set, don't actively stop the sandbox:
// just perform cleanup.
stopSandbox(ctx context.Context, waitOnly bool) error
pauseSandbox(ctx context.Context) error
saveSandbox() error
resumeSandbox(ctx context.Context) error

View File

@ -41,7 +41,7 @@ func (m *mockHypervisor) startSandbox(ctx context.Context, timeout int) error {
return nil
}
func (m *mockHypervisor) stopSandbox(ctx context.Context) error {
func (m *mockHypervisor) stopSandbox(ctx context.Context, waitOnly bool) error {
return nil
}

View File

@ -53,7 +53,7 @@ func TestMockHypervisorStartSandbox(t *testing.T) {
func TestMockHypervisorStopSandbox(t *testing.T) {
var m *mockHypervisor
assert.NoError(t, m.stopSandbox(context.Background()))
assert.NoError(t, m.stopSandbox(context.Background(), false))
}
func TestMockHypervisorAddDevice(t *testing.T) {

View File

@ -122,6 +122,8 @@ const (
scsiControllerID = "scsi0"
rngID = "rng0"
fallbackFileBackedMemDir = "/dev/shm"
qemuStopSandboxTimeoutSecs = 15
)
// agnostic list of kernel parameters
@ -703,7 +705,7 @@ func (q *qemu) setupVirtiofsd(ctx context.Context) (err error) {
q.Logger().Info("virtiofsd quits")
// Wait to release resources of virtiofsd process
cmd.Process.Wait()
q.stopSandbox(ctx)
q.stopSandbox(ctx, false)
}()
return err
}
@ -933,7 +935,7 @@ func (q *qemu) waitSandbox(ctx context.Context, timeout int) error {
}
// stopSandbox will stop the Sandbox's VM.
func (q *qemu) stopSandbox(ctx context.Context) error {
func (q *qemu) stopSandbox(ctx context.Context, waitOnly bool) error {
span, _ := q.trace(ctx, "stopSandbox")
defer span.End()
@ -965,10 +967,24 @@ func (q *qemu) stopSandbox(ctx context.Context) error {
return err
}
err := q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx)
if err != nil {
q.Logger().WithError(err).Error("Fail to execute qmp QUIT")
return err
if waitOnly {
pids := q.getPids()
if len(pids) == 0 {
return errors.New("cannot determine QEMU PID")
}
pid := pids[0]
err := utils.WaitLocalProcess(pid, qemuStopSandboxTimeoutSecs, syscall.Signal(0), q.Logger())
if err != nil {
return err
}
} else {
err := q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx)
if err != nil {
q.Logger().WithError(err).Error("Fail to execute qmp QUIT")
return err
}
}
return nil

View File

@ -1027,7 +1027,7 @@ func (s *Sandbox) startVM(ctx context.Context) (err error) {
defer func() {
if err != nil {
s.hypervisor.stopSandbox(ctx)
s.hypervisor.stopSandbox(ctx, false)
}
}()
@ -1081,14 +1081,9 @@ func (s *Sandbox) stopVM(ctx context.Context) error {
s.Logger().WithError(err).WithField("sandboxid", s.id).Warning("Agent did not stop sandbox")
}
if s.disableVMShutdown {
// Do not kill the VM - allow the agent to shut it down
// (only used to support static agent tracing).
return nil
}
s.Logger().Info("Stopping VM")
return s.hypervisor.stopSandbox(ctx)
return s.hypervisor.stopSandbox(ctx, s.disableVMShutdown)
}
func (s *Sandbox) addContainer(c *Container) error {

View File

@ -137,7 +137,7 @@ func NewVM(ctx context.Context, config VMConfig) (*VM, error) {
defer func() {
if err != nil {
virtLog.WithField("vm", id).WithError(err).Info("clean up vm")
hypervisor.stopSandbox(ctx)
hypervisor.stopSandbox(ctx, false)
}
}()
@ -251,7 +251,7 @@ func (v *VM) Disconnect(ctx context.Context) error {
func (v *VM) Stop(ctx context.Context) error {
v.logger().Info("stop vm")
if err := v.hypervisor.stopSandbox(ctx); err != nil {
if err := v.hypervisor.stopSandbox(ctx, false); err != nil {
return err
}