Mirror of https://github.com/kata-containers/kata-containers.git
shutdown: Don't sever console watcher too early
Fixed the logic used to handle static agent tracing.

For a standard (untraced) hypervisor shutdown, the runtime kills the VM
process once the workload has finished. But if static agent tracing is
enabled, the agent running inside the VM is responsible for the shutdown.

The existing code handled this scenario but did not wait for the hypervisor
process to end. As a result, the console watcher thread was killed too
early. Although not a problem for an untraced system, if static agent
tracing was enabled, the logs from the hypervisor would be truncated,
missing the crucial final stages of the agent's shutdown sequence.

The fix required adding a new parameter to the `stopSandbox()` API which,
if set, requests the runtime hypervisor logic simply to wait for the
hypervisor process to exit rather than killing it.

Fixes: #1696.

Signed-off-by: James O. D. Hunt <james.o.hunt@intel.com>
This commit is contained in:
parent 51ab870091
commit 9256e590dc
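The behavioural change largely reduces to which signal is handed to `utils.WaitLocalProcess()`: on a normal shutdown the hypervisor process is actively stopped (SIGINT for ACRN, SIGTERM for Firecracker, a QMP quit for QEMU), whereas with `waitOnly` set the signal is `syscall.Signal(0)`, so the process is only polled until it exits by itself. The standalone Go sketch below illustrates that wait-versus-kill pattern under stated assumptions: the helper names (`stopProcess`, `pollAlive`) and the polling loop are illustrative only, not the runtime's actual `WaitLocalProcess` implementation.

// A minimal, self-contained sketch of the wait-versus-kill pattern
// (illustrative only; the real runtime calls utils.WaitLocalProcess).
package main

import (
	"fmt"
	"os/exec"
	"syscall"
	"time"
)

// pollAlive reports whether the process still exists by sending signal 0,
// which performs the existence check without delivering a signal.
func pollAlive(pid int) bool {
	return syscall.Kill(pid, syscall.Signal(0)) == nil
}

// stopProcess mirrors the new shutdown logic: with waitOnly set, the
// process is never signalled and we simply wait (up to timeout) for it to
// exit on its own; otherwise we ask it to terminate first.
func stopProcess(pid int, waitOnly bool, timeout time.Duration) error {
	if !waitOnly {
		if err := syscall.Kill(pid, syscall.SIGTERM); err != nil {
			return err
		}
	}

	deadline := time.Now().Add(timeout)
	for pollAlive(pid) {
		if time.Now().After(deadline) {
			return fmt.Errorf("process %d still running after %v", pid, timeout)
		}
		time.Sleep(50 * time.Millisecond)
	}

	return nil
}

func main() {
	cmd := exec.Command("sleep", "1")
	if err := cmd.Start(); err != nil {
		panic(err)
	}

	// Reap the child in the background so the signal-0 check stops
	// succeeding once it has exited (a zombie would still answer it).
	go cmd.Wait()

	// waitOnly=true: behave like the static-agent-tracing path and let
	// the process finish by itself.
	if err := stopProcess(cmd.Process.Pid, true, 5*time.Second); err != nil {
		panic(err)
	}

	fmt.Println("process exited on its own")
}

With waitOnly set to false the same helper would send SIGTERM before polling, which is what the untraced shutdown path continues to do.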
@@ -479,7 +479,7 @@ func (a *Acrn) waitSandbox(ctx context.Context, timeoutSecs int) error {
 }
 
 // stopSandbox will stop the Sandbox's VM.
-func (a *Acrn) stopSandbox(ctx context.Context) (err error) {
+func (a *Acrn) stopSandbox(ctx context.Context, waitOnly bool) (err error) {
 	span, _ := a.trace(ctx, "stopSandbox")
 	defer span.End()
 
@@ -513,6 +513,11 @@ func (a *Acrn) stopSandbox(ctx context.Context) (err error) {
 
 	shutdownSignal := syscall.SIGINT
 
+	if waitOnly {
+		// NOP
+		shutdownSignal = syscall.Signal(0)
+	}
+
 	return utils.WaitLocalProcess(pid, acrnStopSandboxTimeoutSecs, shutdownSignal, a.Logger())
 }
 
@@ -652,11 +652,11 @@ func (clh *cloudHypervisor) resumeSandbox(ctx context.Context) error {
 }
 
 // stopSandbox will stop the Sandbox's VM.
-func (clh *cloudHypervisor) stopSandbox(ctx context.Context) (err error) {
+func (clh *cloudHypervisor) stopSandbox(ctx context.Context, waitOnly bool) (err error) {
 	span, _ := clh.trace(ctx, "stopSandbox")
 	defer span.End()
 	clh.Logger().WithField("function", "stopSandbox").Info("Stop Sandbox")
-	return clh.terminate(ctx)
+	return clh.terminate(ctx, waitOnly)
 }
 
 func (clh *cloudHypervisor) fromGrpc(ctx context.Context, hypervisorConfig *HypervisorConfig, j []byte) error {
@@ -756,7 +756,7 @@ func (clh *cloudHypervisor) trace(parent context.Context, name string) (otelTrac
 	return span, ctx
 }
 
-func (clh *cloudHypervisor) terminate(ctx context.Context) (err error) {
+func (clh *cloudHypervisor) terminate(ctx context.Context, waitOnly bool) (err error) {
 	span, _ := clh.trace(ctx, "terminate")
 	defer span.End()
 
@@ -775,7 +775,7 @@ func (clh *cloudHypervisor) terminate(ctx context.Context) (err error) {
 
 	clh.Logger().Debug("Stopping Cloud Hypervisor")
 
-	if pidRunning {
+	if pidRunning && !waitOnly {
 		clhRunning, _ := clh.isClhRunning(clhStopSandboxTimeout)
 		if clhRunning {
 			ctx, cancel := context.WithTimeout(context.Background(), clhStopSandboxTimeout*time.Second)
@@ -409,7 +409,7 @@ func (fc *firecracker) fcInit(ctx context.Context, timeout int) error {
 	return nil
 }
 
-func (fc *firecracker) fcEnd(ctx context.Context) (err error) {
+func (fc *firecracker) fcEnd(ctx context.Context, waitOnly bool) (err error) {
 	span, _ := fc.trace(ctx, "fcEnd")
 	defer span.End()
 
@@ -427,6 +427,11 @@ func (fc *firecracker) fcEnd(ctx context.Context) (err error) {
 
 	shutdownSignal := syscall.SIGTERM
 
+	if waitOnly {
+		// NOP
+		shutdownSignal = syscall.Signal(0)
+	}
+
 	// Wait for the VM process to terminate
 	return utils.WaitLocalProcess(pid, fcStopSandboxTimeout, shutdownSignal, fc.Logger())
 }
@@ -760,7 +765,7 @@ func (fc *firecracker) startSandbox(ctx context.Context, timeout int) error {
 	var err error
 	defer func() {
 		if err != nil {
-			fc.fcEnd(ctx)
+			fc.fcEnd(ctx, false)
 		}
 	}()
 
@@ -853,11 +858,11 @@ func (fc *firecracker) cleanupJail(ctx context.Context) {
 }
 
 // stopSandbox will stop the Sandbox's VM.
-func (fc *firecracker) stopSandbox(ctx context.Context) (err error) {
+func (fc *firecracker) stopSandbox(ctx context.Context, waitOnly bool) (err error) {
 	span, _ := fc.trace(ctx, "stopSandbox")
 	defer span.End()
 
-	return fc.fcEnd(ctx)
+	return fc.fcEnd(ctx, waitOnly)
 }
 
 func (fc *firecracker) pauseSandbox(ctx context.Context) error {
@@ -787,7 +787,9 @@ func generateVMSocket(id string, vmStogarePath string) (interface{}, error) {
 type hypervisor interface {
 	createSandbox(ctx context.Context, id string, networkNS NetworkNamespace, hypervisorConfig *HypervisorConfig) error
 	startSandbox(ctx context.Context, timeout int) error
-	stopSandbox(ctx context.Context) error
+	// If wait is set, don't actively stop the sandbox:
+	// just perform cleanup.
+	stopSandbox(ctx context.Context, waitOnly bool) error
 	pauseSandbox(ctx context.Context) error
 	saveSandbox() error
 	resumeSandbox(ctx context.Context) error
@@ -41,7 +41,7 @@ func (m *mockHypervisor) startSandbox(ctx context.Context, timeout int) error {
 	return nil
 }
 
-func (m *mockHypervisor) stopSandbox(ctx context.Context) error {
+func (m *mockHypervisor) stopSandbox(ctx context.Context, waitOnly bool) error {
 	return nil
 }
 
@@ -53,7 +53,7 @@ func TestMockHypervisorStartSandbox(t *testing.T) {
 func TestMockHypervisorStopSandbox(t *testing.T) {
 	var m *mockHypervisor
 
-	assert.NoError(t, m.stopSandbox(context.Background()))
+	assert.NoError(t, m.stopSandbox(context.Background(), false))
 }
 
 func TestMockHypervisorAddDevice(t *testing.T) {
@@ -122,6 +122,8 @@ const (
 	scsiControllerID = "scsi0"
 	rngID = "rng0"
 	fallbackFileBackedMemDir = "/dev/shm"
+
+	qemuStopSandboxTimeoutSecs = 15
 )
 
 // agnostic list of kernel parameters
@@ -703,7 +705,7 @@ func (q *qemu) setupVirtiofsd(ctx context.Context) (err error) {
 		q.Logger().Info("virtiofsd quits")
 		// Wait to release resources of virtiofsd process
 		cmd.Process.Wait()
-		q.stopSandbox(ctx)
+		q.stopSandbox(ctx, false)
 	}()
 	return err
 }
@@ -933,7 +935,7 @@ func (q *qemu) waitSandbox(ctx context.Context, timeout int) error {
 }
 
 // stopSandbox will stop the Sandbox's VM.
-func (q *qemu) stopSandbox(ctx context.Context) error {
+func (q *qemu) stopSandbox(ctx context.Context, waitOnly bool) error {
 	span, _ := q.trace(ctx, "stopSandbox")
 	defer span.End()
 
@@ -965,10 +967,24 @@ func (q *qemu) stopSandbox(ctx context.Context) error {
 		return err
 	}
 
-	err := q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx)
-	if err != nil {
-		q.Logger().WithError(err).Error("Fail to execute qmp QUIT")
-		return err
+	if waitOnly {
+		pids := q.getPids()
+		if len(pids) == 0 {
+			return errors.New("cannot determine QEMU PID")
+		}
+
+		pid := pids[0]
+
+		err := utils.WaitLocalProcess(pid, qemuStopSandboxTimeoutSecs, syscall.Signal(0), q.Logger())
+		if err != nil {
+			return err
+		}
+	} else {
+		err := q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx)
+		if err != nil {
+			q.Logger().WithError(err).Error("Fail to execute qmp QUIT")
+			return err
+		}
 	}
 
 	return nil
@@ -1027,7 +1027,7 @@ func (s *Sandbox) startVM(ctx context.Context) (err error) {
 
 	defer func() {
 		if err != nil {
-			s.hypervisor.stopSandbox(ctx)
+			s.hypervisor.stopSandbox(ctx, false)
 		}
 	}()
 
@@ -1081,14 +1081,9 @@ func (s *Sandbox) stopVM(ctx context.Context) error {
 		s.Logger().WithError(err).WithField("sandboxid", s.id).Warning("Agent did not stop sandbox")
 	}
 
-	if s.disableVMShutdown {
-		// Do not kill the VM - allow the agent to shut it down
-		// (only used to support static agent tracing).
-		return nil
-	}
-
 	s.Logger().Info("Stopping VM")
-	return s.hypervisor.stopSandbox(ctx)
+
+	return s.hypervisor.stopSandbox(ctx, s.disableVMShutdown)
 }
 
 func (s *Sandbox) addContainer(c *Container) error {
@@ -137,7 +137,7 @@ func NewVM(ctx context.Context, config VMConfig) (*VM, error) {
 	defer func() {
 		if err != nil {
 			virtLog.WithField("vm", id).WithError(err).Info("clean up vm")
-			hypervisor.stopSandbox(ctx)
+			hypervisor.stopSandbox(ctx, false)
 		}
 	}()
 
@@ -251,7 +251,7 @@ func (v *VM) Disconnect(ctx context.Context) error {
 func (v *VM) Stop(ctx context.Context) error {
 	v.logger().Info("stop vm")
 
-	if err := v.hypervisor.stopSandbox(ctx); err != nil {
+	if err := v.hypervisor.stopSandbox(ctx, false); err != nil {
 		return err
 	}
 