From 2631b08ff109c433ea79bd8122e31cb0200ca483 Mon Sep 17 00:00:00 2001 From: Alexandru Matei Date: Thu, 10 Nov 2022 14:06:13 +0200 Subject: [PATCH 1/5] clh: don't try to stop clh multiple times Avoid executing StopVM concurrently when virtiofs dies as a result of clh being stopped in StopVM. Fixes: #5622 Signed-off-by: Alexandru Matei --- src/runtime/virtcontainers/clh.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index a0f08e8cb7..4ea297cbdc 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -24,6 +24,8 @@ import ( "regexp" "strconv" "strings" + "sync" + "sync/atomic" "syscall" "time" @@ -256,6 +258,8 @@ type cloudHypervisor struct { vmconfig chclient.VmConfig state CloudHypervisorState config HypervisorConfig + stopped int32 + mu sync.Mutex } var clhKernelParams = []Param{ @@ -1081,9 +1085,21 @@ func (clh *cloudHypervisor) ResumeVM(ctx context.Context) error { // StopVM will stop the Sandbox's VM. func (clh *cloudHypervisor) StopVM(ctx context.Context, waitOnly bool) (err error) { + clh.mu.Lock() + defer func() { + if err == nil { + atomic.StoreInt32(&clh.stopped, 1) + } + clh.mu.Unlock() + }() span, _ := katatrace.Trace(ctx, clh.Logger(), "StopVM", clhTracingTags, map[string]string{"sandbox_id": clh.id}) defer span.End() clh.Logger().WithField("function", "StopVM").Info("Stop Sandbox") + if atomic.LoadInt32(&clh.stopped) != 0 { + clh.Logger().Info("Already stopped") + return nil + } + return clh.terminate(ctx, waitOnly) } From 9ef68e0c7adc75e73e9a1b2827313c26c7551f51 Mon Sep 17 00:00:00 2001 From: Alexandru Matei Date: Thu, 10 Nov 2022 14:08:14 +0200 Subject: [PATCH 2/5] clh: fast exit from isClhRunning if the process was stopped Use atomic operations instead of acquiring a mutex in isClhRunning. This stops isClhRunning from generating a deadlock by trying to reacquire an already-acquired lock when called via StopVM->terminate. Signed-off-by: Alexandru Matei --- src/runtime/virtcontainers/clh.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index 4ea297cbdc..3319f7b912 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -1401,6 +1401,10 @@ func (clh *cloudHypervisor) isClhRunning(timeout uint) (bool, error) { pid := clh.state.PID + if atomic.LoadInt32(&clh.stopped) != 0 { + return false, nil + } + if err := syscall.Kill(pid, syscall.Signal(0)); err != nil { return false, nil } From 0e3ac66e761f8d41c91b5d07c985cedfa40790c0 Mon Sep 17 00:00:00 2001 From: Alexandru Matei Date: Tue, 8 Nov 2022 21:05:22 +0200 Subject: [PATCH 3/5] clh: return faster with dead clh process from isClhRunning Through proactively checking if Cloud Hypervisor process is dead, this patch provides a faster path for isClhRunning Fixes: #5623 Signed-off-by: Alexandru Matei --- src/runtime/virtcontainers/clh.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index 3319f7b912..090e074755 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -1405,16 +1405,16 @@ func (clh *cloudHypervisor) isClhRunning(timeout uint) (bool, error) { return false, nil } - if err := syscall.Kill(pid, syscall.Signal(0)); err != nil { - return false, nil - } - timeStart := time.Now() cl := clh.client() for { + err := syscall.Kill(pid, syscall.Signal(0)) + if err != nil { + return false, nil + } ctx, cancel := context.WithTimeout(context.Background(), clh.getClhAPITimeout()*time.Second) - defer cancel() - _, _, err := cl.VmmPingGet(ctx) + _, _, err = cl.VmmPingGet(ctx) + cancel() if err == nil { return true, nil } else { From 7e481f217987d8b6835449a2a6b4ec85ba704efa Mon Sep 17 00:00:00 2001 From: Alexandru Matei Date: Tue, 8 Nov 2022 21:14:53 +0200 Subject: [PATCH 4/5] qemu: set stopped only if StopVM is successful Fixes: #5624 Signed-off-by: Alexandru Matei --- src/runtime/virtcontainers/qemu.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index b7f9601c13..bea7fdeba8 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -969,7 +969,7 @@ func (q *qemu) waitVM(ctx context.Context, timeout int) error { } // StopVM will stop the Sandbox's VM. -func (q *qemu) StopVM(ctx context.Context, waitOnly bool) error { +func (q *qemu) StopVM(ctx context.Context, waitOnly bool) (err error) { q.mu.Lock() defer q.mu.Unlock() span, _ := katatrace.Trace(ctx, q.Logger(), "StopVM", qemuTracingTags, map[string]string{"sandbox_id": q.id}) @@ -983,7 +983,9 @@ func (q *qemu) StopVM(ctx context.Context, waitOnly bool) error { defer func() { q.cleanupVM() - q.stopped = true + if err == nil { + q.stopped = true + } }() if q.config.Debug && q.qemuConfig.LogFile != "" { From a04afab74d6216d8fb9de3c30b5aa77b9f3ddbf5 Mon Sep 17 00:00:00 2001 From: Alexandru Matei Date: Tue, 8 Nov 2022 21:17:41 +0200 Subject: [PATCH 5/5] qemu: early exit from Check if the process was stopped Fixes: #5625 Signed-off-by: Alexandru Matei --- src/runtime/virtcontainers/qemu.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index bea7fdeba8..f15f62d13c 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -21,6 +21,7 @@ import ( "strconv" "strings" "sync" + "sync/atomic" "syscall" "time" "unsafe" @@ -109,7 +110,7 @@ type qemu struct { nvdimmCount int - stopped bool + stopped int32 mu sync.Mutex } @@ -976,7 +977,7 @@ func (q *qemu) StopVM(ctx context.Context, waitOnly bool) (err error) { defer span.End() q.Logger().Info("Stopping Sandbox") - if q.stopped { + if atomic.LoadInt32(&q.stopped) != 0 { q.Logger().Info("Already stopped") return nil } @@ -984,7 +985,7 @@ func (q *qemu) StopVM(ctx context.Context, waitOnly bool) (err error) { defer func() { q.cleanupVM() if err == nil { - q.stopped = true + atomic.StoreInt32(&q.stopped, 1) } }() @@ -2570,7 +2571,7 @@ func (q *qemu) toGrpc(ctx context.Context) ([]byte, error) { func (q *qemu) Save() (s hv.HypervisorState) { // If QEMU isn't even running, there isn't any state to Save - if q.stopped { + if atomic.LoadInt32(&q.stopped) != 0 { return } @@ -2621,6 +2622,10 @@ func (q *qemu) Load(s hv.HypervisorState) { } func (q *qemu) Check() error { + if atomic.LoadInt32(&q.stopped) != 0 { + return fmt.Errorf("qemu is not running") + } + q.memoryDumpFlag.Lock() defer q.memoryDumpFlag.Unlock()