Merge pull request #5597 from UiPath/fix-clh-wait

clh: avoid race condition when stopping clh
This commit is contained in:
Greg Kurz 2022-11-16 07:39:27 +01:00 committed by GitHub
commit 1bbcb413c9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 35 additions and 8 deletions

View File

@ -24,6 +24,8 @@ import (
"regexp"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
@ -256,6 +258,8 @@ type cloudHypervisor struct {
vmconfig chclient.VmConfig
state CloudHypervisorState
config HypervisorConfig
stopped int32
mu sync.Mutex
}
var clhKernelParams = []Param{
@ -1081,9 +1085,21 @@ func (clh *cloudHypervisor) ResumeVM(ctx context.Context) error {
// StopVM will stop the Sandbox's VM.
func (clh *cloudHypervisor) StopVM(ctx context.Context, waitOnly bool) (err error) {
clh.mu.Lock()
defer func() {
if err == nil {
atomic.StoreInt32(&clh.stopped, 1)
}
clh.mu.Unlock()
}()
span, _ := katatrace.Trace(ctx, clh.Logger(), "StopVM", clhTracingTags, map[string]string{"sandbox_id": clh.id})
defer span.End()
clh.Logger().WithField("function", "StopVM").Info("Stop Sandbox")
if atomic.LoadInt32(&clh.stopped) != 0 {
clh.Logger().Info("Already stopped")
return nil
}
return clh.terminate(ctx, waitOnly)
}
@ -1385,16 +1401,20 @@ func (clh *cloudHypervisor) isClhRunning(timeout uint) (bool, error) {
pid := clh.state.PID
if err := syscall.Kill(pid, syscall.Signal(0)); err != nil {
if atomic.LoadInt32(&clh.stopped) != 0 {
return false, nil
}
timeStart := time.Now()
cl := clh.client()
for {
err := syscall.Kill(pid, syscall.Signal(0))
if err != nil {
return false, nil
}
ctx, cancel := context.WithTimeout(context.Background(), clh.getClhAPITimeout()*time.Second)
defer cancel()
_, _, err := cl.VmmPingGet(ctx)
_, _, err = cl.VmmPingGet(ctx)
cancel()
if err == nil {
return true, nil
} else {

View File

@ -21,6 +21,7 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"unsafe"
@ -109,7 +110,7 @@ type qemu struct {
nvdimmCount int
stopped bool
stopped int32
mu sync.Mutex
}
@ -969,21 +970,23 @@ func (q *qemu) waitVM(ctx context.Context, timeout int) error {
}
// StopVM will stop the Sandbox's VM.
func (q *qemu) StopVM(ctx context.Context, waitOnly bool) error {
func (q *qemu) StopVM(ctx context.Context, waitOnly bool) (err error) {
q.mu.Lock()
defer q.mu.Unlock()
span, _ := katatrace.Trace(ctx, q.Logger(), "StopVM", qemuTracingTags, map[string]string{"sandbox_id": q.id})
defer span.End()
q.Logger().Info("Stopping Sandbox")
if q.stopped {
if atomic.LoadInt32(&q.stopped) != 0 {
q.Logger().Info("Already stopped")
return nil
}
defer func() {
q.cleanupVM()
q.stopped = true
if err == nil {
atomic.StoreInt32(&q.stopped, 1)
}
}()
if q.config.Debug && q.qemuConfig.LogFile != "" {
@ -2568,7 +2571,7 @@ func (q *qemu) toGrpc(ctx context.Context) ([]byte, error) {
func (q *qemu) Save() (s hv.HypervisorState) {
// If QEMU isn't even running, there isn't any state to Save
if q.stopped {
if atomic.LoadInt32(&q.stopped) != 0 {
return
}
@ -2619,6 +2622,10 @@ func (q *qemu) Load(s hv.HypervisorState) {
}
func (q *qemu) Check() error {
if atomic.LoadInt32(&q.stopped) != 0 {
return fmt.Errorf("qemu is not running")
}
q.memoryDumpFlag.Lock()
defer q.memoryDumpFlag.Unlock()