Merge pull request #2778 from jcvenegas/clh-race-condition-check

clh: Fix race condition that prevents pods from starting
This commit is contained in:
Carlos Venegas 2021-11-16 14:15:06 -06:00 committed by GitHub
commit 15b5d22e81
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 14 additions and 8 deletions

View File

@ -761,12 +761,18 @@ func (clh *cloudHypervisor) Load(s persistapi.HypervisorState) {
clh.state.apiSocket = s.APISocket
}
func (clh *cloudHypervisor) Check() error {
cl := clh.client()
ctx, cancel := context.WithTimeout(context.Background(), clhAPITimeout*time.Second)
defer cancel()
// Check is the implementation of Check from the Hypervisor interface.
// Check if the VMM API is working.
_, _, err := cl.VmmPingGet(ctx)
func (clh *cloudHypervisor) Check() error {
// Use a long timeout to check if the VMM is running:
// Check is used by the monitor thread (a background thread). If the
// monitor thread calls Check() during the Container boot, it will take
// longer than usual, especially if there is a hot-plug request in progress.
running, err := clh.isClhRunning(10)
if !running {
return fmt.Errorf("clh is not running: %s", err)
}
return err
}
@ -1034,8 +1040,6 @@ func (clh *cloudHypervisor) isClhRunning(timeout uint) (bool, error) {
pid := clh.state.PID
// Check if clh process is running, in case it is not, let's
// return from here.
if err := syscall.Kill(pid, syscall.Signal(0)); err != nil {
return false, nil
}
@ -1048,6 +1052,8 @@ func (clh *cloudHypervisor) isClhRunning(timeout uint) (bool, error) {
_, _, err := cl.VmmPingGet(ctx)
if err == nil {
return true, nil
} else {
clh.Logger().WithError(err).Warning("clh.VmmPingGet API call failed")
}
if time.Since(timeStart).Seconds() > float64(timeout) {

View File

@ -14,7 +14,7 @@ import (
)
const (
defaultCheckInterval = 1 * time.Second
defaultCheckInterval = 5 * time.Second
watcherChannelSize = 128
)