monitor: Fix monitor race condition doing hypervisor.check()

The monitor thread checks whether the agent and the VMM are alive every
second in a blocking thread. The Cloud Hypervisor API server is
single-threaded: if the monitor calls `check()` while a slow request
is still in progress, the monitor's `check()` call will time out, and
the monitor thread will then stop all shim-v2 execution.

This commit modifies the monitor thread to make it check the status of
the hypervisor every 5 seconds. Additionally, the `check()` method from
cloud-hypervisor will use the method `clh.isClhRunning(timeout)` with a
10-second timeout. The monitor function itself imposes no timeout, so
even if `hypervisor.check()` takes more than 10 seconds, the
`isClhRunning` method handles VmmPing errors and retries until the
timeout is reached.

Increasing the check interval to 5 seconds should not affect any functionality,
but it will reduce the overhead of polling the hypervisor.

Fixes: #2777

Signed-off-by: Carlos Venegas <jose.carlos.venegas.munoz@intel.com>
This commit is contained in:
Carlos Venegas 2021-10-25 20:46:00 +00:00 committed by Carlos Venegas
parent 3d0fe433c6
commit 55412044df
2 changed files with 14 additions and 8 deletions

View File

@ -761,12 +761,18 @@ func (clh *cloudHypervisor) Load(s persistapi.HypervisorState) {
clh.state.apiSocket = s.APISocket
}
func (clh *cloudHypervisor) Check() error {
cl := clh.client()
ctx, cancel := context.WithTimeout(context.Background(), clhAPITimeout*time.Second)
defer cancel()
// Check is the implementation of Check from the Hypervisor interface.
// Check if the VMM API is working.
_, _, err := cl.VmmPingGet(ctx)
// Check returns nil when isClhRunning reports the VMM as running, and an
// error describing why it is considered not running otherwise.
func (clh *cloudHypervisor) Check() error {
	// Use a long timeout to check if the VMM is running:
	// Check is used by the monitor thread (a background thread). If the
	// monitor thread calls Check() during the container boot, it will take
	// longer than usual, especially if there is a hot-plug request in progress.
	running, err := clh.isClhRunning(10)
	if running {
		return err
	}

	// isClhRunning may report "not running" without a cause (for example
	// when the process itself is gone); avoid formatting a nil error into
	// the message in that case.
	if err == nil {
		return fmt.Errorf("clh is not running")
	}

	// Wrap the cause so callers can still inspect it with errors.Is/As.
	return fmt.Errorf("clh is not running: %w", err)
}
@ -1034,8 +1040,6 @@ func (clh *cloudHypervisor) isClhRunning(timeout uint) (bool, error) {
pid := clh.state.PID
// Check if clh process is running, in case it is not, let's
// return from here.
if err := syscall.Kill(pid, syscall.Signal(0)); err != nil {
return false, nil
}
@ -1048,6 +1052,8 @@ func (clh *cloudHypervisor) isClhRunning(timeout uint) (bool, error) {
_, _, err := cl.VmmPingGet(ctx)
if err == nil {
return true, nil
} else {
clh.Logger().WithError(err).Warning("clh.VmmPingGet API call failed")
}
if time.Since(timeStart).Seconds() > float64(timeout) {

View File

@ -14,7 +14,7 @@ import (
)
const (
defaultCheckInterval = 1 * time.Second
defaultCheckInterval = 5 * time.Second
watcherChannelSize = 128
)