mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-04-29 12:14:48 +00:00
monitor: Fix monitor race condition doing hypervisor.check()
The monitor thread checks every second, in a blocking thread, whether the agent and the VMM are alive. The Cloud Hypervisor API server is single-threaded: if the monitor calls `check()` while a slow request is still in progress, the monitor's `check()` call will time out. When that happens, the monitor thread stops all shim-v2 execution. This commit modifies the monitor thread so that it checks the status of the hypervisor every 5 seconds. Additionally, the `check()` method for cloud-hypervisor now uses `clh.isClhRunning(timeout)` with a 10-second timeout. The monitor function itself does not enforce a timeout, so even if `hypervisor.check()` takes more than 10 seconds, the `isClhRunning` method handles errors from VmmPing by retrying until the timeout is reached. Increasing the interval between checks from 1 to 5 seconds should not affect any functionality, but it will reduce the overhead of polling the hypervisor. Fixes: #2777 Signed-off-by: Carlos Venegas <jose.carlos.venegas.munoz@intel.com>
This commit is contained in:
parent
3d0fe433c6
commit
55412044df
@ -761,12 +761,18 @@ func (clh *cloudHypervisor) Load(s persistapi.HypervisorState) {
|
||||
clh.state.apiSocket = s.APISocket
|
||||
}
|
||||
|
||||
func (clh *cloudHypervisor) Check() error {
|
||||
cl := clh.client()
|
||||
ctx, cancel := context.WithTimeout(context.Background(), clhAPITimeout*time.Second)
|
||||
defer cancel()
|
||||
// Check is the implementation of Check from the Hypervisor interface.
|
||||
// Check if the VMM API is working.
|
||||
|
||||
_, _, err := cl.VmmPingGet(ctx)
|
||||
func (clh *cloudHypervisor) Check() error {
|
||||
// Use a long timeout to check if the VMM is running:
|
||||
// Check is used by the monitor thread (a background thread). If the
|
||||
// monitor thread calls Check() during the Container boot, it will take
|
||||
// longer than usual, especially if there is a hot-plug request in progress.
|
||||
running, err := clh.isClhRunning(10)
|
||||
if !running {
|
||||
return fmt.Errorf("clh is not running: %s", err)
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
@ -1034,8 +1040,6 @@ func (clh *cloudHypervisor) isClhRunning(timeout uint) (bool, error) {
|
||||
|
||||
pid := clh.state.PID
|
||||
|
||||
// Check if clh process is running, in case it is not, let's
|
||||
// return from here.
|
||||
if err := syscall.Kill(pid, syscall.Signal(0)); err != nil {
|
||||
return false, nil
|
||||
}
|
||||
@ -1048,6 +1052,8 @@ func (clh *cloudHypervisor) isClhRunning(timeout uint) (bool, error) {
|
||||
_, _, err := cl.VmmPingGet(ctx)
|
||||
if err == nil {
|
||||
return true, nil
|
||||
} else {
|
||||
clh.Logger().WithError(err).Warning("clh.VmmPingGet API call failed")
|
||||
}
|
||||
|
||||
if time.Since(timeStart).Seconds() > float64(timeout) {
|
||||
|
@ -14,7 +14,7 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
defaultCheckInterval = 1 * time.Second
|
||||
defaultCheckInterval = 5 * time.Second
|
||||
watcherChannelSize = 128
|
||||
)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user