mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-04-30 20:54:26 +00:00
monitor: Fix monitor race condition doing hypervisor.check()
The thread monitor will check if the agent and the VMM are alive every second in a blocking thread. The Cloud hypervisor API server is single-threaded, if the monitor does a `check()`, while a slow request is still in progress, the monitor check() method will timeout. The monitor thread will stop all the shim-v2 execution. This commit modifies the monitor thread to make it check the status of the hypervisor after 5 seconds. Additionally, the `check()` method from cloud-hypervisor will use the method `clh.isClhRunning(timeout)` with a 10 seconds timeout. The monitor function does no timeout, so even if `hypervisor.check()` takes more 10 seconds, the isClhRunning method handles errors doing a VmmPing and retry in case of errors until the timeout is reached. Reduce the time to the next check to 5 should not affect any functionality, but it will reduce the overhead polling the hypervisor. Fixes: #2777 Signed-off-by: Carlos Venegas <jose.carlos.venegas.munoz@intel.com>
This commit is contained in:
parent
3d0fe433c6
commit
55412044df
@ -761,12 +761,18 @@ func (clh *cloudHypervisor) Load(s persistapi.HypervisorState) {
|
|||||||
clh.state.apiSocket = s.APISocket
|
clh.state.apiSocket = s.APISocket
|
||||||
}
|
}
|
||||||
|
|
||||||
func (clh *cloudHypervisor) Check() error {
|
// Check is the implementation of Check from the Hypervisor interface.
|
||||||
cl := clh.client()
|
// Check if the VMM API is working.
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), clhAPITimeout*time.Second)
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
_, _, err := cl.VmmPingGet(ctx)
|
func (clh *cloudHypervisor) Check() error {
|
||||||
|
// Use a long timeout to check if the VMM is running:
|
||||||
|
// Check is used by the monitor thread(a background thread). If the
|
||||||
|
// monitor thread calls Check() during the Container boot, it will take
|
||||||
|
// longer than usual specially if there is a hot-plug request in progress.
|
||||||
|
running, err := clh.isClhRunning(10)
|
||||||
|
if !running {
|
||||||
|
return fmt.Errorf("clh is not running: %s", err)
|
||||||
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1034,8 +1040,6 @@ func (clh *cloudHypervisor) isClhRunning(timeout uint) (bool, error) {
|
|||||||
|
|
||||||
pid := clh.state.PID
|
pid := clh.state.PID
|
||||||
|
|
||||||
// Check if clh process is running, in case it is not, let's
|
|
||||||
// return from here.
|
|
||||||
if err := syscall.Kill(pid, syscall.Signal(0)); err != nil {
|
if err := syscall.Kill(pid, syscall.Signal(0)); err != nil {
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
@ -1048,6 +1052,8 @@ func (clh *cloudHypervisor) isClhRunning(timeout uint) (bool, error) {
|
|||||||
_, _, err := cl.VmmPingGet(ctx)
|
_, _, err := cl.VmmPingGet(ctx)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
return true, nil
|
return true, nil
|
||||||
|
} else {
|
||||||
|
clh.Logger().WithError(err).Warning("clh.VmmPingGet API call failed")
|
||||||
}
|
}
|
||||||
|
|
||||||
if time.Since(timeStart).Seconds() > float64(timeout) {
|
if time.Since(timeStart).Seconds() > float64(timeout) {
|
||||||
|
@ -14,7 +14,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
defaultCheckInterval = 1 * time.Second
|
defaultCheckInterval = 5 * time.Second
|
||||||
watcherChannelSize = 128
|
watcherChannelSize = 128
|
||||||
)
|
)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user