From 55412044df6ecc50259665be2ca16a754cb9a8e7 Mon Sep 17 00:00:00 2001 From: Carlos Venegas Date: Mon, 25 Oct 2021 20:46:00 +0000 Subject: [PATCH] monitor: Fix monitor race condition doing hypervisor.check() The thread monitor will check if the agent and the VMM are alive every second in a blocking thread. The Cloud hypervisor API server is single-threaded, if the monitor does a `check()`, while a slow request is still in progress, the monitor check() method will timeout. The monitor thread will stop all the shim-v2 execution. This commit modifies the monitor thread to make it check the status of the hypervisor after 5 seconds. Additionally, the `check()` method from cloud-hypervisor will use the method `clh.isClhRunning(timeout)` with a 10 seconds timeout. The monitor function does no timeout, so even if `hypervisor.check()` takes more 10 seconds, the isClhRunning method handles errors doing a VmmPing and retry in case of errors until the timeout is reached. Reduce the time to the next check to 5 should not affect any functionality, but it will reduce the overhead polling the hypervisor. Fixes: #2777 Signed-off-by: Carlos Venegas --- src/runtime/virtcontainers/clh.go | 20 +++++++++++++------- src/runtime/virtcontainers/monitor.go | 2 +- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index 591efa5aa0..279e8be386 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -761,12 +761,18 @@ func (clh *cloudHypervisor) Load(s persistapi.HypervisorState) { clh.state.apiSocket = s.APISocket } -func (clh *cloudHypervisor) Check() error { - cl := clh.client() - ctx, cancel := context.WithTimeout(context.Background(), clhAPITimeout*time.Second) - defer cancel() +// Check is the implementation of Check from the Hypervisor interface. +// Check if the VMM API is working. - _, _, err := cl.VmmPingGet(ctx) +func (clh *cloudHypervisor) Check() error { + // Use a long timeout to check if the VMM is running: + // Check is used by the monitor thread(a background thread). If the + // monitor thread calls Check() during the Container boot, it will take + // longer than usual specially if there is a hot-plug request in progress. + running, err := clh.isClhRunning(10) + if !running { + return fmt.Errorf("clh is not running: %s", err) + } return err } @@ -1034,8 +1040,6 @@ func (clh *cloudHypervisor) isClhRunning(timeout uint) (bool, error) { pid := clh.state.PID - // Check if clh process is running, in case it is not, let's - // return from here. if err := syscall.Kill(pid, syscall.Signal(0)); err != nil { return false, nil } @@ -1048,6 +1052,8 @@ func (clh *cloudHypervisor) isClhRunning(timeout uint) (bool, error) { _, _, err := cl.VmmPingGet(ctx) if err == nil { return true, nil + } else { + clh.Logger().WithError(err).Warning("clh.VmmPingGet API call failed") } if time.Since(timeStart).Seconds() > float64(timeout) { diff --git a/src/runtime/virtcontainers/monitor.go b/src/runtime/virtcontainers/monitor.go index c711f95568..ea56dac048 100644 --- a/src/runtime/virtcontainers/monitor.go +++ b/src/runtime/virtcontainers/monitor.go @@ -14,7 +14,7 @@ import ( ) const ( - defaultCheckInterval = 1 * time.Second + defaultCheckInterval = 5 * time.Second watcherChannelSize = 128 )