From e9ba0c11d0e29fcc4777694d22bdadf401315450 Mon Sep 17 00:00:00 2001 From: Alexandru Matei Date: Mon, 5 Dec 2022 11:42:47 +0200 Subject: [PATCH] runtime: use exponential backoff for process wait Initial wait period between checks is 1ms, and the next ones are min(wait_period*5, 50ms) Signed-off-by: Alexandru Matei --- src/runtime/virtcontainers/utils/utils.go | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go index 049fb0b362..9c0526da6d 100644 --- a/src/runtime/virtcontainers/utils/utils.go +++ b/src/runtime/virtcontainers/utils/utils.go @@ -25,6 +25,8 @@ const cpBinaryName = "cp" const fileMode0755 = os.FileMode(0755) +const maxWaitDelay = 50 * time.Millisecond + // The DefaultRateLimiterRefillTime is used for calculating the rate at // which a TokenBucket is replinished, in cases where a RateLimiter is // applied to either network or disk I/O. @@ -307,11 +309,18 @@ func ConvertAddressFamily(family int32) pbTypes.IPFamily { } func waitProcessUsingWaitLoop(pid int, timeoutSecs uint, logger *logrus.Entry) bool { - secs := time.Duration(timeoutSecs) - timeout := time.After(secs * time.Second) + secs := time.Duration(timeoutSecs) * time.Second + timeout := time.After(secs) + delay := 1 * time.Millisecond for { - // Check if the process is running periodically to avoid a busy loop + // Wait4 is used to reap and check that a child terminated. + // Without the Wait4 call, Kill(0) for a child will always exit without + // error because the process isn't reaped. + // Wait4 returns ECHILD error for non-child processes. Kill(0) is meant + // to address this case, once the process is reaped by the init process, + // the call will return ESRCH error. + // "A watched pot never boils" and an unwaited-for process never appears to die! 
waitedPid, err := syscall.Wait4(pid, nil, syscall.WNOHANG, nil) @@ -324,7 +333,12 @@ func waitProcessUsingWaitLoop(pid int, timeoutSecs uint, logger *logrus.Entry) b } select { - case <-time.After(50 * time.Millisecond): + case <-time.After(delay): + delay = delay * 5 + + if delay > maxWaitDelay { + delay = maxWaitDelay + } case <-timeout: logger.Warnf("process %v still running after waiting %ds", pid, timeoutSecs) return true