From 92ebe61fea0d1035914538e24add0cada0fe5c32 Mon Sep 17 00:00:00 2001 From: Alexandru Matei Date: Mon, 21 Nov 2022 11:53:27 +0200 Subject: [PATCH 1/4] runtime: reap force killed processes reap child processes after sending SIGKILL Fixes #5739 Signed-off-by: Alexandru Matei --- src/runtime/virtcontainers/utils/utils.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go index 8b05bfc2fc..6197f39b77 100644 --- a/src/runtime/virtcontainers/utils/utils.go +++ b/src/runtime/virtcontainers/utils/utils.go @@ -375,8 +375,19 @@ outer: if pidRunning { // Force process to die if err = syscall.Kill(pid, syscall.SIGKILL); err != nil { + if err == syscall.ESRCH { + logger.WithField("pid", pid).Warnf("process already finished") + return nil + } return fmt.Errorf("Failed to stop process %v: %s", pid, err) } + + for { + _, err := syscall.Wait4(pid, nil, 0, nil) + if err != syscall.EINTR { + break + } + } } return nil From 71491a69c3dae5d1fa9f49a4902ef7c90d5788d3 Mon Sep 17 00:00:00 2001 From: Alexandru Matei Date: Mon, 21 Nov 2022 11:58:49 +0200 Subject: [PATCH 2/4] runtime: move process wait logic to another function extract process wait logic to another function Signed-off-by: Alexandru Matei --- src/runtime/virtcontainers/utils/utils.go | 68 +++++++++++------------ 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go index 6197f39b77..049fb0b362 100644 --- a/src/runtime/virtcontainers/utils/utils.go +++ b/src/runtime/virtcontainers/utils/utils.go @@ -306,6 +306,36 @@ func ConvertAddressFamily(family int32) pbTypes.IPFamily { } } +func waitProcessUsingWaitLoop(pid int, timeoutSecs uint, logger *logrus.Entry) bool { + secs := time.Duration(timeoutSecs) + timeout := time.After(secs * time.Second) + + for { + // Check if the process is running periodically to avoid a busy loop + // "A watched pot never boils" and an unwaited-for process never appears to die! + waitedPid, err := syscall.Wait4(pid, nil, syscall.WNOHANG, nil) + + if waitedPid == pid && err == nil { + return false + } + + if err := syscall.Kill(pid, syscall.Signal(0)); err != nil { + return false + } + + select { + case <-time.After(50 * time.Millisecond): + case <-timeout: + logger.Warnf("process %v still running after waiting %ds", pid, timeoutSecs) + return true + } + } +} + +func waitForProcessCompletion(pid int, timeoutSecs uint, logger *logrus.Entry) bool { + return waitProcessUsingWaitLoop(pid, timeoutSecs, logger) +} + // WaitLocalProcess waits for the specified process for up to timeoutSecs seconds. // // Notes: @@ -334,43 +364,7 @@ func WaitLocalProcess(pid int, timeoutSecs uint, initialSignal syscall.Signal, l } } - pidRunning := true - - secs := time.Duration(timeoutSecs) - timeout := time.After(secs * time.Second) - - // Wait for the VM process to terminate -outer: - for { - select { - case <-time.After(50 * time.Millisecond): - // Check if the process is running periodically to avoid a busy loop - - var _status syscall.WaitStatus - var _rusage syscall.Rusage - var waitedPid int - - // "A watched pot never boils" and an unwaited-for process never appears to die! - waitedPid, err = syscall.Wait4(pid, &_status, syscall.WNOHANG, &_rusage) - - if waitedPid == pid && err == nil { - pidRunning = false - break outer - } - - if err = syscall.Kill(pid, syscall.Signal(0)); err != nil { - pidRunning = false - break outer - } - - break - - case <-timeout: - logger.Warnf("process %v still running after waiting %ds", pid, timeoutSecs) - - break outer - } - } + pidRunning := waitForProcessCompletion(pid, timeoutSecs, logger) if pidRunning { // Force process to die From e9ba0c11d0e29fcc4777694d22bdadf401315450 Mon Sep 17 00:00:00 2001 From: Alexandru Matei Date: Mon, 5 Dec 2022 11:42:47 +0200 Subject: [PATCH 3/4] runtime: use exponential backoff for process wait Initial wait period between checks is 1ms, and the next ones are min(wait_period*5, 50ms) Signed-off-by: Alexandru Matei --- src/runtime/virtcontainers/utils/utils.go | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go index 049fb0b362..9c0526da6d 100644 --- a/src/runtime/virtcontainers/utils/utils.go +++ b/src/runtime/virtcontainers/utils/utils.go @@ -25,6 +25,8 @@ const cpBinaryName = "cp" const fileMode0755 = os.FileMode(0755) +const maxWaitDelay = 50 * time.Millisecond + // The DefaultRateLimiterRefillTime is used for calculating the rate at // which a TokenBucket is replinished, in cases where a RateLimiter is // applied to either network or disk I/O. @@ -307,11 +309,18 @@ func ConvertAddressFamily(family int32) pbTypes.IPFamily { } func waitProcessUsingWaitLoop(pid int, timeoutSecs uint, logger *logrus.Entry) bool { - secs := time.Duration(timeoutSecs) - timeout := time.After(secs * time.Second) + secs := time.Duration(timeoutSecs) * time.Second + timeout := time.After(secs) + delay := 1 * time.Millisecond for { - // Check if the process is running periodically to avoid a busy loop + // Wait4 is used to reap and check that a child terminated. + // Without the Wait4 call, Kill(0) for a child will always exit without + // error because the process isn't reaped. + // Wait4 return ECHLD error for non-child processes. Kill(0) is meant + // to address this case, once the process is reaped by init process, + // the call will return ESRCH error. + // "A watched pot never boils" and an unwaited-for process never appears to die! waitedPid, err := syscall.Wait4(pid, nil, syscall.WNOHANG, nil) @@ -324,7 +333,12 @@ func waitProcessUsingWaitLoop(pid int, timeoutSecs uint, logger *logrus.Entry) b } select { - case <-time.After(50 * time.Millisecond): + case <-time.After(delay): + delay = delay * 5 + + if delay > maxWaitDelay { + delay = maxWaitDelay + } case <-timeout: logger.Warnf("process %v still running after waiting %ds", pid, timeoutSecs) return true From d04d45ea0509f78f9950f1af402f3b13fb815f8e Mon Sep 17 00:00:00 2001 From: Alexandru Matei Date: Mon, 21 Nov 2022 12:05:43 +0200 Subject: [PATCH 4/4] runtime: use pidfd to wait for processes on Linux Use pidfd_open and poll on newer versions of Linux to wait for the process to exit. For older versions use existing wait logic Fixes: #5617 Signed-off-by: Alexandru Matei --- src/runtime/virtcontainers/utils/utils.go | 4 -- .../virtcontainers/utils/utils_darwin.go | 6 ++ .../virtcontainers/utils/utils_linux.go | 59 +++++++++++++++++++ 3 files changed, 65 insertions(+), 4 deletions(-) diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go index 9c0526da6d..bac62e8cfd 100644 --- a/src/runtime/virtcontainers/utils/utils.go +++ b/src/runtime/virtcontainers/utils/utils.go @@ -346,10 +346,6 @@ func waitProcessUsingWaitLoop(pid int, timeoutSecs uint, logger *logrus.Entry) b } } -func waitForProcessCompletion(pid int, timeoutSecs uint, logger *logrus.Entry) bool { - return waitProcessUsingWaitLoop(pid, timeoutSecs, logger) -} - // WaitLocalProcess waits for the specified process for up to timeoutSecs seconds. // // Notes: diff --git a/src/runtime/virtcontainers/utils/utils_darwin.go b/src/runtime/virtcontainers/utils/utils_darwin.go index 54b2124ce1..db1d9ea750 100644 --- a/src/runtime/virtcontainers/utils/utils_darwin.go +++ b/src/runtime/virtcontainers/utils/utils_darwin.go @@ -5,6 +5,12 @@ package utils +import "github.com/sirupsen/logrus" + func GetDevicePathAndFsTypeOptions(mountPoint string) (devicePath, fsType string, fsOptions []string, err error) { return } + +func waitForProcessCompletion(pid int, timeoutSecs uint, logger *logrus.Entry) bool { + return waitProcessUsingWaitLoop(pid, timeoutSecs, logger) +} diff --git a/src/runtime/virtcontainers/utils/utils_linux.go b/src/runtime/virtcontainers/utils/utils_linux.go index c223fb93bb..40c5c360ea 100644 --- a/src/runtime/virtcontainers/utils/utils_linux.go +++ b/src/runtime/virtcontainers/utils/utils_linux.go @@ -14,8 +14,10 @@ import ( "os" "strings" "syscall" + "time" "unsafe" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -151,3 +153,60 @@ func IsAPVFIOMediatedDevice(sysfsdev string) bool { } return false } + +func waitProcessUsingPidfd(pid int, timeoutSecs uint, logger *logrus.Entry) (bool, error) { + pidfd, err := unix.PidfdOpen(pid, 0) + + if err != nil { + if err == unix.ESRCH { + return false, nil + } + + return true, err + } + + defer unix.Close(pidfd) + var n int + + maxDelay := time.Duration(timeoutSecs) * time.Second + end := time.Now().Add(maxDelay) + + for { + remaining := time.Until(end).Milliseconds() + if remaining < 0 { + remaining = 0 + } + + n, err = unix.Poll([]unix.PollFd{{Fd: int32(pidfd), Events: unix.POLLIN}}, int(remaining)) + if err != unix.EINTR { + break + } + } + + if err != nil || n != 1 { + logger.Warnf("process %v still running after waiting %ds", pid, timeoutSecs) + return true, err + } + + for { + err := unix.Waitid(unix.P_PIDFD, pidfd, nil, unix.WEXITED, nil) + if err == unix.EINVAL { + err = unix.Waitid(unix.P_PID, pid, nil, unix.WEXITED, nil) + } + + if err != unix.EINTR { + break + } + } + return false, nil +} + +func waitForProcessCompletion(pid int, timeoutSecs uint, logger *logrus.Entry) bool { + pidRunning, err := waitProcessUsingPidfd(pid, timeoutSecs, logger) + + if err == unix.ENOSYS { + pidRunning = waitProcessUsingWaitLoop(pid, timeoutSecs, logger) + } + + return pidRunning +}