From 2e86db78cf5ced72268a691196b4d49e349ebf58 Mon Sep 17 00:00:00 2001
From: Feng Wang <feng.wang@databricks.com>
Date: Fri, 11 Mar 2022 10:44:26 -0800
Subject: [PATCH] runtime: Properly handle ESRCH error when signaling container

Currently kata shim v2 doesn't translate the ESRCH error, causing the
container to fail to stop and the shim to leak.

Fixes: #3874

Signed-off-by: Feng Wang <feng.wang@databricks.com>
(cherry picked from commit aa5ae6b17c688a3fa22c28d1ec3a71a6cabb9c57)
---
 src/runtime/virtcontainers/container.go   | 14 +++++++++++++-
 src/runtime/virtcontainers/utils/utils.go |  1 +
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/runtime/virtcontainers/container.go b/src/runtime/virtcontainers/container.go
index d482f2a129..8b031dba80 100644
--- a/src/runtime/virtcontainers/container.go
+++ b/src/runtime/virtcontainers/container.go
@@ -14,6 +14,7 @@ import (
 	"os"
 	"path/filepath"
 	"strconv"
+	"strings"
 	"syscall"
 	"time"
 
@@ -1144,7 +1145,18 @@ func (c *Container) signalProcess(ctx context.Context, processID string, signal
 		return fmt.Errorf("Container not ready, running or paused, impossible to signal the container")
 	}
 
-	return c.sandbox.agent.signalProcess(ctx, c, processID, signal, all)
+	// kill(2) method can return ESRCH in certain cases, which is not handled by containerd cri server in container_stop.go.
+	// CRIO server also doesn't handle ESRCH. So kata runtime will swallow it here.
+	var err error
+	if err = c.sandbox.agent.signalProcess(ctx, c, processID, signal, all); err != nil &&
+		strings.Contains(err.Error(), "ESRCH: No such process") {
+		c.Logger().WithFields(logrus.Fields{
+			"container":  c.id,
+			"process-id": processID,
+		}).Warn("signal encounters ESRCH, process already finished")
+		return nil
+	}
+	return err
 }
 
 func (c *Container) winsizeProcess(ctx context.Context, processID string, height, width uint32) error {
diff --git a/src/runtime/virtcontainers/utils/utils.go b/src/runtime/virtcontainers/utils/utils.go
index caf5965858..35a7eaf7a9 100644
--- a/src/runtime/virtcontainers/utils/utils.go
+++ b/src/runtime/virtcontainers/utils/utils.go
@@ -321,6 +321,7 @@ func WaitLocalProcess(pid int, timeoutSecs uint, initialSignal syscall.Signal, l
 	if initialSignal != syscall.Signal(0) {
 		if err = syscall.Kill(pid, initialSignal); err != nil {
 			if err == syscall.ESRCH {
+				logger.WithField("pid", pid).Warnf("kill encounters ESRCH, process already finished")
 				return nil
 			}