runtime: stop getting OOM events on "ttrpc: closed" error

getOOMEvents is a long-waiting call that is retried whenever it fails.
When the agent has shut down, the retries should stop.

When the death of the agent has not yet been detected, we can also
check whether the error is "ttrpc: closed".

Fixes: #3815

Signed-off-by: bin <bin@hyper.sh>
This commit is contained in:
bin 2022-03-29 16:39:01 +08:00
parent 9495316145
commit fb8be96194

View File

@ -15,7 +15,6 @@ import (
"github.com/containerd/containerd/api/types/task"
"github.com/containerd/containerd/mount"
"github.com/sirupsen/logrus"
"google.golang.org/grpc/codes"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
)
@ -156,13 +155,11 @@ func watchOOMEvents(ctx context.Context, s *service) {
default:
containerID, err := s.sandbox.GetOOMEvent(ctx)
if err != nil {
shimLog.WithError(err).Warn("failed to get OOM event from sandbox")
// If the GetOOMEvent call is not implemented, then the agent is most likely an older version,
// stop attempting to get OOM events.
// for rust agent, the response code is not found
if isGRPCErrorCode(codes.NotFound, err) || err.Error() == "Dead agent" {
if err.Error() == "ttrpc: closed" || err.Error() == "Dead agent" {
shimLog.WithError(err).Warn("agent has shutdown, return from watching of OOM events")
return
}
shimLog.WithError(err).Warn("failed to get OOM event from sandbox")
time.Sleep(defaultCheckInterval)
continue
}