From 2887e707f871d7c7db95ad277d3fad4b67652f97 Mon Sep 17 00:00:00 2001 From: Andrew Sy Kim Date: Thu, 20 Aug 2020 20:44:52 -0400 Subject: [PATCH] kubelet: respect probe exec timeout from CRI by returning utilexec.CodeExitError This fixes a bug where the exec timeouts are not respected with containerd. Exec prober expects a utilexec.CodeExitError on failed probes, otherwise the prober returns 'Unknown' and a non-nil error which the kubelet throws away. As a temporary fix, ExecSync as part of the CRI remote runtime should return utilexec.CodeExitError when the grpc error code is DeadlineExceeded. This ensures the exec prober registers exec timeouts as real probe failures to the kubelet. We should also add a TimedoutError type to k8s.io/utils/exec since it doesn't really make sense to use CodeExitError for exec timeouts. Signed-off-by: Andrew Sy Kim --- pkg/kubelet/cri/remote/BUILD | 2 ++ pkg/kubelet/cri/remote/remote_runtime.go | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/pkg/kubelet/cri/remote/BUILD b/pkg/kubelet/cri/remote/BUILD index 5ea0f5db56b..ea28193df68 100644 --- a/pkg/kubelet/cri/remote/BUILD +++ b/pkg/kubelet/cri/remote/BUILD @@ -21,6 +21,8 @@ go_library( "//staging/src/k8s.io/cri-api/pkg/apis:go_default_library", "//staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2:go_default_library", "//vendor/google.golang.org/grpc:go_default_library", + "//vendor/google.golang.org/grpc/codes:go_default_library", + "//vendor/google.golang.org/grpc/status:go_default_library", "//vendor/k8s.io/klog/v2:go_default_library", "//vendor/k8s.io/utils/exec:go_default_library", ], diff --git a/pkg/kubelet/cri/remote/remote_runtime.go b/pkg/kubelet/cri/remote/remote_runtime.go index d51d1876222..80ad3aad193 100644 --- a/pkg/kubelet/cri/remote/remote_runtime.go +++ b/pkg/kubelet/cri/remote/remote_runtime.go @@ -24,6 +24,8 @@ import ( "time" "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" "k8s.io/klog/v2" 
"k8s.io/component-base/logs/logreduction" @@ -387,6 +389,17 @@ func (r *remoteRuntimeService) ExecSync(containerID string, cmd []string, timeou resp, err := r.runtimeClient.ExecSync(ctx, req) if err != nil { klog.Errorf("ExecSync %s '%s' from runtime service failed: %v", containerID, strings.Join(cmd, " "), err) + + // If exec timed out, return utilexec.CodeExitError with an exit status as expected + // from prober for failed probes. + // TODO: utilexec should have a TimedoutError type and we should return it here once available. + if status.Code(err) == codes.DeadlineExceeded { + err = utilexec.CodeExitError{ + Err: fmt.Errorf("command %q timed out", strings.Join(cmd, " ")), + Code: 1, // exit code here doesn't really matter, as long as it's not 0 + } + } + return nil, nil, err }