diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index 53c8bdf19c3..8bcf72ecdbc 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -668,6 +668,14 @@ const ( // alpha: v1.21 // LoadBalancerIPMode enables the IPMode field in the LoadBalancerIngress status of a Service LoadBalancerIPMode featuregate.Feature = "LoadBalancerIPMode" + + // owner: @andrewsykim @SergeyKanzhelev + // GA: v1.20 + // + // Ensure kubelet respects exec probe timeouts. Feature gate exists in-case existing workloads + // may depend on old behavior where exec probe timeouts were ignored. + // Lock to default in v1.21 and remove in v1.22. + ExecProbeTimeout featuregate.Feature = "ExecProbeTimeout" ) func init() { @@ -769,6 +777,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS RootCAConfigMap: {Default: true, PreRelease: featuregate.Beta}, SizeMemoryBackedVolumes: {Default: false, PreRelease: featuregate.Alpha}, LoadBalancerIPMode: {Default: false, PreRelease: featuregate.Alpha}, + ExecProbeTimeout: {Default: true, PreRelease: featuregate.GA}, // lock to default in v1.21 and remove in v1.22 // inherited features from generic apiserver, relisted here to get a conflict if it is changed // unintentionally on either side: diff --git a/pkg/kubelet/cri/remote/BUILD b/pkg/kubelet/cri/remote/BUILD index 5ea0f5db56b..d6cd6b7cf5b 100644 --- a/pkg/kubelet/cri/remote/BUILD +++ b/pkg/kubelet/cri/remote/BUILD @@ -17,10 +17,13 @@ go_library( importpath = "k8s.io/kubernetes/pkg/kubelet/cri/remote", deps = [ "//pkg/kubelet/cri/remote/util:go_default_library", + "//pkg/probe/exec:go_default_library", "//staging/src/k8s.io/component-base/logs/logreduction:go_default_library", "//staging/src/k8s.io/cri-api/pkg/apis:go_default_library", "//staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2:go_default_library", "//vendor/google.golang.org/grpc:go_default_library", + "//vendor/google.golang.org/grpc/codes:go_default_library", + "//vendor/google.golang.org/grpc/status:go_default_library", "//vendor/k8s.io/klog/v2:go_default_library", "//vendor/k8s.io/utils/exec:go_default_library", ], diff --git a/pkg/kubelet/cri/remote/remote_runtime.go b/pkg/kubelet/cri/remote/remote_runtime.go index d51d1876222..87b41bbf16f 100644 --- a/pkg/kubelet/cri/remote/remote_runtime.go +++ b/pkg/kubelet/cri/remote/remote_runtime.go @@ -24,12 +24,15 @@ import ( "time" "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" "k8s.io/klog/v2" "k8s.io/component-base/logs/logreduction" internalapi "k8s.io/cri-api/pkg/apis" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2" "k8s.io/kubernetes/pkg/kubelet/cri/remote/util" + "k8s.io/kubernetes/pkg/probe/exec" utilexec "k8s.io/utils/exec" ) @@ -387,6 +390,12 @@ func (r *remoteRuntimeService) ExecSync(containerID string, cmd []string, timeou resp, err := r.runtimeClient.ExecSync(ctx, req) if err != nil { klog.Errorf("ExecSync %s '%s' from runtime service failed: %v", containerID, strings.Join(cmd, " "), err) + + // interpret DeadlineExceeded gRPC errors as timedout probes + if status.Code(err) == codes.DeadlineExceeded { + err = exec.NewTimeoutError(fmt.Errorf("command %q timed out", strings.Join(cmd, " ")), timeout) + } + return nil, nil, err } diff --git a/pkg/kubelet/cri/.import-restrictions b/pkg/kubelet/cri/streaming/.import-restrictions similarity index 100% rename from pkg/kubelet/cri/.import-restrictions rename to pkg/kubelet/cri/streaming/.import-restrictions diff --git a/pkg/kubelet/dockershim/BUILD b/pkg/kubelet/dockershim/BUILD index 06e3c04e599..4bd4958e11e 100644 --- a/pkg/kubelet/dockershim/BUILD +++ b/pkg/kubelet/dockershim/BUILD @@ -37,6 +37,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/credentialprovider:go_default_library", + "//pkg/features:go_default_library", "//pkg/kubelet/apis/config:go_default_library", "//pkg/kubelet/checkpointmanager:go_default_library", "//pkg/kubelet/checkpointmanager/checksum:go_default_library", @@ -55,11 +56,13 @@ go_library( "//pkg/kubelet/types:go_default_library", "//pkg/kubelet/util/cache:go_default_library", "//pkg/kubelet/util/ioutils:go_default_library", + "//pkg/probe/exec:go_default_library", "//pkg/util/parsers:go_default_library", "//staging/src/k8s.io/api/core/v1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/types:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/util/errors:go_default_library", + "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", "//staging/src/k8s.io/client-go/tools/remotecommand:go_default_library", "//staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2:go_default_library", "//vendor/github.com/armon/circbuf:go_default_library", diff --git a/pkg/kubelet/dockershim/exec.go b/pkg/kubelet/dockershim/exec.go index 31e00bda18b..d79cff31d97 100644 --- a/pkg/kubelet/dockershim/exec.go +++ b/pkg/kubelet/dockershim/exec.go @@ -21,13 +21,17 @@ package dockershim import ( "fmt" "io" + "strings" "time" dockertypes "github.com/docker/docker/api/types" "k8s.io/klog/v2" + utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/tools/remotecommand" + "k8s.io/kubernetes/pkg/features" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" + "k8s.io/kubernetes/pkg/probe/exec" "k8s.io/kubernetes/pkg/kubelet/dockershim/libdocker" ) @@ -110,29 +114,48 @@ func (*NativeExecHandler) ExecInContainer(client libdocker.Interface, container return err } + // if ExecProbeTimeout feature gate is disabled, preserve existing behavior to ignore exec timeouts + var execTimeout <-chan time.Time + if timeout > 0 && utilfeature.DefaultFeatureGate.Enabled(features.ExecProbeTimeout) { + execTimeout = time.After(timeout) + } else { + // skip exec timeout if provided timeout is 0 + execTimeout = nil + } + ticker := time.NewTicker(2 * time.Second) defer ticker.Stop() count := 0 for { - inspect, err2 := client.InspectExec(execObj.ID) - if err2 != nil { - return err2 - } - if !inspect.Running { - if inspect.ExitCode != 0 { - err = &dockerExitError{inspect} + select { + case <-execTimeout: + return exec.NewTimeoutError(fmt.Errorf("command %q timed out", strings.Join(cmd, " ")), timeout) + // need to use "default" here instead of <-ticker.C, otherwise we delay the initial InspectExec by 2 seconds. + default: + inspect, inspectErr := client.InspectExec(execObj.ID) + if inspectErr != nil { + return inspectErr } - break - } - count++ - if count == 5 { - klog.Errorf("Exec session %s in container %s terminated but process still running!", execObj.ID, container.ID) - break - } + if !inspect.Running { + if inspect.ExitCode != 0 { + return &dockerExitError{inspect} + } - <-ticker.C + return nil + } + + // Only limit the amount of InspectExec calls if the exec timeout was not set. + // When a timeout is not set, we stop polling the exec session after 5 attempts and allow the process to continue running. + if execTimeout == nil { + count++ + if count == 5 { + klog.Errorf("Exec session %s in container %s terminated but process still running!", execObj.ID, container.ID) + return nil + } + } + + <-ticker.C + } } - - return err } diff --git a/pkg/probe/exec/BUILD b/pkg/probe/exec/BUILD index ce381191610..80186df8ca9 100644 --- a/pkg/probe/exec/BUILD +++ b/pkg/probe/exec/BUILD @@ -8,11 +8,16 @@ load( go_library( name = "go_default_library", - srcs = ["exec.go"], + srcs = [ + "errors.go", + "exec.go", + ], importpath = "k8s.io/kubernetes/pkg/probe/exec", deps = [ + "//pkg/features:go_default_library", "//pkg/kubelet/util/ioutils:go_default_library", "//pkg/probe:go_default_library", + "//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library", "//vendor/k8s.io/klog/v2:go_default_library", "//vendor/k8s.io/utils/exec:go_default_library", ], diff --git a/pkg/probe/exec/errors.go b/pkg/probe/exec/errors.go new file mode 100644 index 00000000000..6659a94ac7b --- /dev/null +++ b/pkg/probe/exec/errors.go @@ -0,0 +1,47 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package exec + +import ( + "time" +) + +// NewTimeoutError returns a new TimeoutError. +func NewTimeoutError(err error, timeout time.Duration) *TimeoutError { + return &TimeoutError{ + err: err, + timeout: timeout, + } +} + +// TimeoutError is an error returned on exec probe timeouts. It should be returned by CRI implementations +// in order for the exec prober to interpret exec timeouts as failed probes. +// TODO: this error type can likely be removed when we support CRI errors. +type TimeoutError struct { + err error + timeout time.Duration +} + +// Error returns the error string. +func (t *TimeoutError) Error() string { + return t.err.Error() +} + +// Timeout returns the timeout duration of the exec probe. +func (t *TimeoutError) Timeout() time.Duration { + return t.timeout +} diff --git a/pkg/probe/exec/exec.go b/pkg/probe/exec/exec.go index 9aa894672ad..164d21bd112 100644 --- a/pkg/probe/exec/exec.go +++ b/pkg/probe/exec/exec.go @@ -19,6 +19,8 @@ package exec import ( "bytes" + utilfeature "k8s.io/apiserver/pkg/util/feature" + "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/util/ioutils" "k8s.io/kubernetes/pkg/probe" @@ -66,6 +68,16 @@ func (pr execProber) Probe(e exec.Cmd) (probe.Result, string, error) { } return probe.Failure, string(data), nil } + + timeoutErr, ok := err.(*TimeoutError) + if ok { + if utilfeature.DefaultFeatureGate.Enabled(features.ExecProbeTimeout) { + return probe.Failure, string(data), nil + } + + klog.Warningf("Exec probe timed out after %s but ExecProbeTimeout feature gate was disabled", timeoutErr.Timeout()) + } + return probe.Unknown, "", err } return probe.Success, string(data), nil diff --git a/test/e2e/common/container_probe.go b/test/e2e/common/container_probe.go index 6fb540f4b9e..d85cb6d1a3d 100644 --- a/test/e2e/common/container_probe.go +++ b/test/e2e/common/container_probe.go @@ -32,7 +32,6 @@ import ( "k8s.io/kubernetes/test/e2e/framework" e2eevents "k8s.io/kubernetes/test/e2e/framework/events" e2epod "k8s.io/kubernetes/test/e2e/framework/pod" - e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" testutils "k8s.io/kubernetes/test/utils" "github.com/onsi/ginkgo" @@ -213,8 +212,6 @@ var _ = framework.KubeDescribe("Probing container", func() { Description: A Pod is created with liveness probe with a Exec action on the Pod. If the liveness probe call does not return within the timeout specified, liveness probe MUST restart the Pod. */ ginkgo.It("should be restarted with a docker exec liveness probe with timeout ", func() { - // TODO: enable this test once the default exec handler supports timeout. - e2eskipper.Skipf("The default exec handler, dockertools.NativeExecHandler, does not support timeouts due to a limitation in the Docker Remote API") cmd := []string{"/bin/sh", "-c", "sleep 600"} livenessProbe := &v1.Probe{ Handler: execHandler([]string{"/bin/sh", "-c", "sleep 10"}), @@ -226,6 +223,23 @@ var _ = framework.KubeDescribe("Probing container", func() { RunLivenessTest(f, pod, 1, defaultObservationTimeout) }) + /* + Release: v1.20 + Testname: Pod readiness probe, docker exec, not ready + Description: A Pod is created with readiness probe with a Exec action on the Pod. If the readiness probe call does not return within the timeout specified, readiness probe MUST not be Ready. + */ + ginkgo.It("should not be ready with a docker exec readiness probe timeout ", func() { + cmd := []string{"/bin/sh", "-c", "sleep 600"} + readinessProbe := &v1.Probe{ + Handler: execHandler([]string{"/bin/sh", "-c", "sleep 10"}), + InitialDelaySeconds: 15, + TimeoutSeconds: 1, + FailureThreshold: 1, + } + pod := busyBoxPodSpec(readinessProbe, nil, cmd) + runReadinessFailTest(f, pod, time.Minute) + }) + /* Release: v1.14 Testname: Pod http liveness probe, redirected to a local address @@ -625,3 +639,35 @@ func RunLivenessTest(f *framework.Framework, pod *v1.Pod, expectNumRestarts int, ns, pod.Name, expectNumRestarts, observedRestarts) } } + +func runReadinessFailTest(f *framework.Framework, pod *v1.Pod, notReadyUntil time.Duration) { + podClient := f.PodClient() + ns := f.Namespace.Name + gomega.Expect(pod.Spec.Containers).NotTo(gomega.BeEmpty()) + + // At the end of the test, clean up by removing the pod. + defer func() { + ginkgo.By("deleting the pod") + podClient.Delete(context.TODO(), pod.Name, *metav1.NewDeleteOptions(0)) + }() + ginkgo.By(fmt.Sprintf("Creating pod %s in namespace %s", pod.Name, ns)) + podClient.Create(pod) + + // Wait until the pod is not pending. (Here we need to check for something other than + // 'Pending', since when failures occur, we go to 'Terminated' which can cause indefinite blocking.) + framework.ExpectNoError(e2epod.WaitForPodNotPending(f.ClientSet, ns, pod.Name), + fmt.Sprintf("starting pod %s in namespace %s", pod.Name, ns)) + framework.Logf("Started pod %s in namespace %s", pod.Name, ns) + + // Wait for the not ready state to be true for notReadyUntil duration + deadline := time.Now().Add(notReadyUntil) + for start := time.Now(); time.Now().Before(deadline); time.Sleep(2 * time.Second) { + // poll for Not Ready + if podutil.IsPodReady(pod) { + framework.Failf("pod %s/%s - expected to be not ready", ns, pod.Name) + } + + framework.Logf("pod %s/%s is not ready (%v elapsed)", + ns, pod.Name, time.Since(start)) + } +}