Merge pull request #94115 from andrewsykim/fix-dockershim-exec

kubelet: respect exec probe timeouts
This commit is contained in:
Kubernetes Prow Robot 2020-11-09 16:08:55 -08:00 committed by GitHub
commit fe37798329
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 178 additions and 21 deletions

View File

@ -668,6 +668,14 @@ const (
// alpha: v1.21
// LoadBalancerIPMode enables the IPMode field in the LoadBalancerIngress status of a Service
LoadBalancerIPMode featuregate.Feature = "LoadBalancerIPMode"
// owner: @andrewsykim @SergeyKanzhelev
// GA: v1.20
//
// Ensure kubelet respects exec probe timeouts. Feature gate exists in-case existing workloads
// may depend on old behavior where exec probe timeouts were ignored.
// Lock to default in v1.21 and remove in v1.22.
ExecProbeTimeout featuregate.Feature = "ExecProbeTimeout"
)
func init() {
@ -769,6 +777,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
RootCAConfigMap: {Default: true, PreRelease: featuregate.Beta},
SizeMemoryBackedVolumes: {Default: false, PreRelease: featuregate.Alpha},
LoadBalancerIPMode: {Default: false, PreRelease: featuregate.Alpha},
ExecProbeTimeout: {Default: true, PreRelease: featuregate.GA}, // lock to default in v1.21 and remove in v1.22
// inherited features from generic apiserver, relisted here to get a conflict if it is changed
// unintentionally on either side:

View File

@ -17,10 +17,13 @@ go_library(
importpath = "k8s.io/kubernetes/pkg/kubelet/cri/remote",
deps = [
"//pkg/kubelet/cri/remote/util:go_default_library",
"//pkg/probe/exec:go_default_library",
"//staging/src/k8s.io/component-base/logs/logreduction:go_default_library",
"//staging/src/k8s.io/cri-api/pkg/apis:go_default_library",
"//staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2:go_default_library",
"//vendor/google.golang.org/grpc:go_default_library",
"//vendor/google.golang.org/grpc/codes:go_default_library",
"//vendor/google.golang.org/grpc/status:go_default_library",
"//vendor/k8s.io/klog/v2:go_default_library",
"//vendor/k8s.io/utils/exec:go_default_library",
],

View File

@ -24,12 +24,15 @@ import (
"time"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"k8s.io/klog/v2"
"k8s.io/component-base/logs/logreduction"
internalapi "k8s.io/cri-api/pkg/apis"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
"k8s.io/kubernetes/pkg/kubelet/cri/remote/util"
"k8s.io/kubernetes/pkg/probe/exec"
utilexec "k8s.io/utils/exec"
)
@ -387,6 +390,12 @@ func (r *remoteRuntimeService) ExecSync(containerID string, cmd []string, timeou
resp, err := r.runtimeClient.ExecSync(ctx, req)
if err != nil {
klog.Errorf("ExecSync %s '%s' from runtime service failed: %v", containerID, strings.Join(cmd, " "), err)
// interpret DeadlineExceeded gRPC errors as timedout probes
if status.Code(err) == codes.DeadlineExceeded {
err = exec.NewTimeoutError(fmt.Errorf("command %q timed out", strings.Join(cmd, " ")), timeout)
}
return nil, nil, err
}

View File

@ -37,6 +37,7 @@ go_library(
visibility = ["//visibility:public"],
deps = [
"//pkg/credentialprovider:go_default_library",
"//pkg/features:go_default_library",
"//pkg/kubelet/apis/config:go_default_library",
"//pkg/kubelet/checkpointmanager:go_default_library",
"//pkg/kubelet/checkpointmanager/checksum:go_default_library",
@ -55,11 +56,13 @@ go_library(
"//pkg/kubelet/types:go_default_library",
"//pkg/kubelet/util/cache:go_default_library",
"//pkg/kubelet/util/ioutils:go_default_library",
"//pkg/probe/exec:go_default_library",
"//pkg/util/parsers:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/errors:go_default_library",
"//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
"//staging/src/k8s.io/client-go/tools/remotecommand:go_default_library",
"//staging/src/k8s.io/cri-api/pkg/apis/runtime/v1alpha2:go_default_library",
"//vendor/github.com/armon/circbuf:go_default_library",

View File

@ -21,13 +21,17 @@ package dockershim
import (
"fmt"
"io"
"strings"
"time"
dockertypes "github.com/docker/docker/api/types"
"k8s.io/klog/v2"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/tools/remotecommand"
"k8s.io/kubernetes/pkg/features"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/probe/exec"
"k8s.io/kubernetes/pkg/kubelet/dockershim/libdocker"
)
@ -110,29 +114,48 @@ func (*NativeExecHandler) ExecInContainer(client libdocker.Interface, container
return err
}
// if ExecProbeTimeout feature gate is disabled, preserve existing behavior to ignore exec timeouts
var execTimeout <-chan time.Time
if timeout > 0 && utilfeature.DefaultFeatureGate.Enabled(features.ExecProbeTimeout) {
execTimeout = time.After(timeout)
} else {
// skip exec timeout if provided timeout is 0
execTimeout = nil
}
ticker := time.NewTicker(2 * time.Second)
defer ticker.Stop()
count := 0
for {
inspect, err2 := client.InspectExec(execObj.ID)
if err2 != nil {
return err2
}
if !inspect.Running {
if inspect.ExitCode != 0 {
err = &dockerExitError{inspect}
select {
case <-execTimeout:
return exec.NewTimeoutError(fmt.Errorf("command %q timed out", strings.Join(cmd, " ")), timeout)
// need to use "default" here instead of <-ticker.C, otherwise we delay the initial InspectExec by 2 seconds.
default:
inspect, inspectErr := client.InspectExec(execObj.ID)
if inspectErr != nil {
return inspectErr
}
break
}
count++
if count == 5 {
klog.Errorf("Exec session %s in container %s terminated but process still running!", execObj.ID, container.ID)
break
}
if !inspect.Running {
if inspect.ExitCode != 0 {
return &dockerExitError{inspect}
}
<-ticker.C
return nil
}
// Only limit the amount of InspectExec calls if the exec timeout was not set.
// When a timeout is not set, we stop polling the exec session after 5 attempts and allow the process to continue running.
if execTimeout == nil {
count++
if count == 5 {
klog.Errorf("Exec session %s in container %s terminated but process still running!", execObj.ID, container.ID)
return nil
}
}
<-ticker.C
}
}
return err
}

View File

@ -8,11 +8,16 @@ load(
go_library(
name = "go_default_library",
srcs = ["exec.go"],
srcs = [
"errors.go",
"exec.go",
],
importpath = "k8s.io/kubernetes/pkg/probe/exec",
deps = [
"//pkg/features:go_default_library",
"//pkg/kubelet/util/ioutils:go_default_library",
"//pkg/probe:go_default_library",
"//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
"//vendor/k8s.io/klog/v2:go_default_library",
"//vendor/k8s.io/utils/exec:go_default_library",
],

47
pkg/probe/exec/errors.go Normal file
View File

@ -0,0 +1,47 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package exec
import (
"time"
)
// NewTimeoutError returns a new TimeoutError.
func NewTimeoutError(err error, timeout time.Duration) *TimeoutError {
return &TimeoutError{
err: err,
timeout: timeout,
}
}
// TimeoutError is an error returned on exec probe timeouts. It should be returned by CRI implementations
// in order for the exec prober to interpret exec timeouts as failed probes.
// TODO: this error type can likely be removed when we support CRI errors.
type TimeoutError struct {
err error
timeout time.Duration
}
// Error returns the error string.
func (t *TimeoutError) Error() string {
return t.err.Error()
}
// Timeout returns the timeout duration of the exec probe.
func (t *TimeoutError) Timeout() time.Duration {
return t.timeout
}

View File

@ -19,6 +19,8 @@ package exec
import (
"bytes"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/util/ioutils"
"k8s.io/kubernetes/pkg/probe"
@ -66,6 +68,16 @@ func (pr execProber) Probe(e exec.Cmd) (probe.Result, string, error) {
}
return probe.Failure, string(data), nil
}
timeoutErr, ok := err.(*TimeoutError)
if ok {
if utilfeature.DefaultFeatureGate.Enabled(features.ExecProbeTimeout) {
return probe.Failure, string(data), nil
}
klog.Warningf("Exec probe timed out after %s but ExecProbeTimeout feature gate was disabled", timeoutErr.Timeout())
}
return probe.Unknown, "", err
}
return probe.Success, string(data), nil

View File

@ -32,7 +32,6 @@ import (
"k8s.io/kubernetes/test/e2e/framework"
e2eevents "k8s.io/kubernetes/test/e2e/framework/events"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
testutils "k8s.io/kubernetes/test/utils"
"github.com/onsi/ginkgo"
@ -213,8 +212,6 @@ var _ = framework.KubeDescribe("Probing container", func() {
Description: A Pod is created with liveness probe with a Exec action on the Pod. If the liveness probe call does not return within the timeout specified, liveness probe MUST restart the Pod.
*/
ginkgo.It("should be restarted with a docker exec liveness probe with timeout ", func() {
// TODO: enable this test once the default exec handler supports timeout.
e2eskipper.Skipf("The default exec handler, dockertools.NativeExecHandler, does not support timeouts due to a limitation in the Docker Remote API")
cmd := []string{"/bin/sh", "-c", "sleep 600"}
livenessProbe := &v1.Probe{
Handler: execHandler([]string{"/bin/sh", "-c", "sleep 10"}),
@ -226,6 +223,23 @@ var _ = framework.KubeDescribe("Probing container", func() {
RunLivenessTest(f, pod, 1, defaultObservationTimeout)
})
/*
Release: v1.20
Testname: Pod readiness probe, docker exec, not ready
Description: A Pod is created with readiness probe with a Exec action on the Pod. If the readiness probe call does not return within the timeout specified, readiness probe MUST not be Ready.
*/
ginkgo.It("should not be ready with a docker exec readiness probe timeout ", func() {
cmd := []string{"/bin/sh", "-c", "sleep 600"}
readinessProbe := &v1.Probe{
Handler: execHandler([]string{"/bin/sh", "-c", "sleep 10"}),
InitialDelaySeconds: 15,
TimeoutSeconds: 1,
FailureThreshold: 1,
}
pod := busyBoxPodSpec(readinessProbe, nil, cmd)
runReadinessFailTest(f, pod, time.Minute)
})
/*
Release: v1.14
Testname: Pod http liveness probe, redirected to a local address
@ -625,3 +639,35 @@ func RunLivenessTest(f *framework.Framework, pod *v1.Pod, expectNumRestarts int,
ns, pod.Name, expectNumRestarts, observedRestarts)
}
}
func runReadinessFailTest(f *framework.Framework, pod *v1.Pod, notReadyUntil time.Duration) {
podClient := f.PodClient()
ns := f.Namespace.Name
gomega.Expect(pod.Spec.Containers).NotTo(gomega.BeEmpty())
// At the end of the test, clean up by removing the pod.
defer func() {
ginkgo.By("deleting the pod")
podClient.Delete(context.TODO(), pod.Name, *metav1.NewDeleteOptions(0))
}()
ginkgo.By(fmt.Sprintf("Creating pod %s in namespace %s", pod.Name, ns))
podClient.Create(pod)
// Wait until the pod is not pending. (Here we need to check for something other than
// 'Pending', since when failures occur, we go to 'Terminated' which can cause indefinite blocking.)
framework.ExpectNoError(e2epod.WaitForPodNotPending(f.ClientSet, ns, pod.Name),
fmt.Sprintf("starting pod %s in namespace %s", pod.Name, ns))
framework.Logf("Started pod %s in namespace %s", pod.Name, ns)
// Wait for the not ready state to be true for notReadyUntil duration
deadline := time.Now().Add(notReadyUntil)
for start := time.Now(); time.Now().Before(deadline); time.Sleep(2 * time.Second) {
// poll for Not Ready
if podutil.IsPodReady(pod) {
framework.Failf("pod %s/%s - expected to be not ready", ns, pod.Name)
}
framework.Logf("pod %s/%s is not ready (%v elapsed)",
ns, pod.Name, time.Since(start))
}
}