Merge pull request #45110 from smarterclayton/offset_timeouts

Automatic merge from submit-queue (batch tested with PRs 45110, 45148). Make timeouts in the Kubelet slightly offset to aid debugging. Several of these loops overlap, and when they are the reason a failure is happening it is difficult to sort them out. Slightly misalign these loops to make their impact obvious. We are seeing exactly 2 minute pod worker timeouts in a wide range of test flake scenarios, and I want to be confident we know exactly which one is the culprit.
2025-09-16 22:53:22 +00:00 · 2017-05-01 05:42:14 -07:00
parent d3f6209523 49209b3394
commit 6480bc70b0
5 changed files with 15 additions and 5 deletions
--- a/pkg/kubelet/dockertools/kube_docker_client.go
+++ b/pkg/kubelet/dockertools/kube_docker_client.go
@@ -70,7 +70,9 @@ var _ DockerInterface = &kubeDockerClient{}
 // kubeDockerClient only applies timeout on non-long running operations.
 const (
 	// defaultTimeout is the default timeout of short running docker operations.
-	defaultTimeout = 2 * time.Minute
+	// Value is slightly offset from 2 minutes to make timeouts due to this
+	// constant recognizable.
+	defaultTimeout = 2*time.Minute - 1*time.Second

 	// defaultShmSize is the default ShmSize to use (in bytes) if not specified.
 	defaultShmSize = int64(1024 * 1024 * 64)
--- a/pkg/kubelet/rkt/rkt.go
+++ b/pkg/kubelet/rkt/rkt.go
@@ -135,7 +135,9 @@ const (
 	defaultNetworkName = "rkt.kubernetes.io"

 	// defaultRequestTimeout is the default timeout of rkt requests.
-	defaultRequestTimeout = 2 * time.Minute
+	// Value is slightly offset from 2 minutes to make timeouts due to this
+	// constant recognizable.
+	defaultRequestTimeout = 2*time.Minute - 1*time.Second

 	etcHostsPath      = "/etc/hosts"
 	etcResolvConfPath = "/etc/resolv.conf"
--- a/pkg/kubelet/volumemanager/volume_manager.go
+++ b/pkg/kubelet/volumemanager/volume_manager.go
@@ -71,7 +71,9 @@ const (
 	// will retry in the next sync iteration. This frees the associated
 	// goroutine of the pod to process newer updates if needed (e.g., a delete
 	// request to the pod).
-	podAttachAndMountTimeout time.Duration = 2 * time.Minute
+	// Value is slightly offset from 2 minutes to make timeouts due to this
+	// constant recognizable.
+	podAttachAndMountTimeout time.Duration = 2*time.Minute + 3*time.Second

 	// podAttachAndMountRetryInterval is the amount of time the GetVolumesForPod
 	// call waits before retrying
--- a/pkg/util/goroutinemap/exponentialbackoff/exponential_backoff.go
+++ b/pkg/util/goroutinemap/exponentialbackoff/exponential_backoff.go
@@ -32,7 +32,9 @@ const (

 	// maxDurationBeforeRetry is the maximum amount of time that
 	// durationBeforeRetry will grow to due to exponential backoff.
-	maxDurationBeforeRetry time.Duration = 2 * time.Minute
+	// Value is slightly offset from 2 minutes to make retry delays due to
+	// this constant recognizable.
+	maxDurationBeforeRetry time.Duration = 2*time.Minute + 2*time.Second
 )

 // ExponentialBackoff contains the last occurrence of an error and the duration
--- a/pkg/util/goroutinemap/goroutinemap.go
+++ b/pkg/util/goroutinemap/goroutinemap.go
@@ -40,7 +40,9 @@ const (

 	// maxDurationBeforeRetry is the maximum amount of time that
 	// durationBeforeRetry will grow to due to exponential backoff.
-	maxDurationBeforeRetry = 2 * time.Minute
+	// Value is slightly offset from 2 minutes to make retry delays due to
+	// this constant recognizable.
+	maxDurationBeforeRetry = 2*time.Minute + 1*time.Second
 )

 // GoRoutineMap defines a type that can run named goroutines and track their