diff --git a/pkg/kubelet/cm/dra/plugin/noderesources.go b/pkg/kubelet/cm/dra/plugin/noderesources.go
index 3da0aecfa85..741eb17c897 100644
--- a/pkg/kubelet/cm/dra/plugin/noderesources.go
+++ b/pkg/kubelet/cm/dra/plugin/noderesources.go
@@ -37,6 +37,7 @@ import (
 	resourceinformers "k8s.io/client-go/informers/resource/v1alpha2"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/tools/cache"
+	"k8s.io/client-go/util/flowcontrol"
 	"k8s.io/client-go/util/workqueue"
 	"k8s.io/klog/v2"
 	drapb "k8s.io/kubelet/pkg/apis/dra/v1alpha3"
@@ -46,7 +47,10 @@ import (
 const (
 	// resyncPeriod for informer
 	// TODO (https://github.com/kubernetes/kubernetes/issues/123688): disable?
-	resyncPeriod = time.Duration(10 * time.Minute)
+	resyncPeriod   = time.Duration(10 * time.Minute)
+	retryPeriod    = 5 * time.Second
+	maxRetryPeriod = 180 * time.Second
+	backoffFactor  = 2.0 // Introduce a backoff multiplier as jitter factor
 )
 
 // nodeResourcesController collects resource information from all registered
@@ -185,6 +189,9 @@ func (c *nodeResourcesController) monitorPlugin(ctx context.Context, active *act
 		logger.Info("Stopping to monitor node resources of the plugin", "reason", context.Cause(ctx), "err", ctx.Err(), "recover", r)
 	}()
 
+	backOff := flowcontrol.NewBackOffWithJitter(retryPeriod, maxRetryPeriod, backoffFactor)
+	backOffID := "retry"
+
 	// Keep trying until canceled.
 	for ctx.Err() == nil {
 		logger.V(5).Info("Calling NodeListAndWatchResources")
@@ -197,9 +204,9 @@ func (c *nodeResourcesController) monitorPlugin(ctx context.Context, active *act
 		default:
 			// This is a problem, report it and retry.
 			logger.Error(err, "Creating gRPC stream for node resources failed")
-			// TODO (https://github.com/kubernetes/kubernetes/issues/123689): expontential backoff?
 			select {
-			case <-time.After(5 * time.Second):
+			case <-time.After(backOff.Get(backOffID)):
+				backOff.Next(backOffID, time.Now())
 			case <-ctx.Done():
 			}
 		}
@@ -219,9 +226,9 @@ func (c *nodeResourcesController) monitorPlugin(ctx context.Context, active *act
 		case ctx.Err() == nil:
 			// This is a problem, report it and retry.
 			logger.Error(err, "Reading node resources from gRPC stream failed")
-			// TODO (https://github.com/kubernetes/kubernetes/issues/123689): expontential backoff?
 			select {
-			case <-time.After(5 * time.Second):
+			case <-time.After(backOff.Get(backOffID)):
+				backOff.Next(backOffID, time.Now())
 			case <-ctx.Done():
 			}
 		}
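
For context on the behavior this patch introduces, here is a minimal standalone sketch (not part of the patch) that exercises the same client-go flowcontrol.Backoff configuration and prints the successive retry delays. The package main wrapper and the print loop are illustrative assumptions; only the constants and the Get/Next calls mirror the patched code.

// Standalone sketch (not from the patch): prints the retry delays produced by
// the same flowcontrol backoff configuration that monitorPlugin now uses.
package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/flowcontrol"
)

const (
	retryPeriod    = 5 * time.Second
	maxRetryPeriod = 180 * time.Second
	backoffFactor  = 2.0
)

func main() {
	backOff := flowcontrol.NewBackOffWithJitter(retryPeriod, maxRetryPeriod, backoffFactor)
	backOffID := "retry"

	for attempt := 0; attempt < 8; attempt++ {
		// Get returns zero before the first Next, so the very first retry is
		// effectively immediate; after that the delay starts at retryPeriod
		// and roughly doubles per failure (plus random jitter of up to
		// backoffFactor times the previous delay), capped at maxRetryPeriod.
		delay := backOff.Get(backOffID)
		fmt.Printf("attempt %d: sleep %v before retrying\n", attempt, delay)
		backOff.Next(backOffID, time.Now())
	}
}

Note that monitorPlugin consults Get before calling Next, so the first failure retries without waiting and only subsequent failures back off; both error paths share the "retry" ID, so stream-creation and stream-read failures feed the same backoff state.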