From 9a354fc9d03f800d6014543841fc7cb2762aab22 Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Fri, 22 Sep 2023 13:47:19 +0100 Subject: [PATCH] node: sample-dp: Add retry to handle device plugin restart failure Add a retry mechanism to handle cases where, after the kubelet restarts, the device plugin unix socket(s) have been created but are not yet ready to serve. Signed-off-by: Swati Sehgal --- .../cm/devicemanager/plugin/v1beta1/stub.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/stub.go b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/stub.go index d590e99fa2b..fbec3456e46 100644 --- a/pkg/kubelet/cm/devicemanager/plugin/v1beta1/stub.go +++ b/pkg/kubelet/cm/devicemanager/plugin/v1beta1/stub.go @@ -217,11 +217,22 @@ func (m *Stub) Watch(kubeletEndpoint, resourceName, pluginSockDir string) { case event := <-m.kubeletRestartWatcher.Events: if event.Name == kubeletEndpoint && event.Op&fsnotify.Create == fsnotify.Create { klog.InfoS("inotify: file created, restarting", "kubeletEndpoint", kubeletEndpoint) - if err := m.Restart(); err != nil { - klog.ErrorS(err, "Unable to restart server") - panic(err) + var lastErr error + err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 2*time.Minute, false, func(context.Context) (done bool, err error) { + restartErr := m.Restart() + if restartErr == nil { + return true, nil + } + klog.ErrorS(restartErr, "Retrying after error") + lastErr = restartErr + return false, nil + }) + if err != nil { + klog.ErrorS(err, "Unable to restart server: wait timed out", "lastErr", lastErr.Error()) + panic(err) } + if ok := m.registerControlFunc(); ok { if err := m.Register(kubeletEndpoint, resourceName, pluginSockDir); err != nil { klog.ErrorS(err, "Unable to register to kubelet")