node: sample-dp: Add retry to handle device plugin restart failure

Add a retry mechanism to handle cases where, after the kubelet restarts, the
device plugin unix socket(s) have been created but are not yet ready to serve.

Signed-off-by: Swati Sehgal <swsehgal@redhat.com>
This commit is contained in:
Swati Sehgal 2023-09-22 13:47:19 +01:00
parent d0d133298d
commit 9a354fc9d0

View File

@ -217,11 +217,22 @@ func (m *Stub) Watch(kubeletEndpoint, resourceName, pluginSockDir string) {
case event := <-m.kubeletRestartWatcher.Events: case event := <-m.kubeletRestartWatcher.Events:
if event.Name == kubeletEndpoint && event.Op&fsnotify.Create == fsnotify.Create { if event.Name == kubeletEndpoint && event.Op&fsnotify.Create == fsnotify.Create {
klog.InfoS("inotify: file created, restarting", "kubeletEndpoint", kubeletEndpoint) klog.InfoS("inotify: file created, restarting", "kubeletEndpoint", kubeletEndpoint)
if err := m.Restart(); err != nil { var lastErr error
klog.ErrorS(err, "Unable to restart server")
panic(err)
err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 2*time.Minute, false, func(context.Context) (done bool, err error) {
restartErr := m.Restart()
if restartErr == nil {
return true, nil
} }
klog.ErrorS(restartErr, "Retrying after error")
lastErr = restartErr
return false, nil
})
if err != nil {
klog.ErrorS(err, "Unable to restart server: wait timed out", "lastErr", lastErr.Error())
panic(err)
}
if ok := m.registerControlFunc(); ok { if ok := m.registerControlFunc(); ok {
if err := m.Register(kubeletEndpoint, resourceName, pluginSockDir); err != nil { if err := m.Register(kubeletEndpoint, resourceName, pluginSockDir); err != nil {
klog.ErrorS(err, "Unable to register to kubelet") klog.ErrorS(err, "Unable to register to kubelet")