node: device-mgr: sample device plugin: control registration process

Update the sample device plugin to enable the e2e node tests (or any
other entity with full access to the node filesystem) to control the
registration process. We add a new environment variable `REGISTER_CONTROL_FILE`.
The value of this variable must be a file which prevents the plugin
from registering itself while the file is present. Once the file is
removed, the plugin will go on and complete the registration. The plugin
will automatically detect the parent directory in which the file resides
and watch it for deletions, unblocking the registration process. If the
file is specified but inaccessible, the plugin will fail. If the file is
not specified, the registration process will progress as usual and never
pause.
The plugin will need read access to the parent directory.

This feature is useful because it is not possible to control the order
in which the pods are recovered after node reboot/kubelet restart.

In this approach, the testing environment will create a directory and
then an empty file to pause the registration process of the plugin.
Once pointed to that file, the plugin will start and wait for it to
be deleted. Only after the file has been deleted will the plugin
proceed to registration.

This feature is used in #114640, where an e2e test is implemented to
simulate scenarios where application pods requesting devices come up before
the device plugin pod on node reboot/kubelet restart.

Co-authored-by: Francesco Romani <fromani@redhat.com>
Signed-off-by: Swati Sehgal <swsehgal@redhat.com>
This commit is contained in:
Swati Sehgal 2022-12-23 09:49:59 +00:00
parent 015e2fa20c
commit 2c8fc26b89
2 changed files with 65 additions and 1 deletions

View File

@ -1 +1 @@
1.4
1.5

View File

@ -22,6 +22,7 @@ import (
"path/filepath"
"time"
"github.com/fsnotify/fsnotify"
"k8s.io/klog/v2"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
plugin "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/plugin/v1beta1"
@ -84,6 +85,7 @@ func main() {
klog.Errorf("Empty pluginSocksDir")
return
}
socketPath := pluginSocksDir + "/dp." + fmt.Sprintf("%d", time.Now().Unix())
dp1 := plugin.NewDevicePluginStub(devs, socketPath, resourceName, false, false)
@ -92,8 +94,70 @@ func main() {
}
dp1.SetAllocFunc(stubAllocFunc)
if registerControlFile := os.Getenv("REGISTER_CONTROL_FILE"); registerControlFile != "" {
if err := handleRegistrationProcess(registerControlFile); err != nil {
panic(err)
}
}
if err := dp1.Register(pluginapi.KubeletSocket, resourceName, pluginapi.DevicePluginPath); err != nil {
panic(err)
}
select {}
}
// handleRegistrationProcess blocks the caller until the file named by
// registerControlFile is deleted, letting an external actor decide when the
// plugin goes on to register itself.
//
// It watches the parent directory of registerControlFile with fsnotify and
// returns nil as soon as a Remove event for the control file is observed.
// If the control file is already absent once the watch is in place, it
// returns nil immediately instead of waiting for an event that can no longer
// arrive.
//
// An error is returned when the watcher cannot be created, when the parent
// directory cannot be watched, when the control file cannot be inspected, or
// when the watcher reports an error while waiting.
func handleRegistrationProcess(registerControlFile string) error {
	triggerPath := filepath.Dir(registerControlFile)
	// Event names are compared in cleaned form so that an un-normalized
	// control path (for example "/tmp//ctl" or "./ctl") still matches.
	controlPath := filepath.Clean(registerControlFile)

	klog.InfoS("Registration process will be managed explicitly", "triggerPath", triggerPath, "triggerEntry", registerControlFile)

	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		klog.Errorf("Watcher creation failed: %v ", err)
		return err
	}
	defer watcher.Close()

	// Buffered with capacity one: the watching goroutine delivers exactly one
	// result and must be able to do so without blocking, even if this
	// function has already returned through another path. The channel is
	// never closed, so a late send cannot panic.
	resultCh := make(chan error, 1)

	go func() {
		klog.Infof("Starting watching routine")
		for {
			select {
			case event, ok := <-watcher.Events:
				if !ok {
					// Watcher was closed; nothing more to report.
					return
				}
				klog.InfoS("Received event", "name", event.Name, "operation", event.Op)
				if event.Op&fsnotify.Remove != fsnotify.Remove {
					continue
				}
				if filepath.Clean(event.Name) == controlPath {
					klog.InfoS("Expected delete", "name", event.Name, "operation", event.Op)
					resultCh <- nil
					return
				}
				klog.InfoS("Spurious delete", "name", event.Name, "operation", event.Op)
			case err, ok := <-watcher.Errors:
				if !ok {
					return
				}
				// Report the failure to the waiting caller instead of
				// panicking from a background goroutine.
				klog.Errorf("error: %v", err)
				resultCh <- err
				return
			}
		}
	}()

	if err := watcher.Add(triggerPath); err != nil {
		klog.Errorf("Failed to add watch to %q: %v", triggerPath, err)
		return err
	}

	// The watch is active from here on, so any later deletion produces an
	// event. A file that is already missing at this point never will, so
	// treat it as "already deleted" rather than blocking forever.
	if _, err := os.Stat(registerControlFile); err != nil {
		if os.IsNotExist(err) {
			klog.InfoS("Control file is already absent, connecting!", "path", registerControlFile)
			return nil
		}
		klog.Errorf("Failed to stat %q: %v", registerControlFile, err)
		return err
	}

	klog.InfoS("Waiting for control file to be deleted", "path", registerControlFile)
	if err := <-resultCh; err != nil {
		return err
	}
	klog.InfoS("Control file was deleted, connecting!")
	return nil
}