From 2c8fc26b89bee89f28c07608255dca163308123b Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Fri, 23 Dec 2022 09:49:59 +0000 Subject: [PATCH 1/2] node: device-mgr: sample device plugin: control registration process Update the sample device plugin to enable the e2e node tests (or any other entity with full access to the node filesystem) to control the registration process. We add a new environment variable `REGISTER_CONTROL_FILE`. The value of this variable must be the path of a file which prevents the plugin from registering itself while it is present. Once the file is removed, the plugin will go on and complete the registration. The plugin will automatically detect the parent directory in which the file resides and watch it for deletions, unblocking the registration process. If the file is specified but inaccessible, the plugin will fail. If the file is not specified, the registration process will progress as usual and never pause. The plugin will need read access to the parent directory. This feature is useful because it is not possible to control the order in which the pods are recovered after node reboot/kubelet restart. In this approach, the testing environment will create a directory and then an empty file to pause the registration process of the plugin. Once pointed to that file, the plugin will start and wait for it to be deleted. Only after the file has been deleted will the plugin proceed to registration. This feature is used in #114640 where an e2e test is implemented to simulate scenarios where application pods requesting devices come up before the device plugin pod on node reboot/kubelet restart. 
Co-authored-by: Francesco Romani Signed-off-by: Swati Sehgal --- test/images/sample-device-plugin/VERSION | 2 +- .../sampledeviceplugin.go | 64 +++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/test/images/sample-device-plugin/VERSION b/test/images/sample-device-plugin/VERSION index c068b2447cc..c239c60cba2 100644 --- a/test/images/sample-device-plugin/VERSION +++ b/test/images/sample-device-plugin/VERSION @@ -1 +1 @@ -1.4 +1.5 diff --git a/test/images/sample-device-plugin/sampledeviceplugin.go b/test/images/sample-device-plugin/sampledeviceplugin.go index 824b19788cf..fd372133bc8 100644 --- a/test/images/sample-device-plugin/sampledeviceplugin.go +++ b/test/images/sample-device-plugin/sampledeviceplugin.go @@ -22,6 +22,7 @@ import ( "path/filepath" "time" + "github.com/fsnotify/fsnotify" "k8s.io/klog/v2" pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" plugin "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/plugin/v1beta1" @@ -84,6 +85,7 @@ func main() { klog.Errorf("Empty pluginSocksDir") return } + socketPath := pluginSocksDir + "/dp." 
+ fmt.Sprintf("%d", time.Now().Unix()) dp1 := plugin.NewDevicePluginStub(devs, socketPath, resourceName, false, false) @@ -92,8 +94,70 @@ func main() { } dp1.SetAllocFunc(stubAllocFunc) + + if registerControlFile := os.Getenv("REGISTER_CONTROL_FILE"); registerControlFile != "" { + if err := handleRegistrationProcess(registerControlFile); err != nil { + panic(err) + } + } + if err := dp1.Register(pluginapi.KubeletSocket, resourceName, pluginapi.DevicePluginPath); err != nil { panic(err) } select {} } + +func handleRegistrationProcess(registerControlFile string) error { + triggerPath := filepath.Dir(registerControlFile) + + klog.InfoS("Registration process will be managed explicitly", "triggerPath", triggerPath, "triggerEntry", registerControlFile) + + watcher, err := fsnotify.NewWatcher() + if err != nil { + klog.Errorf("Watcher creation failed: %v ", err) + return err + } + + defer watcher.Close() + updateCh := make(chan bool) + defer close(updateCh) + + go func() { + klog.Infof("Starting watching routine") + for { + select { + case event, ok := <-watcher.Events: + if !ok { + return + } + klog.InfoS("Received event", "name", event.Name, "operation", event.Op) + switch { + case event.Op&fsnotify.Remove == fsnotify.Remove: + if event.Name == registerControlFile { + klog.InfoS("Expected delete", "name", event.Name, "operation", event.Op) + updateCh <- true + return + } + klog.InfoS("Spurious delete", "name", event.Name, "operation", event.Op) + } + case err, ok := <-watcher.Errors: + if !ok { + return + } + klog.Errorf("error: %w", err) + panic(err) + } + } + }() + + err = watcher.Add(triggerPath) + if err != nil { + klog.Errorf("Failed to add watch to %q: %w", triggerPath, err) + return err + } + + klog.InfoS("Waiting for control file to be deleted", "path", registerControlFile) + <-updateCh + klog.InfoS("Control file was deleted, connecting!") + return nil +} From 7ea35d0cd88d22fffda4b7bdf3b1dcc4627d9c92 Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Fri, 23 Dec 
2022 18:50:42 +0000 Subject: [PATCH 2/2] node: device-mgr: sample device plugin: manifest to avoid registration Signed-off-by: Swati Sehgal --- test/e2e/testing-manifests/embed.go | 2 +- ...le-device-plugin-control-registration.yaml | 52 +++++++++++++++++++ .../sample-device-plugin.yaml | 0 test/e2e_node/util_sampledevice.go | 3 +- 4 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 test/e2e/testing-manifests/sample-device-plugin/sample-device-plugin-control-registration.yaml rename test/e2e/testing-manifests/{ => sample-device-plugin}/sample-device-plugin.yaml (100%) diff --git a/test/e2e/testing-manifests/embed.go b/test/e2e/testing-manifests/embed.go index a763b2fc87c..bc0e9070fc9 100644 --- a/test/e2e/testing-manifests/embed.go +++ b/test/e2e/testing-manifests/embed.go @@ -22,7 +22,7 @@ import ( e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles" ) -//go:embed cluster-dns flexvolume guestbook kubectl sample-device-plugin.yaml scheduling/nvidia-driver-installer.yaml statefulset storage-csi +//go:embed cluster-dns flexvolume guestbook kubectl sample-device-plugin scheduling/nvidia-driver-installer.yaml statefulset storage-csi var e2eTestingManifestsFS embed.FS func GetE2ETestingManifestsFS() e2etestfiles.EmbeddedFileSource { diff --git a/test/e2e/testing-manifests/sample-device-plugin/sample-device-plugin-control-registration.yaml b/test/e2e/testing-manifests/sample-device-plugin/sample-device-plugin-control-registration.yaml new file mode 100644 index 00000000000..54cf07c46e7 --- /dev/null +++ b/test/e2e/testing-manifests/sample-device-plugin/sample-device-plugin-control-registration.yaml @@ -0,0 +1,52 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: sample-device-plugin-beta + namespace: kube-system + labels: + k8s-app: sample-device-plugin +spec: + selector: + matchLabels: + k8s-app: sample-device-plugin + template: + metadata: + labels: + k8s-app: sample-device-plugin + annotations: + spec: + priorityClassName: 
system-node-critical + tolerations: + - operator: "Exists" + effect: "NoExecute" + - operator: "Exists" + effect: "NoSchedule" + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins + - name: plugins-registry-probe-mode + hostPath: + path: /var/lib/kubelet/plugins_registry + - name: dev + hostPath: + path: /dev + containers: + - image: registry.k8s.io/e2e-test-images/sample-device-plugin:1.5 + name: sample-device-plugin + env: + - name: PLUGIN_SOCK_DIR + value: "/var/lib/kubelet/device-plugins" + - name: REGISTER_CONTROL_FILE + value: "/var/lib/kubelet/device-plugins/sample/registration" + securityContext: + privileged: true + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: plugins-registry-probe-mode + mountPath: /var/lib/kubelet/plugins_registry + - name: dev + mountPath: /dev + updateStrategy: + type: RollingUpdate diff --git a/test/e2e/testing-manifests/sample-device-plugin.yaml b/test/e2e/testing-manifests/sample-device-plugin/sample-device-plugin.yaml similarity index 100% rename from test/e2e/testing-manifests/sample-device-plugin.yaml rename to test/e2e/testing-manifests/sample-device-plugin/sample-device-plugin.yaml diff --git a/test/e2e_node/util_sampledevice.go b/test/e2e_node/util_sampledevice.go index a1e22ee761e..eca7418164c 100644 --- a/test/e2e_node/util_sampledevice.go +++ b/test/e2e_node/util_sampledevice.go @@ -22,7 +22,8 @@ import ( const ( // SampleDevicePluginDSYAML is the path of the daemonset template of the sample device plugin. // TODO: Parametrize it by making it a feature in TestFramework. 
- SampleDevicePluginDSYAML = "test/e2e/testing-manifests/sample-device-plugin.yaml" + SampleDevicePluginDSYAML = "test/e2e/testing-manifests/sample-device-plugin.yaml" + SampleDevicePluginControlRegistrationDSYAML = "test/e2e/testing-manifests/sample-device-plugin/sample-device-plugin-control-registration.yaml" // SampleDevicePluginName is the name of the device plugin pod SampleDevicePluginName = "sample-device-plugin"