kubernetes/test/images/sample-device-plugin/sampledeviceplugin.go
Swati Sehgal 2c8fc26b89 node: device-mgr: sample device plugin: control registration process
Update the sample device plugin to enable the e2e node tests (or any
other entity with full access to the node filesystem) to control the
registration process. We add a new environment variable `REGISTER_CONTROL_FILE`.
The value of this variable must be a file which prevents the plugin
from registering itself while it is present. Once removed, the plugin will
go on and complete the registration. The plugin will automatically
detect the parent directory on which the file resides and detect
deletions, unblocking the registration process. If the file is specified
but inaccessible, the plugin will fail. If the file is not specified,
the registration process will progress as usual and never pause.
The plugin will need read access to the parent directory.

This feature is useful because it is not possible to control the order
in which the pods are recovered after node reboot/kubelet restart.

In this approach, the testing environment will create a directory and
then an empty file to pause the registration process of the plugin.
Once pointed to that file, the plugin will start and wait for it to
be deleted. Only after the file has been deleted
will the plugin proceed to registration.

This feature is used in #114640, where an e2e test is implemented to
simulate scenarios where application pods requesting devices come up before
the device plugin pod on node reboot/kubelet restart.

Co-authored-by: Francesco Romani <fromani@redhat.com>
Signed-off-by: Swati Sehgal <swsehgal@redhat.com>
2023-03-01 10:00:52 +00:00

164 lines
4.4 KiB
Go

/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"fmt"
"os"
"path/filepath"
"time"
"github.com/fsnotify/fsnotify"
"k8s.io/klog/v2"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
plugin "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/plugin/v1beta1"
)
// resourceName is the extended resource name this sample plugin serves; it is
// handed to both the device plugin stub and the kubelet registration call.
const resourceName = "example.com/resource"
// stubAllocFunc creates and returns the allocation response for the input
// allocate request.
//
// For every device ID of every container request it checks that the device is
// present in devs and reported as healthy, recreates an empty placeholder file
// named after the device under /tmp, and adds a mount exposing that file at the
// same path inside the container.
//
// It returns an error as soon as a requested device is unknown or unhealthy, or
// when the placeholder file cannot be cleaned up, created or closed.
func stubAllocFunc(r *pluginapi.AllocateRequest, devs map[string]pluginapi.Device) (*pluginapi.AllocateResponse, error) {
	var responses pluginapi.AllocateResponse
	for _, req := range r.ContainerRequests {
		response := &pluginapi.ContainerAllocateResponse{}
		for _, requestID := range req.DevicesIDs {
			dev, ok := devs[requestID]
			if !ok {
				return nil, fmt.Errorf("invalid allocation request with non-existing device %s", requestID)
			}

			if dev.Health != pluginapi.Healthy {
				return nil, fmt.Errorf("invalid allocation request with unhealthy device: %s", requestID)
			}

			// create fake device file
			fpath := filepath.Join("/tmp", dev.ID)

			// clean first
			if err := os.RemoveAll(fpath); err != nil {
				return nil, fmt.Errorf("failed to clean fake device file from previous run: %w", err)
			}

			// os.Create truncates an existing file rather than failing, so any
			// error here means there is no usable file handle.
			f, err := os.Create(fpath)
			if err != nil {
				return nil, fmt.Errorf("failed to create fake device file: %w", err)
			}
			if err := f.Close(); err != nil {
				return nil, fmt.Errorf("failed to close fake device file: %w", err)
			}

			response.Mounts = append(response.Mounts, &pluginapi.Mount{
				ContainerPath: fpath,
				HostPath:      fpath,
			})
		}
		responses.ContainerResponses = append(responses.ContainerResponses, response)
	}

	return &responses, nil
}
// main runs the sample device plugin.
//
// It reads the socket directory from the PLUGIN_SOCK_DIR environment variable
// (logging an error and returning when it is empty), starts a device plugin
// stub advertising two healthy devices on a timestamp-suffixed socket, and
// installs stubAllocFunc as its allocation handler. When REGISTER_CONTROL_FILE
// is set, registration is delayed until handleRegistrationProcess returns.
// Finally the plugin registers with the kubelet and blocks forever. Any
// failure to start, wait or register panics.
func main() {
	devs := []*pluginapi.Device{
		{ID: "Dev-1", Health: pluginapi.Healthy},
		{ID: "Dev-2", Health: pluginapi.Healthy},
	}

	pluginSocksDir := os.Getenv("PLUGIN_SOCK_DIR")
	klog.Infof("pluginSocksDir: %s", pluginSocksDir)
	if pluginSocksDir == "" {
		klog.Errorf("Empty pluginSocksDir")
		return
	}

	// The Unix timestamp keeps the socket name distinct across restarts.
	socketPath := filepath.Join(pluginSocksDir, fmt.Sprintf("dp.%d", time.Now().Unix()))

	dp1 := plugin.NewDevicePluginStub(devs, socketPath, resourceName, false, false)
	if err := dp1.Start(); err != nil {
		panic(err)
	}
	dp1.SetAllocFunc(stubAllocFunc)

	// Optionally hold back registration until the control file is removed.
	if registerControlFile := os.Getenv("REGISTER_CONTROL_FILE"); registerControlFile != "" {
		if err := handleRegistrationProcess(registerControlFile); err != nil {
			panic(err)
		}
	}

	if err := dp1.Register(pluginapi.KubeletSocket, resourceName, pluginapi.DevicePluginPath); err != nil {
		panic(err)
	}

	// Keep the process alive so the plugin keeps serving.
	select {}
}
// handleRegistrationProcess blocks until the registration control file is
// deleted.
//
// It watches the parent directory of registerControlFile with fsnotify and
// returns nil once a Remove event for the control file is observed; Remove
// events for other entries in that directory are logged and ignored.
//
// It returns an error when the watcher cannot be created, the directory cannot
// be watched, the watcher reports an error, or the watcher's channels are
// closed before the deletion is seen.
//
// NOTE(review): the presence of the control file is not verified up front; if
// it is already gone when the watch is established this waits indefinitely —
// confirm callers always create the file before starting the plugin.
func handleRegistrationProcess(registerControlFile string) error {
	triggerPath := filepath.Dir(registerControlFile)
	// Event names are compared in cleaned form so that a control file path
	// spelled with redundant separators, "." or ".." elements still matches.
	controlFile := filepath.Clean(registerControlFile)

	klog.InfoS("Registration process will be managed explicitly", "triggerPath", triggerPath, "triggerEntry", registerControlFile)

	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		klog.Errorf("Watcher creation failed: %v ", err)
		return err
	}
	defer watcher.Close()

	if err := watcher.Add(triggerPath); err != nil {
		klog.Errorf("Failed to add watch to %q: %v", triggerPath, err)
		return err
	}

	klog.Infof("Starting watching routine")
	klog.InfoS("Waiting for control file to be deleted", "path", registerControlFile)

	// The loop runs in the calling goroutine: there is nothing else to do
	// until the file disappears, and errors can simply be returned.
	for {
		select {
		case event, ok := <-watcher.Events:
			if !ok {
				return fmt.Errorf("watcher closed before control file %q was deleted", registerControlFile)
			}
			klog.InfoS("Received event", "name", event.Name, "operation", event.Op)
			if event.Op&fsnotify.Remove != fsnotify.Remove {
				continue
			}
			if filepath.Clean(event.Name) != controlFile {
				klog.InfoS("Spurious delete", "name", event.Name, "operation", event.Op)
				continue
			}
			klog.InfoS("Expected delete", "name", event.Name, "operation", event.Op)
			klog.InfoS("Control file was deleted, connecting!")
			return nil
		case err, ok := <-watcher.Errors:
			if !ok {
				return fmt.Errorf("watcher closed before control file %q was deleted", registerControlFile)
			}
			klog.Errorf("error: %v", err)
			return err
		}
	}
}