From 7ac399c205c82667d2d948677ce5cb25f4ce33ab Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Wed, 21 Dec 2022 12:55:47 +0000 Subject: [PATCH] node: device-mgr: Handle recovery by checking if healthy devices exist In case of a node reboot/kubelet restart, the flow of events involves obtaining the state from the checkpoint file, followed by setting `healthyDevices`/`unhealthyDevices` to their zero values. This is done to allow the device plugin to re-register itself so that capacity can be updated appropriately. During the allocation phase, we need to check that the resources requested by the pod have been registered AND that healthy devices are present on the node to be allocated. We also need to move this check above the `needed == 0` check, where `needed` is `required` minus the devices already allocated to the container (which are obtained from the checkpoint file), because even in cases where no additional devices have to be allocated (as they were pre-allocated), we still need to make sure that the devices that were previously allocated are healthy. 
Signed-off-by: Swati Sehgal --- pkg/kubelet/cm/devicemanager/manager.go | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go index 8cb57aa8190..88670be639e 100644 --- a/pkg/kubelet/cm/devicemanager/manager.go +++ b/pkg/kubelet/cm/devicemanager/manager.go @@ -544,15 +544,24 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi return nil, fmt.Errorf("pod %q container %q changed request for resource %q from %d to %d", string(podUID), contName, resource, devices.Len(), required) } } + + klog.V(3).InfoS("Need devices to allocate for pod", "deviceNumber", needed, "resourceName", resource, "podUID", string(podUID), "containerName", contName) + healthyDevices, hasRegistered := m.healthyDevices[resource] + + // Check if resource registered with devicemanager + if !hasRegistered { + return nil, fmt.Errorf("can't allocate unregistered device %s", resource) + } + + // Check if registered resource has healthy devices + if healthyDevices.Len() == 0 { + return nil, fmt.Errorf("can't allocate unhealthy devices %s", resource) + } + if needed == 0 { // No change, no work. return nil, nil } - klog.V(3).InfoS("Need devices to allocate for pod", "deviceNumber", needed, "resourceName", resource, "podUID", string(podUID), "containerName", contName) - // Check if resource registered with devicemanager - if _, ok := m.healthyDevices[resource]; !ok { - return nil, fmt.Errorf("can't allocate unregistered device %s", resource) - } // Declare the list of allocated devices. // This will be populated and returned below.