mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-21 10:51:29 +00:00
node: device-mgr: Handle recovery by checking if healthy devices exist
In case of a node reboot or kubelet restart, the flow of events involves obtaining the state from the checkpoint file, followed by setting `healthyDevices`/`unhealthyDevices` to their zero values. This is done to allow the device plugin to re-register itself so that capacity can be updated appropriately. During the allocation phase, we need to check that the resources requested by the pod have been registered AND that healthy devices are present on the node to be allocated. We also need to move this check above the `needed == 0` check, where `needed` is `required` minus the number of devices already allocated to the container (which is obtained from the checkpoint file), because even in cases where no additional devices have to be allocated (as they were pre-allocated), we still need to make sure that the devices that were previously allocated are healthy. Signed-off-by: Swati Sehgal <swsehgal@redhat.com>
This commit is contained in:
parent
b6acf6f805
commit
7ac399c205
@ -544,15 +544,24 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
|
||||
return nil, fmt.Errorf("pod %q container %q changed request for resource %q from %d to %d", string(podUID), contName, resource, devices.Len(), required)
|
||||
}
|
||||
}
|
||||
|
||||
klog.V(3).InfoS("Need devices to allocate for pod", "deviceNumber", needed, "resourceName", resource, "podUID", string(podUID), "containerName", contName)
|
||||
healthyDevices, hasRegistered := m.healthyDevices[resource]
|
||||
|
||||
// Check if resource registered with devicemanager
|
||||
if !hasRegistered {
|
||||
return nil, fmt.Errorf("can't allocate unregistered device %s", resource)
|
||||
}
|
||||
|
||||
// Check if registered resource has healthy devices
|
||||
if healthyDevices.Len() == 0 {
|
||||
return nil, fmt.Errorf("can't allocate unhealthy devices %s", resource)
|
||||
}
|
||||
|
||||
if needed == 0 {
|
||||
// No change, no work.
|
||||
return nil, nil
|
||||
}
|
||||
klog.V(3).InfoS("Need devices to allocate for pod", "deviceNumber", needed, "resourceName", resource, "podUID", string(podUID), "containerName", contName)
|
||||
// Check if resource registered with devicemanager
|
||||
if _, ok := m.healthyDevices[resource]; !ok {
|
||||
return nil, fmt.Errorf("can't allocate unregistered device %s", resource)
|
||||
}
|
||||
|
||||
// Declare the list of allocated devices.
|
||||
// This will be populated and returned below.
|
||||
|
Loading…
Reference in New Issue
Block a user