node: device-mgr: explicitly check if pre-allocated devices are healthy

Signed-off-by: Swati Sehgal <swsehgal@redhat.com>
This commit is contained in:
Swati Sehgal 2023-02-03 10:55:34 +00:00
parent a799ffb571
commit 5b2a3dbbdc
2 changed files with 24 additions and 0 deletions

View File

@ -558,6 +558,11 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
return nil, fmt.Errorf("can't allocate unhealthy devices %s", resource) return nil, fmt.Errorf("can't allocate unhealthy devices %s", resource)
} }
// Check if all the previously allocated devices are healthy
if !healthyDevices.IsSuperset(devices) {
return nil, fmt.Errorf("previously allocated devices are no longer healthy; can't allocate unhealthy devices %s", resource)
}
if needed == 0 { if needed == 0 {
// No change, no work. // No change, no work.
return nil, nil return nil, nil

View File

@ -988,6 +988,7 @@ func TestPodContainerDeviceAllocation(t *testing.T) {
func TestPodContainerDeviceToAllocate(t *testing.T) { func TestPodContainerDeviceToAllocate(t *testing.T) {
resourceName1 := "domain1.com/resource1" resourceName1 := "domain1.com/resource1"
resourceName2 := "domain2.com/resource2" resourceName2 := "domain2.com/resource2"
resourceName3 := "domain2.com/resource3"
as := require.New(t) as := require.New(t)
tmpDir, err := os.MkdirTemp("", "checkpoint") tmpDir, err := os.MkdirTemp("", "checkpoint")
as.Nil(err) as.Nil(err)
@ -1010,11 +1011,19 @@ func TestPodContainerDeviceToAllocate(t *testing.T) {
checkpoint.DevicesPerNUMA{nodeWithoutTopology: []string{"dev5"}}, checkpoint.DevicesPerNUMA{nodeWithoutTopology: []string{"dev5"}},
constructAllocResp(map[string]string{"/dev/r1dev5": "/dev/r1dev5"}, constructAllocResp(map[string]string{"/dev/r1dev5": "/dev/r1dev5"},
map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{})) map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))
testManager.podDevices.insert("pod3", "con3", resourceName3,
checkpoint.DevicesPerNUMA{nodeWithoutTopology: []string{"dev5"}},
constructAllocResp(map[string]string{"/dev/r1dev5": "/dev/r1dev5"},
map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))
// no healthy devices for resourceName1 and devices corresponding to // no healthy devices for resourceName1 and devices corresponding to
// resource2 are intentionally omitted to simulate that the resource // resource2 are intentionally omitted to simulate that the resource
// hasn't been registered. // hasn't been registered.
testManager.healthyDevices[resourceName1] = sets.NewString() testManager.healthyDevices[resourceName1] = sets.NewString()
testManager.healthyDevices[resourceName3] = sets.NewString()
// dev5 is no longer in the list of healthy devices
testManager.healthyDevices[resourceName3].Insert("dev7")
testManager.healthyDevices[resourceName3].Insert("dev8")
testCases := []struct { testCases := []struct {
description string description string
@ -1046,6 +1055,16 @@ func TestPodContainerDeviceToAllocate(t *testing.T) {
expectedAllocatedDevices: nil, expectedAllocatedDevices: nil,
expErr: fmt.Errorf("can't allocate unregistered device %s", resourceName2), expErr: fmt.Errorf("can't allocate unregistered device %s", resourceName2),
}, },
{
description: "Admission error in case resource not devices previously allocated no longer healthy",
podUID: "pod3",
contName: "con3",
resource: resourceName3,
required: 1,
reusableDevices: sets.NewString(),
expectedAllocatedDevices: nil,
expErr: fmt.Errorf("previously allocated devices are no longer healthy; can't allocate unhealthy devices %s", resourceName3),
},
} }
for _, testCase := range testCases { for _, testCase := range testCases {