clh: Rely on Cloud Hypervisor for generating the device ID

We're currently hitting a race condition on the Cloud Hypervisor's
driver code when quickly removing and adding a block device.

This happens because the device removal is an asynchronous operation,
and we currently do *not* monitor events coming from Cloud Hypervisor to
know when the device was actually removed.  Together with this, the
sandbox code doesn't know about that and when a new device is attached
it'll quickly assign what may be the very same ID to the new device,
leading to the Cloud Hypervisor's driver trying to hotplug a device with
the very same ID of the device that was not yet removed.

This is, in a nutshell, why the tests with Cloud Hypervisor and
devmapper have been failing every now and then.

The workaround taken to solve the issue is basically *not* passing down
the device ID to Cloud Hypervisor and simply letting Cloud Hypervisor
itself generate those, as Cloud Hypervisor does it in a manner that
avoids such conflicts.  With this addition we have then to keep a map of
the device ID and the Cloud Hypervisor's generated ID, so we can
properly remove the device.

This workaround will probably stay for a while, at least till someone
has enough cycles to implement a way to watch the device removal event
and then properly act on that.  Spoiler alert, this will be a complex
change that may not even be worth it considering the race can be avoided
with this commit.

Fixes: #4196

Signed-off-by: Fabiano Fidêncio <fabiano.fidencio@intel.com>
(cherry picked from commit 33a8b70558)
Signed-off-by: Fabiano Fidêncio <fabiano.fidencio@intel.com>
This commit is contained in:
Fabiano Fidêncio 2022-05-03 16:01:28 +02:00
parent d4dccb4900
commit 03bc89ab0b
2 changed files with 9 additions and 3 deletions

View File

@ -165,6 +165,7 @@ type cloudHypervisor struct {
APIClient clhClient APIClient clhClient
ctx context.Context ctx context.Context
id string id string
devicesIds map[string]string
vmconfig chclient.VmConfig vmconfig chclient.VmConfig
state CloudHypervisorState state CloudHypervisorState
config HypervisorConfig config HypervisorConfig
@ -360,6 +361,7 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net
clh.id = id clh.id = id
clh.state.state = clhNotReady clh.state.state = clhNotReady
clh.devicesIds = make(map[string]string)
clh.Logger().WithField("function", "CreateVM").Info("creating Sandbox") clh.Logger().WithField("function", "CreateVM").Info("creating Sandbox")
@ -667,7 +669,6 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro
clhDisk := *chclient.NewDiskConfig(drive.File) clhDisk := *chclient.NewDiskConfig(drive.File)
clhDisk.Readonly = &drive.ReadOnly clhDisk.Readonly = &drive.ReadOnly
clhDisk.VhostUser = func(b bool) *bool { return &b }(false) clhDisk.VhostUser = func(b bool) *bool { return &b }(false)
clhDisk.Id = &driveID
pciInfo, _, err := cl.VmAddDiskPut(ctx, clhDisk) pciInfo, _, err := cl.VmAddDiskPut(ctx, clhDisk)
@ -675,6 +676,7 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro
return fmt.Errorf("failed to hotplug block device %+v %s", drive, openAPIClientError(err)) return fmt.Errorf("failed to hotplug block device %+v %s", drive, openAPIClientError(err))
} }
clh.devicesIds[driveID] = pciInfo.GetId()
drive.PCIPath, err = clhPciInfoToPath(pciInfo) drive.PCIPath, err = clhPciInfoToPath(pciInfo)
return err return err
@ -688,11 +690,11 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
// Create the clh device config via the constructor to ensure default values are properly assigned // Create the clh device config via the constructor to ensure default values are properly assigned
clhDevice := *chclient.NewVmAddDevice() clhDevice := *chclient.NewVmAddDevice()
clhDevice.Path = &device.SysfsDev clhDevice.Path = &device.SysfsDev
clhDevice.Id = &device.ID
pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice) pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice)
if err != nil { if err != nil {
return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err)) return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err))
} }
clh.devicesIds[device.ID] = pciInfo.GetId()
// clh doesn't use bridges, so the PCI path is simply the slot // clh doesn't use bridges, so the PCI path is simply the slot
// number of the device. This will break if clh starts using // number of the device. This will break if clh starts using
@ -753,13 +755,15 @@ func (clh *cloudHypervisor) HotplugRemoveDevice(ctx context.Context, devInfo int
ctx, cancel := context.WithTimeout(context.Background(), clhHotPlugAPITimeout*time.Second) ctx, cancel := context.WithTimeout(context.Background(), clhHotPlugAPITimeout*time.Second)
defer cancel() defer cancel()
originalDeviceID := clh.devicesIds[deviceID]
remove := *chclient.NewVmRemoveDevice() remove := *chclient.NewVmRemoveDevice()
remove.Id = &deviceID remove.Id = &originalDeviceID
_, err := cl.VmRemoveDevicePut(ctx, remove) _, err := cl.VmRemoveDevicePut(ctx, remove)
if err != nil { if err != nil {
err = fmt.Errorf("failed to hotplug remove (unplug) device %+v: %s", devInfo, openAPIClientError(err)) err = fmt.Errorf("failed to hotplug remove (unplug) device %+v: %s", devInfo, openAPIClientError(err))
} }
delete(clh.devicesIds, deviceID)
return nil, err return nil, err
} }

View File

@ -384,6 +384,7 @@ func TestCloudHypervisorHotplugAddBlockDevice(t *testing.T) {
clh := &cloudHypervisor{} clh := &cloudHypervisor{}
clh.config = clhConfig clh.config = clhConfig
clh.APIClient = &clhClientMock{} clh.APIClient = &clhClientMock{}
clh.devicesIds = make(map[string]string)
clh.config.BlockDeviceDriver = config.VirtioBlock clh.config.BlockDeviceDriver = config.VirtioBlock
err = clh.hotplugAddBlockDevice(&config.BlockDrive{Pmem: false}) err = clh.hotplugAddBlockDevice(&config.BlockDrive{Pmem: false})
@ -406,6 +407,7 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) {
clh := &cloudHypervisor{} clh := &cloudHypervisor{}
clh.config = clhConfig clh.config = clhConfig
clh.APIClient = &clhClientMock{} clh.APIClient = &clhClientMock{}
clh.devicesIds = make(map[string]string)
_, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev) _, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev)
assert.NoError(err, "Hotplug remove block device expected no error") assert.NoError(err, "Hotplug remove block device expected no error")