From 33a8b705588d6f65163e05bf83119db4360b04ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?=
Date: Tue, 3 May 2022 16:01:28 +0200
Subject: [PATCH] clh: Rely on Cloud Hypervisor for generating the device ID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We're currently hitting a race condition in Cloud Hypervisor's driver
code when quickly removing and adding a block device. This happens
because device removal is an asynchronous operation, and we currently
do *not* monitor events coming from Cloud Hypervisor to know when a
device has actually been removed.

On top of that, the sandbox code is unaware of the pending removal, and
when a new device is attached it may quickly assign the very same ID to
the new device, leading Cloud Hypervisor's driver to hotplug a device
with the same ID as one that has not yet been removed.

This is, in a nutshell, why the tests with Cloud Hypervisor and
devmapper have been failing every now and then.

The workaround taken to solve the issue is to *not* pass the device ID
down to Cloud Hypervisor and simply let Cloud Hypervisor generate the
IDs itself, which it does in a manner that avoids such conflicts. With
this change we then have to keep a map between the device ID and the ID
generated by Cloud Hypervisor, so we can still properly remove the
device later on.

This workaround will probably stay for a while, at least until someone
has enough cycles to implement a way to watch the device removal event
and act on it properly. Spoiler alert: that will be a complex change
that may not even be worth it, considering the race can be avoided with
this commit.

Fixes: #4176

Signed-off-by: Fabiano Fidêncio
---
 src/runtime/virtcontainers/clh.go      | 10 +++++++---
 src/runtime/virtcontainers/clh_test.go |  2 ++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go
index 41d00ee27..c82d0d42c 100644
--- a/src/runtime/virtcontainers/clh.go
+++ b/src/runtime/virtcontainers/clh.go
@@ -165,6 +165,7 @@ type cloudHypervisor struct {
 	APIClient  clhClient
 	ctx        context.Context
 	id         string
+	devicesIds map[string]string
 	vmconfig   chclient.VmConfig
 	state      CloudHypervisorState
 	config     HypervisorConfig
@@ -358,6 +359,7 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net
 
 	clh.id = id
 	clh.state.state = clhNotReady
+	clh.devicesIds = make(map[string]string)
 
 	clh.Logger().WithField("function", "CreateVM").Info("creating Sandbox")
 
@@ -670,7 +672,6 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro
 	clhDisk := *chclient.NewDiskConfig(drive.File)
 	clhDisk.Readonly = &drive.ReadOnly
 	clhDisk.VhostUser = func(b bool) *bool { return &b }(false)
-	clhDisk.Id = &driveID
 
 	diskRateLimiterConfig := clh.getDiskRateLimiterConfig()
 	if diskRateLimiterConfig != nil {
@@ -683,6 +684,7 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro
 		return fmt.Errorf("failed to hotplug block device %+v %s", drive, openAPIClientError(err))
 	}
 
+	clh.devicesIds[driveID] = pciInfo.GetId()
 	drive.PCIPath, err = clhPciInfoToPath(pciInfo)
 
 	return err
@@ -696,11 +698,11 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
 	// Create the clh device config via the constructor to ensure default values are properly assigned
 	clhDevice := *chclient.NewVmAddDevice()
 	clhDevice.Path = &device.SysfsDev
-	clhDevice.Id = &device.ID
 	pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice)
 	if err != nil {
 		return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err))
 	}
+	clh.devicesIds[device.ID] = pciInfo.GetId()
 
 	// clh doesn't use bridges, so the PCI path is simply the slot
 	// number of the device. This will break if clh starts using
@@ -761,13 +763,15 @@ func (clh *cloudHypervisor) HotplugRemoveDevice(ctx context.Context, devInfo int
 	ctx, cancel := context.WithTimeout(context.Background(), clhHotPlugAPITimeout*time.Second)
 	defer cancel()
 
+	originalDeviceID := clh.devicesIds[deviceID]
 	remove := *chclient.NewVmRemoveDevice()
-	remove.Id = &deviceID
+	remove.Id = &originalDeviceID
 	_, err := cl.VmRemoveDevicePut(ctx, remove)
 	if err != nil {
 		err = fmt.Errorf("failed to hotplug remove (unplug) device %+v: %s", devInfo, openAPIClientError(err))
 	}
 
+	delete(clh.devicesIds, deviceID)
 	return nil, err
 }
 
diff --git a/src/runtime/virtcontainers/clh_test.go b/src/runtime/virtcontainers/clh_test.go
index a764910f4..79e35210b 100644
--- a/src/runtime/virtcontainers/clh_test.go
+++ b/src/runtime/virtcontainers/clh_test.go
@@ -563,6 +563,7 @@ func TestCloudHypervisorHotplugAddBlockDevice(t *testing.T) {
 	clh := &cloudHypervisor{}
 	clh.config = clhConfig
 	clh.APIClient = &clhClientMock{}
+	clh.devicesIds = make(map[string]string)
 
 	clh.config.BlockDeviceDriver = config.VirtioBlock
 	err = clh.hotplugAddBlockDevice(&config.BlockDrive{Pmem: false})
@@ -585,6 +586,7 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) {
 	clh := &cloudHypervisor{}
 	clh.config = clhConfig
 	clh.APIClient = &clhClientMock{}
+	clh.devicesIds = make(map[string]string)
 
 	_, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev)
 	assert.NoError(err, "Hotplug remove block device expected no error")