mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-07-19 18:01:01 +00:00
clh: Rely on Cloud Hypervisor for generating the device ID
We're currently hitting a race condition on the Cloud Hypervisor's driver code when quickly removing and adding a block device. This happens because the device removal is an asynchronous operation, and we currently do *not* monitor events coming from Cloud Hypervisor to know when the device was actually removed. Together with this, the sandbox code doesn't know about that and when a new device is attached it'll quickly assign what may be the very same ID to the new device, leading to the Cloud Hypervisor's driver trying to hotplug a device with the very same ID of the device that was not yet removed. This is, in a nutshell, why the tests with Cloud Hypervisor and devmapper have been failing every now and then. The workaround taken to solve the issue is basically *not* passing down the device ID to Cloud Hypervisor and simply letting Cloud Hypervisor itself generate those, as Cloud Hypervisor does it in a manner that avoids such conflicts. With this addition we have then to keep a map of the device ID and the Cloud Hypervisor's generated ID, so we can properly remove the device. This workaround will probably stay for a while, at least till someone has enough cycles to implement a way to watch the device removal event and then properly act on that. Spoiler alert, this will be a complex change that may not even be worth it considering the race can be avoided with this commit. Fixes: #4176 Signed-off-by: Fabiano Fidêncio <fabiano.fidencio@intel.com>
This commit is contained in:
parent
c633780ba7
commit
33a8b70558
@ -165,6 +165,7 @@ type cloudHypervisor struct {
|
|||||||
APIClient clhClient
|
APIClient clhClient
|
||||||
ctx context.Context
|
ctx context.Context
|
||||||
id string
|
id string
|
||||||
|
devicesIds map[string]string
|
||||||
vmconfig chclient.VmConfig
|
vmconfig chclient.VmConfig
|
||||||
state CloudHypervisorState
|
state CloudHypervisorState
|
||||||
config HypervisorConfig
|
config HypervisorConfig
|
||||||
@ -358,6 +359,7 @@ func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Net
|
|||||||
|
|
||||||
clh.id = id
|
clh.id = id
|
||||||
clh.state.state = clhNotReady
|
clh.state.state = clhNotReady
|
||||||
|
clh.devicesIds = make(map[string]string)
|
||||||
|
|
||||||
clh.Logger().WithField("function", "CreateVM").Info("creating Sandbox")
|
clh.Logger().WithField("function", "CreateVM").Info("creating Sandbox")
|
||||||
|
|
||||||
@ -670,7 +672,6 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro
|
|||||||
clhDisk := *chclient.NewDiskConfig(drive.File)
|
clhDisk := *chclient.NewDiskConfig(drive.File)
|
||||||
clhDisk.Readonly = &drive.ReadOnly
|
clhDisk.Readonly = &drive.ReadOnly
|
||||||
clhDisk.VhostUser = func(b bool) *bool { return &b }(false)
|
clhDisk.VhostUser = func(b bool) *bool { return &b }(false)
|
||||||
clhDisk.Id = &driveID
|
|
||||||
|
|
||||||
diskRateLimiterConfig := clh.getDiskRateLimiterConfig()
|
diskRateLimiterConfig := clh.getDiskRateLimiterConfig()
|
||||||
if diskRateLimiterConfig != nil {
|
if diskRateLimiterConfig != nil {
|
||||||
@ -683,6 +684,7 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro
|
|||||||
return fmt.Errorf("failed to hotplug block device %+v %s", drive, openAPIClientError(err))
|
return fmt.Errorf("failed to hotplug block device %+v %s", drive, openAPIClientError(err))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
clh.devicesIds[driveID] = pciInfo.GetId()
|
||||||
drive.PCIPath, err = clhPciInfoToPath(pciInfo)
|
drive.PCIPath, err = clhPciInfoToPath(pciInfo)
|
||||||
|
|
||||||
return err
|
return err
|
||||||
@ -696,11 +698,11 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
|
|||||||
// Create the clh device config via the constructor to ensure default values are properly assigned
|
// Create the clh device config via the constructor to ensure default values are properly assigned
|
||||||
clhDevice := *chclient.NewVmAddDevice()
|
clhDevice := *chclient.NewVmAddDevice()
|
||||||
clhDevice.Path = &device.SysfsDev
|
clhDevice.Path = &device.SysfsDev
|
||||||
clhDevice.Id = &device.ID
|
|
||||||
pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice)
|
pciInfo, _, err := cl.VmAddDevicePut(ctx, clhDevice)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err))
|
return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err))
|
||||||
}
|
}
|
||||||
|
clh.devicesIds[device.ID] = pciInfo.GetId()
|
||||||
|
|
||||||
// clh doesn't use bridges, so the PCI path is simply the slot
|
// clh doesn't use bridges, so the PCI path is simply the slot
|
||||||
// number of the device. This will break if clh starts using
|
// number of the device. This will break if clh starts using
|
||||||
@ -761,13 +763,15 @@ func (clh *cloudHypervisor) HotplugRemoveDevice(ctx context.Context, devInfo int
|
|||||||
ctx, cancel := context.WithTimeout(context.Background(), clhHotPlugAPITimeout*time.Second)
|
ctx, cancel := context.WithTimeout(context.Background(), clhHotPlugAPITimeout*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
|
originalDeviceID := clh.devicesIds[deviceID]
|
||||||
remove := *chclient.NewVmRemoveDevice()
|
remove := *chclient.NewVmRemoveDevice()
|
||||||
remove.Id = &deviceID
|
remove.Id = &originalDeviceID
|
||||||
_, err := cl.VmRemoveDevicePut(ctx, remove)
|
_, err := cl.VmRemoveDevicePut(ctx, remove)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
err = fmt.Errorf("failed to hotplug remove (unplug) device %+v: %s", devInfo, openAPIClientError(err))
|
err = fmt.Errorf("failed to hotplug remove (unplug) device %+v: %s", devInfo, openAPIClientError(err))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
delete(clh.devicesIds, deviceID)
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -563,6 +563,7 @@ func TestCloudHypervisorHotplugAddBlockDevice(t *testing.T) {
|
|||||||
clh := &cloudHypervisor{}
|
clh := &cloudHypervisor{}
|
||||||
clh.config = clhConfig
|
clh.config = clhConfig
|
||||||
clh.APIClient = &clhClientMock{}
|
clh.APIClient = &clhClientMock{}
|
||||||
|
clh.devicesIds = make(map[string]string)
|
||||||
|
|
||||||
clh.config.BlockDeviceDriver = config.VirtioBlock
|
clh.config.BlockDeviceDriver = config.VirtioBlock
|
||||||
err = clh.hotplugAddBlockDevice(&config.BlockDrive{Pmem: false})
|
err = clh.hotplugAddBlockDevice(&config.BlockDrive{Pmem: false})
|
||||||
@ -585,6 +586,7 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) {
|
|||||||
clh := &cloudHypervisor{}
|
clh := &cloudHypervisor{}
|
||||||
clh.config = clhConfig
|
clh.config = clhConfig
|
||||||
clh.APIClient = &clhClientMock{}
|
clh.APIClient = &clhClientMock{}
|
||||||
|
clh.devicesIds = make(map[string]string)
|
||||||
|
|
||||||
_, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev)
|
_, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev)
|
||||||
assert.NoError(err, "Hotplug remove block device expected no error")
|
assert.NoError(err, "Hotplug remove block device expected no error")
|
||||||
|
Loading…
Reference in New Issue
Block a user