Merge pull request #12679 from microsoft/user/romoh/gpu-fix

clh: Add VFIO device cold-plug support
This commit is contained in:
Aurélien Bombo
2026-03-27 11:12:51 -05:00
committed by GitHub
5 changed files with 141 additions and 5 deletions

View File

@@ -229,6 +229,12 @@ disable_image_nvdimm = @DEFDISABLEIMAGENVDIMM_CLH@
# The default setting is "no-port"
hot_plug_vfio = "no-port"
# In a confidential compute environment, hot-plugging can compromise
# security.
# Enable cold-plugging of VFIO devices to a root-port.
# The default setting is "no-port", which means cold-plugging is disabled.
cold_plug_vfio = "no-port"
# Path to OCI hook binaries in the *guest rootfs*.
# This does not affect host-side hooks which must instead be added to
# the OCI spec passed to the runtime.

View File

@@ -1950,11 +1950,11 @@ func checkPCIeConfig(coldPlug config.PCIePort, hotPlug config.PCIePort, machineT
return nil
}
if hypervisorType == vc.ClhHypervisor {
if coldPlug != config.NoPort {
return fmt.Errorf("cold-plug not supported on CLH")
if coldPlug != config.NoPort && coldPlug != config.RootPort {
return fmt.Errorf("only cold-plug=%s or %s supported on CLH", config.NoPort, config.RootPort)
}
if hotPlug != config.RootPort {
return fmt.Errorf("only hot-plug=%s supported on CLH", config.RootPort)
if hotPlug != config.NoPort && hotPlug != config.RootPort {
return fmt.Errorf("only hot-plug=%s or %s supported on CLH", config.NoPort, config.RootPort)
}
}

View File

@@ -430,9 +430,11 @@ func TestVfioChecksClh(t *testing.T) {
}
assert.NoError(f(config.NoPort, config.NoPort))
assert.NoError(f(config.NoPort, config.RootPort))
assert.NoError(f(config.RootPort, config.NoPort))
assert.Error(f(config.RootPort, config.RootPort))
assert.Error(f(config.RootPort, config.NoPort))
assert.Error(f(config.NoPort, config.SwitchPort))
assert.Error(f(config.SwitchPort, config.NoPort))
assert.Error(f(config.BridgePort, config.NoPort))
}
func TestVfioCheckQemu(t *testing.T) {

View File

@@ -976,6 +976,44 @@ func (clh *cloudHypervisor) hotplugAddBlockDevice(drive *config.BlockDrive) erro
return err
}
// coldPlugVFIODevice records a VFIO device in the VM configuration rather than
// attaching it to a running guest. The device is appended to
// clh.vmconfig.Devices so that it is already present when the VM is created
// (cold-plug semantics), and its ID is stored in clh.devicesIds.
//
// Only PCI VFIO devices (normal and mediated) are accepted. An error is
// returned, and the VM configuration is left untouched, when the device has
// any other type or when its SysfsDev is empty or whitespace-only.
func (clh *cloudHypervisor) coldPlugVFIODevice(device *config.VFIODev) error {
	// Cloud Hypervisor only handles PCI-backed VFIO devices.
	isPCI := device.Type == config.VFIOPCIDeviceNormalType ||
		device.Type == config.VFIOPCIDeviceMediatedType
	if !isPCI {
		return fmt.Errorf("VFIO device %+v has unsupported type %v; only PCI VFIO devices are supported in Cloud Hypervisor", device, device.Type)
	}

	// The sysfs path is what gets handed to the device config below, so it
	// must carry something other than whitespace.
	if strings.TrimSpace(device.SysfsDev) == "" {
		return fmt.Errorf("VFIO device %q has empty or invalid SysfsDev path", device.ID)
	}

	fields := log.Fields{
		"device": device.ID,
		"sysfs":  device.SysfsDev,
		"bdf":    device.BDF,
	}
	clh.Logger().WithFields(fields).Info("Cold-plugging VFIO device into VM config")

	devCfg := chclient.NewDeviceConfig(device.SysfsDev)
	devCfg.SetIommu(clh.config.IOMMU)
	devCfg.SetId(device.ID)

	// Devices is an optional (pointer) field: create the list on first use,
	// otherwise extend the existing one in place.
	if clh.vmconfig.Devices == nil {
		clh.vmconfig.Devices = &[]chclient.DeviceConfig{*devCfg}
	} else {
		*clh.vmconfig.Devices = append(*clh.vmconfig.Devices, *devCfg)
	}

	// Remember the device ID so later operations (e.g. removal) can refer to it.
	clh.devicesIds[device.ID] = device.ID
	return nil
}
func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error {
cl := clh.client()
ctx, cancel := context.WithTimeout(context.Background(), clhHotPlugAPITimeout*time.Second)
@@ -1342,6 +1380,8 @@ func (clh *cloudHypervisor) AddDevice(ctx context.Context, devInfo interface{},
clh.addVSock(defaultGuestVSockCID, v.UdsPath)
case types.Volume:
err = clh.addVolume(v)
case config.VFIODev:
err = clh.coldPlugVFIODevice(&v)
default:
clh.Logger().WithField("function", "AddDevice").Warnf("Add device of type %v is not supported.", v)
return fmt.Errorf("Not implemented support for %s", v)

View File

@@ -682,6 +682,94 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) {
assert.Error(err, "Hotplug remove pmem block device expected error")
}
// TestCloudHypervisorColdPlugVFIODevice exercises coldPlugVFIODevice directly:
// two PCI VFIO devices must be appended to the VM configuration, while an AP
// mediated device, an error-type device and a device without a SysfsDev path
// must all be rejected.
func TestCloudHypervisorColdPlugVFIODevice(t *testing.T) {
	assert := assert.New(t)

	clhConfig, err := newClhConfig()
	assert.NoError(err)

	clh := &cloudHypervisor{
		config:     clhConfig,
		devicesIds: make(map[string]string),
		vmconfig:   *chclient.NewVmConfig(*chclient.NewPayloadConfig()),
	}

	// First PCI VFIO device: creates the device list and records the ID.
	firstGPU := &config.VFIODev{
		ID:       "gpu0",
		SysfsDev: "/sys/bus/pci/devices/0000:41:00.0",
		BDF:      "0000:41:00.0",
		Type:     config.VFIOPCIDeviceNormalType,
	}
	err = clh.coldPlugVFIODevice(firstGPU)
	assert.NoError(err, "Cold-plug PCI VFIO device expected no error")

	assert.NotNil(clh.vmconfig.Devices)
	assert.Len(*clh.vmconfig.Devices, 1)
	assert.Equal("/sys/bus/pci/devices/0000:41:00.0", (*clh.vmconfig.Devices)[0].Path)
	assert.Equal("gpu0", clh.devicesIds["gpu0"])

	// Second PCI VFIO device: appended to the existing list.
	secondGPU := &config.VFIODev{
		ID:       "gpu1",
		SysfsDev: "/sys/bus/pci/devices/0000:42:00.0",
		BDF:      "0000:42:00.0",
		Type:     config.VFIOPCIDeviceNormalType,
	}
	err = clh.coldPlugVFIODevice(secondGPU)
	assert.NoError(err, "Cold-plug second VFIO device expected no error")
	assert.Len(*clh.vmconfig.Devices, 2)

	// Devices that must be refused, checked in the same order as before.
	rejected := []struct {
		dev *config.VFIODev
		msg string
	}{
		{
			// AP mediated device is not a PCI VFIO device.
			dev: &config.VFIODev{
				ID:   "ap0",
				Type: config.VFIOAPDeviceMediatedType,
			},
			msg: "Cold-plug AP mediated device expected error",
		},
		{
			// Error type (0) is never a valid device type.
			dev: &config.VFIODev{
				ID:       "bad0",
				SysfsDev: "/sys/bus/pci/devices/0000:43:00.0",
				Type:     config.VFIODeviceErrorType,
			},
			msg: "Cold-plug error-type device expected error",
		},
		{
			// Valid type but no SysfsDev path.
			dev: &config.VFIODev{
				ID:   "bad1",
				Type: config.VFIOPCIDeviceNormalType,
			},
			msg: "Cold-plug with empty SysfsDev expected error",
		},
	}
	for _, tc := range rejected {
		err = clh.coldPlugVFIODevice(tc.dev)
		assert.Error(err, tc.msg)
	}
}
// TestCloudHypervisorAddDeviceVFIO verifies that AddDevice accepts a
// config.VFIODev value and that the device ends up in the VM configuration's
// device list.
func TestCloudHypervisorAddDeviceVFIO(t *testing.T) {
	assert := assert.New(t)

	clhConfig, err := newClhConfig()
	assert.NoError(err)

	clh := &cloudHypervisor{
		config:     clhConfig,
		devicesIds: make(map[string]string),
		vmconfig:   *chclient.NewVmConfig(*chclient.NewPayloadConfig()),
	}

	// A VFIODev passed by value through AddDevice should be cold-plugged.
	nic := config.VFIODev{
		ID:       "nic0",
		SysfsDev: "/sys/bus/pci/devices/0000:05:00.0",
		BDF:      "0000:05:00.0",
		Type:     config.VFIOPCIDeviceNormalType,
	}
	err = clh.AddDevice(context.Background(), nic, VfioDev)
	assert.NoError(err, "AddDevice VFIO expected no error")

	assert.NotNil(clh.vmconfig.Devices)
	assert.Len(*clh.vmconfig.Devices, 1)
}
func TestClhGenerateSocket(t *testing.T) {
assert := assert.New(t)