diff --git a/src/runtime/cmd/kata-runtime/kata-env.go b/src/runtime/cmd/kata-runtime/kata-env.go index f17480aba0..45ddfbf3e5 100644 --- a/src/runtime/cmd/kata-runtime/kata-env.go +++ b/src/runtime/cmd/kata-runtime/kata-env.go @@ -115,6 +115,7 @@ type HypervisorInfo struct { MemorySlots uint32 PCIeRootPort uint32 ColdPlugVFIO hv.PCIePort + PCIeSwitchPort uint32 HotplugVFIOOnRootBus bool Debug bool } diff --git a/src/runtime/pkg/device/config/config.go b/src/runtime/pkg/device/config/config.go index dee6291ed9..f3a0e879ab 100644 --- a/src/runtime/pkg/device/config/config.go +++ b/src/runtime/pkg/device/config/config.go @@ -114,8 +114,8 @@ const ( // SysDevPrefix is static string of /sys/dev var SysDevPrefix = "/sys/dev" -// SysIOMMUPath is static string of /sys/kernel/iommu_groups -var SysIOMMUPath = "/sys/kernel/iommu_groups" +// SysIOMMUGroupPath is static string of /sys/kernel/iommu_groups +var SysIOMMUGroupPath = "/sys/kernel/iommu_groups" // SysBusPciDevicesPath is static string of /sys/bus/pci/devices var SysBusPciDevicesPath = "/sys/bus/pci/devices" @@ -268,14 +268,8 @@ const ( VFIOAPDeviceMediatedType ) -type VFIODev interface { - GetID() *string - GetType() VFIODeviceType - GetSysfsDev() *string -} - // VFIOPCIDev represents a VFIO PCI device used for hotplugging -type VFIOPCIDev struct { +type VFIODev struct { // ID is used to identify this drive in the hypervisor options. ID string @@ -305,44 +299,12 @@ type VFIOPCIDev struct { // IsPCIe specifies device is PCIe or PCI IsPCIe bool -} - -func (d VFIOPCIDev) GetID() *string { - return &d.ID -} - -func (d VFIOPCIDev) GetType() VFIODeviceType { - return d.Type -} - -func (d VFIOPCIDev) GetSysfsDev() *string { - return &d.SysfsDev -} - -type VFIOAPDev struct { - // ID is used to identify this drive in the hypervisor options. 
- ID string - - // sysfsdev of VFIO mediated device - SysfsDev string // APDevices are the Adjunct Processor devices assigned to the mdev APDevices []string - // Type of VFIO device - Type VFIODeviceType -} - -func (d VFIOAPDev) GetID() *string { - return &d.ID -} - -func (d VFIOAPDev) GetType() VFIODeviceType { - return d.Type -} - -func (d VFIOAPDev) GetSysfsDev() *string { - return &d.SysfsDev + // Rank identifies a device in an IOMMU group + Rank int } // RNGDev represents a random number generator device diff --git a/src/runtime/pkg/device/drivers/utils.go b/src/runtime/pkg/device/drivers/utils.go index bfffa31a2e..19ac99c30b 100644 --- a/src/runtime/pkg/device/drivers/utils.go +++ b/src/runtime/pkg/device/drivers/utils.go @@ -47,9 +47,9 @@ func deviceLogger() *logrus.Entry { return api.DeviceLogger() } -// Identify PCIe device by reading the size of the PCI config space +// IsPCIeDevice identifies a PCIe device by reading the size of the PCI config space // Plain PCI device have 256 bytes of config space where PCIe devices have 4K -func isPCIeDevice(bdf string) bool { +func IsPCIeDevice(bdf string) bool { if len(strings.Split(bdf, ":")) == 2 { bdf = PCIDomain + ":" + bdf } @@ -164,7 +164,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme vfioDevs := []*config.VFIODev{} vfioGroup := filepath.Base(device.HostPath) - iommuDevicesPath := filepath.Join(config.SysIOMMUPath, vfioGroup, "devices") + iommuDevicesPath := filepath.Join(config.SysIOMMUGroupPath, vfioGroup, "devices") deviceFiles, err := os.ReadDir(iommuDevicesPath) if err != nil { @@ -174,7 +174,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme // Pass all devices in iommu group for i, deviceFile := range deviceFiles { //Get bdf of device eg 0000:00:1c.0 - deviceBDF, deviceSysfsDev, vfioDeviceType, err := 
GetVFIODetails(deviceFile.Name(), iommuDevicesPath) if err != nil { return nil, err } @@ -196,15 +196,16 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme switch vfioDeviceType { case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: - isPCIe := isPCIeDevice(deviceBDF) + isPCIe := IsPCIeDevice(deviceBDF) // Do not directly assign to `vfio` -- need to access field still - vfioPCI := config.VFIOPCIDev{ + vfioPCI := config.VFIODev{ ID: id, Type: vfioDeviceType, BDF: deviceBDF, SysfsDev: deviceSysfsDev, IsPCIe: isPCIe, Class: pciClass, + Rank: -1, } if isPCIe && !ignoreBusAssignment { vfioPCI.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, len(AllPCIeDevs)) @@ -216,7 +217,7 @@ func GetAllVFIODevicesFromIOMMUGroup(device config.DeviceInfo, ignoreBusAssignme if err != nil { return nil, err } - vfio = config.VFIOAPDev{ + vfio = config.VFIODev{ ID: id, SysfsDev: deviceSysfsDev, Type: config.VFIOAPDeviceMediatedType, diff --git a/src/runtime/pkg/device/drivers/vfio.go b/src/runtime/pkg/device/drivers/vfio.go index 106220dcff..4231f0d301 100644 --- a/src/runtime/pkg/device/drivers/vfio.go +++ b/src/runtime/pkg/device/drivers/vfio.go @@ -33,6 +33,9 @@ const ( ) var ( + // AllPCIeDevs deduces the correct bus number. The BDF keeps track that + // we're not accounting for the very same device, if a user provides the + // devices multiple times. 
AllPCIeDevs = map[string]bool{} ) @@ -169,23 +172,18 @@ func (device *VFIODevice) Load(ds config.DeviceState) { for _, dev := range ds.VFIODevs { var vfio config.VFIODev - vfioDeviceType := (*device.VfioDevs[0]).GetType() - switch vfioDeviceType { + switch dev.Type { case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: - bdf := "" - if pciDev, ok := (*dev).(config.VFIOPCIDev); ok { - bdf = pciDev.BDF - } - vfio = config.VFIOPCIDev{ - ID: *(*dev).GetID(), - Type: config.VFIODeviceType((*dev).GetType()), - BDF: bdf, - SysfsDev: *(*dev).GetSysfsDev(), + vfio = config.VFIODev{ + ID: dev.ID, + Type: config.VFIODeviceType(dev.Type), + BDF: dev.BDF, + SysfsDev: dev.SysfsDev, } case config.VFIOAPDeviceMediatedType: - vfio = config.VFIOAPDev{ - ID: *(*dev).GetID(), - SysfsDev: *(*dev).GetSysfsDev(), + vfio = config.VFIODev{ + ID: dev.ID, + SysfsDev: dev.SysfsDev, } default: deviceLogger().WithError( @@ -200,7 +198,7 @@ func (device *VFIODevice) Load(ds config.DeviceState) { // It should implement GetAttachCount() and DeviceID() as api.Device implementation // here it shares function from *GenericDevice so we don't need duplicate codes -func getVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) { +func GetVFIODetails(deviceFileName, iommuDevicesPath string) (deviceBDF, deviceSysfsDev string, vfioDeviceType config.VFIODeviceType, err error) { sysfsDevStr := filepath.Join(iommuDevicesPath, deviceFileName) vfioDeviceType, err = GetVFIODeviceType(sysfsDevStr) if err != nil { diff --git a/src/runtime/pkg/device/drivers/vfio_test.go b/src/runtime/pkg/device/drivers/vfio_test.go index 6a1ab61eb6..2ded0f8507 100644 --- a/src/runtime/pkg/device/drivers/vfio_test.go +++ b/src/runtime/pkg/device/drivers/vfio_test.go @@ -29,7 +29,7 @@ func TestGetVFIODetails(t *testing.T) { } for _, d := range data { - deviceBDF, deviceSysfsDev, vfioDeviceType, err := getVFIODetails(d.deviceStr, "") + 
deviceBDF, deviceSysfsDev, vfioDeviceType, err := GetVFIODetails(d.deviceStr, "") switch vfioDeviceType { case config.VFIOPCIDeviceNormalType: diff --git a/src/runtime/pkg/device/manager/manager_test.go b/src/runtime/pkg/device/manager/manager_test.go index 49e339f60b..70c76b67d7 100644 --- a/src/runtime/pkg/device/manager/manager_test.go +++ b/src/runtime/pkg/device/manager/manager_test.go @@ -116,14 +116,14 @@ func TestAttachVFIODevice(t *testing.T) { _, err = os.Create(deviceConfigFile) assert.Nil(t, err) - savedIOMMUPath := config.SysIOMMUPath - config.SysIOMMUPath = tmpDir + savedIOMMUPath := config.SysIOMMUGroupPath + config.SysIOMMUGroupPath = tmpDir savedSysBusPciDevicesPath := config.SysBusPciDevicesPath config.SysBusPciDevicesPath = devicesDir defer func() { - config.SysIOMMUPath = savedIOMMUPath + config.SysIOMMUGroupPath = savedIOMMUPath config.SysBusPciDevicesPath = savedSysBusPciDevicesPath }() diff --git a/src/runtime/pkg/govmm/qemu/qemu.go b/src/runtime/pkg/govmm/qemu/qemu.go index 06d7e58957..7b4b9037f8 100644 --- a/src/runtime/pkg/govmm/qemu/qemu.go +++ b/src/runtime/pkg/govmm/qemu/qemu.go @@ -123,6 +123,14 @@ const ( // PCIeRootPort is a PCIe Root Port, the PCIe device should be hotplugged to this port. PCIeRootPort DeviceDriver = "pcie-root-port" + // PCIeSwitchUpstreamPort is a PCIe switch upstream port + // An upstream port connects to a PCIe Root Port + PCIeSwitchUpstreamPort DeviceDriver = "x3130-upstream" + + // PCIeSwitchDownstreamPort is a PCIe switch downstream port + // PCIe devices can be hot-plugged to the downstream port. + PCIeSwitchDownstreamPort DeviceDriver = "xio3130-downstream" + // Loader is the Loader device driver. 
Loader DeviceDriver = "loader" @@ -1681,6 +1689,106 @@ func (b PCIeRootPortDevice) Valid() bool { return true } +// PCIeSwitchUpstreamPortDevice is the port connecting to the root port +type PCIeSwitchUpstreamPortDevice struct { + ID string // format: sup{n}, n>=0 + Bus string // default is rp0 +} + +// QemuParams returns the qemu parameters built out of the PCIeSwitchUpstreamPortDevice. +func (b PCIeSwitchUpstreamPortDevice) QemuParams(config *Config) []string { + var qemuParams []string + var deviceParams []string + + driver := PCIeSwitchUpstreamPort + + deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID)) + deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus)) + + qemuParams = append(qemuParams, "-device") + qemuParams = append(qemuParams, strings.Join(deviceParams, ",")) + return qemuParams +} + +// Valid returns true if the PCIeSwitchUpstreamPortDevice structure is valid and complete. +func (b PCIeSwitchUpstreamPortDevice) Valid() bool { + if b.ID == "" { + return false + } + if b.Bus == "" { + return false + } + return true +} + +// PCIeSwitchDownstreamPortDevice is a PCIe switch downstream port that PCIe devices can be hot-plugged to +type PCIeSwitchDownstreamPortDevice struct { + ID string // format: sup{n}, n>=0 + Bus string // default is rp0 + Chassis string // (slot, chassis) pair is mandatory and must be unique for each downstream port, >=0, default is 0x00 + Slot string // >=0, default is 0x00 + // This needs patches to QEMU to work + BusReserve string + // Pref64 and Pref32 are not allowed to be set simultaneously + Pref64Reserve string // reserve prefetched MMIO aperture, 64-bit + Pref32Reserve string // reserve prefetched MMIO aperture, 32-bit + MemReserve string // reserve non-prefetched MMIO aperture, 32-bit *only* + IOReserve string // IO reservation + +} + +// QemuParams returns the qemu parameters built out of the PCIeSwitchDownstreamPortDevice. 
+func (b PCIeSwitchDownstreamPortDevice) QemuParams(config *Config) []string { + var qemuParams []string + var deviceParams []string + driver := PCIeSwitchDownstreamPort + + deviceParams = append(deviceParams, fmt.Sprintf("%s,id=%s", driver, b.ID)) + deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", b.Bus)) + deviceParams = append(deviceParams, fmt.Sprintf("chassis=%s", b.Chassis)) + deviceParams = append(deviceParams, fmt.Sprintf("slot=%s", b.Slot)) + if b.BusReserve != "" { + deviceParams = append(deviceParams, fmt.Sprintf("bus-reserve=%s", b.BusReserve)) + } + + if b.Pref64Reserve != "" { + deviceParams = append(deviceParams, fmt.Sprintf("pref64-reserve=%s", b.Pref64Reserve)) + } + + if b.Pref32Reserve != "" { + deviceParams = append(deviceParams, fmt.Sprintf("pref32-reserve=%s", b.Pref32Reserve)) + } + + if b.MemReserve != "" { + deviceParams = append(deviceParams, fmt.Sprintf("mem-reserve=%s", b.MemReserve)) + } + + if b.IOReserve != "" { + deviceParams = append(deviceParams, fmt.Sprintf("io-reserve=%s", b.IOReserve)) + } + + qemuParams = append(qemuParams, "-device") + qemuParams = append(qemuParams, strings.Join(deviceParams, ",")) + return qemuParams +} + +// Valid returns true if the PCIeSwitchDownstreamPortDevice structure is valid and complete. +func (b PCIeSwitchDownstreamPortDevice) Valid() bool { + if b.ID == "" { + return false + } + if b.Bus == "" { + return false + } + if b.Chassis == "" { + return false + } + if b.Slot == "" { + return false + } + return true +} + // VFIODevice represents a qemu vfio device meant for direct access by guest OS. 
type VFIODevice struct { // Bus-Device-Function of device diff --git a/src/runtime/pkg/govmm/qemu/qmp.go b/src/runtime/pkg/govmm/qemu/qmp.go index 6b020103da..5630ff9dec 100644 --- a/src/runtime/pkg/govmm/qemu/qmp.go +++ b/src/runtime/pkg/govmm/qemu/qmp.go @@ -656,6 +656,7 @@ func (q *QMP) executeCommand(ctx context.Context, name string, args map[string]i filter *qmpEventFilter) error { _, err := q.executeCommandWithResponse(ctx, name, args, nil, filter) + return err } @@ -1191,6 +1192,7 @@ func (q *QMP) ExecutePCIVFIODeviceAdd(ctx context.Context, devID, bdf, addr, bus if bus != "" { args["bus"] = bus } + return q.executeCommand(ctx, "device_add", args, nil) } diff --git a/src/runtime/pkg/hypervisors/hypervisor_state.go b/src/runtime/pkg/hypervisors/hypervisor_state.go index 482b7e9e20..735ac089b4 100644 --- a/src/runtime/pkg/hypervisors/hypervisor_state.go +++ b/src/runtime/pkg/hypervisors/hypervisor_state.go @@ -58,7 +58,6 @@ func (p PCIePort) String() string { type HypervisorState struct { BlockIndexMap map[int]struct{} - // Type of hypervisor, E.g. qemu/firecracker/acrn. 
Type string UUID string @@ -68,6 +67,7 @@ type HypervisorState struct { // Belows are qemu specific // Refs: virtcontainers/qemu.go:QemuState Bridges []Bridge + // HotpluggedCPUs is the list of CPUs that were hot-added HotpluggedVCPUs []CPUDevice @@ -75,6 +75,8 @@ type HypervisorState struct { VirtiofsDaemonPid int Pid int PCIeRootPort int + PCIeSwitchPort int ColdPlugVFIO PCIePort + HotPlugVFIO PCIePort HotplugVFIOOnRootBus bool } diff --git a/src/runtime/pkg/katautils/config-settings.go.in b/src/runtime/pkg/katautils/config-settings.go.in index 139d548264..0de02ce40e 100644 --- a/src/runtime/pkg/katautils/config-settings.go.in +++ b/src/runtime/pkg/katautils/config-settings.go.in @@ -109,3 +109,5 @@ const defaultVMCacheEndpoint string = "/var/run/kata-containers/cache.sock" var defaultRuntimeConfiguration = "@CONFIG_PATH@" const defaultColdPlugVFIO = hv.NoPort +const defaultHotPlugVFIO = hv.BridgePort +const defaultPCIeSwitchPort = 0 diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go index 6b08f4afe1..e38526fae9 100644 --- a/src/runtime/pkg/katautils/config.go +++ b/src/runtime/pkg/katautils/config.go @@ -76,7 +76,6 @@ type factory struct { VMCacheNumber uint `toml:"vm_cache_number"` Template bool `toml:"enable_template"` } - type hypervisor struct { Path string `toml:"path"` JailerPath string `toml:"jailer_path"` @@ -131,6 +130,7 @@ type hypervisor struct { MemSlots uint32 `toml:"memory_slots"` DefaultBridges uint32 `toml:"default_bridges"` Msize9p uint32 `toml:"msize_9p"` + PCIeSwitchPort uint32 `toml:"pcie_switch_port"` PCIeRootPort uint32 `toml:"pcie_root_port"` NumVCPUs int32 `toml:"default_vcpus"` BlockDeviceCacheSet bool `toml:"block_device_cache_set"` @@ -149,6 +149,7 @@ type hypervisor struct { EnableIOThreads bool `toml:"enable_iothreads"` DisableImageNvdimm bool `toml:"disable_image_nvdimm"` HotplugVFIOOnRootBus bool `toml:"hotplug_vfio_on_root_bus"` + HotPlugVFIO hv.PCIePort `toml:"hotplug_vfio"` ColdPlugVFIO 
hv.PCIePort `toml:"cold_plug_vfio"` DisableVhostNet bool `toml:"disable_vhost_net"` GuestMemoryDumpPaging bool `toml:"guest_memory_dump_paging"` @@ -293,6 +294,12 @@ func (h hypervisor) coldPlugVFIO() hv.PCIePort { } return h.ColdPlugVFIO } +func (h hypervisor) hotPlugVFIO() hv.PCIePort { + if h.HotPlugVFIO == "" { + return defaultHotPlugVFIO + } + return h.HotPlugVFIO +} func (h hypervisor) firmwareVolume() (string, error) { p := h.FirmwareVolume @@ -864,7 +871,9 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { DisableImageNvdimm: h.DisableImageNvdimm, HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, ColdPlugVFIO: h.coldPlugVFIO(), + HotPlugVFIO: h.HotPlugVFIO, PCIeRootPort: h.PCIeRootPort, + PCIeSwitchPort: h.PCIeSwitchPort, DisableVhostNet: h.DisableVhostNet, EnableVhostUserStore: h.EnableVhostUserStore, VhostUserStorePath: h.vhostUserStorePath(), @@ -877,7 +886,6 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { GuestMemoryDumpPath: h.GuestMemoryDumpPath, GuestMemoryDumpPaging: h.GuestMemoryDumpPaging, ConfidentialGuest: h.ConfidentialGuest, - SevSnpGuest: h.SevSnpGuest, GuestSwap: h.GuestSwap, Rootless: h.Rootless, LegacySerial: h.LegacySerial, @@ -1059,7 +1067,9 @@ func newClhHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) { Msize9p: h.msize9p(), HotplugVFIOOnRootBus: h.HotplugVFIOOnRootBus, ColdPlugVFIO: h.coldPlugVFIO(), + HotPlugVFIO: h.hotPlugVFIO(), PCIeRootPort: h.PCIeRootPort, + PCIeSwitchPort: h.PCIeSwitchPort, DisableVhostNet: true, GuestHookPath: h.guestHookPath(), VirtioFSExtraArgs: h.VirtioFSExtraArgs, @@ -1290,6 +1300,8 @@ func GetDefaultHypervisorConfig() vc.HypervisorConfig { Msize9p: defaultMsize9p, HotplugVFIOOnRootBus: defaultHotplugVFIOOnRootBus, ColdPlugVFIO: defaultColdPlugVFIO, + HotPlugVFIO: defaultHotPlugVFIO, + PCIeSwitchPort: defaultPCIeSwitchPort, PCIeRootPort: defaultPCIeRootPort, GuestHookPath: defaultGuestHookPath, VhostUserStorePath: defaultVhostUserStorePath, diff 
--git a/src/runtime/pkg/katautils/config_test.go b/src/runtime/pkg/katautils/config_test.go index 171f011b8e..fb506ba6a4 100644 --- a/src/runtime/pkg/katautils/config_test.go +++ b/src/runtime/pkg/katautils/config_test.go @@ -566,6 +566,7 @@ func TestMinimalRuntimeConfig(t *testing.T) { GuestHookPath: defaultGuestHookPath, VhostUserStorePath: defaultVhostUserStorePath, VirtioFSCache: defaultVirtioFSCacheMode, + HotPlugVFIO: defaultHotPlugVFIO, BlockDeviceAIO: defaultBlockDeviceAIO, DisableGuestSeLinux: defaultDisableGuestSeLinux, ColdPlugVFIO: defaultColdPlugVFIO, diff --git a/src/runtime/pkg/oci/utils_test.go b/src/runtime/pkg/oci/utils_test.go index 7b2c10f2f0..f3899f1b81 100644 --- a/src/runtime/pkg/oci/utils_test.go +++ b/src/runtime/pkg/oci/utils_test.go @@ -821,22 +821,22 @@ func TestRegexpContains(t *testing.T) { //nolint: govet type testData struct { - regexps []string toMatch string + regexps []string expected bool } data := []testData{ - {[]string{}, "", false}, - {[]string{}, "nonempty", false}, - {[]string{"simple"}, "simple", true}, - {[]string{"simple"}, "some_simple_text", true}, - {[]string{"simple"}, "simp", false}, - {[]string{"one", "two"}, "one", true}, - {[]string{"one", "two"}, "two", true}, - {[]string{"o*"}, "oooo", true}, - {[]string{"o*"}, "oooa", true}, - {[]string{"^o*$"}, "oooa", false}, + {regexps: []string{}, toMatch: "", expected: false}, + {regexps: []string{}, toMatch: "nonempty", expected: false}, + {regexps: []string{"simple"}, toMatch: "simple", expected: true}, + {regexps: []string{"simple"}, toMatch: "some_simple_text", expected: true}, + {regexps: []string{"simple"}, toMatch: "simp", expected: false}, + {regexps: []string{"one", "two"}, toMatch: "one", expected: true}, + {regexps: []string{"one", "two"}, toMatch: "two", expected: true}, + {regexps: []string{"o*"}, toMatch: "oooo", expected: true}, + {regexps: []string{"o*"}, toMatch: "oooa", expected: true}, + {regexps: []string{"^o*$"}, toMatch: "oooa", expected: false}, } 
for _, d := range data { @@ -850,25 +850,25 @@ func TestCheckPathIsInGlobs(t *testing.T) { //nolint: govet type testData struct { - globs []string toMatch string + globs []string expected bool } data := []testData{ - {[]string{}, "", false}, - {[]string{}, "nonempty", false}, - {[]string{"simple"}, "simple", false}, - {[]string{"simple"}, "some_simple_text", false}, - {[]string{"/bin/ls"}, "/bin/ls", true}, - {[]string{"/bin/ls", "/bin/false"}, "/bin/ls", true}, - {[]string{"/bin/ls", "/bin/false"}, "/bin/false", true}, - {[]string{"/bin/ls", "/bin/false"}, "/bin/bar", false}, - {[]string{"/bin/*ls*"}, "/bin/ls", true}, - {[]string{"/bin/*ls*"}, "/bin/false", true}, - {[]string{"bin/ls"}, "/bin/ls", false}, - {[]string{"./bin/ls"}, "/bin/ls", false}, - {[]string{"*/bin/ls"}, "/bin/ls", false}, + {globs: []string{}, toMatch: "", expected: false}, + {globs: []string{}, toMatch: "nonempty", expected: false}, + {globs: []string{"simple"}, toMatch: "simple", expected: false}, + {globs: []string{"simple"}, toMatch: "some_simple_text", expected: false}, + {globs: []string{"/bin/ls"}, toMatch: "/bin/ls", expected: true}, + {globs: []string{"/bin/ls", "/bin/false"}, toMatch: "/bin/ls", expected: true}, + {globs: []string{"/bin/ls", "/bin/false"}, toMatch: "/bin/false", expected: true}, + {globs: []string{"/bin/ls", "/bin/false"}, toMatch: "/bin/bar", expected: false}, + {globs: []string{"/bin/*ls*"}, toMatch: "/bin/ls", expected: true}, + {globs: []string{"/bin/*ls*"}, toMatch: "/bin/false", expected: true}, + {globs: []string{"bin/ls"}, toMatch: "/bin/ls", expected: false}, + {globs: []string{"./bin/ls"}, toMatch: "/bin/ls", expected: false}, + {globs: []string{"*/bin/ls"}, toMatch: "/bin/ls", expected: false}, } for _, d := range data { @@ -923,10 +923,10 @@ func TestParseAnnotationUintConfiguration(t *testing.T) { // nolint: govet testCases := []struct { - annotations map[string]string - expected uint64 err error + annotations map[string]string validFunc func(uint64) 
error + expected uint64 }{ { annotations: map[string]string{key: ""}, @@ -1007,10 +1007,10 @@ func TestParseAnnotationBoolConfiguration(t *testing.T) { // nolint: govet testCases := []struct { + err error annotationKey string annotationValueList []string expected bool - err error }{ { annotationKey: boolKey, @@ -1207,8 +1207,8 @@ func TestNewMount(t *testing.T) { assert := assert.New(t) testCases := []struct { - out vc.Mount in specs.Mount + out vc.Mount }{ { in: specs.Mount{ diff --git a/src/runtime/virtcontainers/acrn.go b/src/runtime/virtcontainers/acrn.go index 35c71a6d61..735b20019f 100644 --- a/src/runtime/virtcontainers/acrn.go +++ b/src/runtime/virtcontainers/acrn.go @@ -45,10 +45,10 @@ type AcrnState struct { // Acrn is an Hypervisor interface implementation for the Linux acrn hypervisor. type Acrn struct { - sandbox *Sandbox ctx context.Context arch acrnArch store persistapi.PersistDriver + sandbox *Sandbox id string acrnConfig Config config HypervisorConfig diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index 6ae99d6732..cec2e1c634 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -246,15 +246,15 @@ func (s *CloudHypervisorState) reset() { } type cloudHypervisor struct { + vmconfig chclient.VmConfig console console.Console virtiofsDaemon VirtiofsDaemon - APIClient clhClient ctx context.Context - id string + APIClient clhClient netDevices *[]chclient.NetConfig devicesIds map[string]string netDevicesFiles map[string][]*os.File - vmconfig chclient.VmConfig + id string state CloudHypervisorState config HypervisorConfig stopped int32 @@ -860,12 +860,12 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error { defer cancel() // Create the clh device config via the constructor to ensure default values are properly assigned - clhDevice := *chclient.NewDeviceConfig(*(*device).GetSysfsDev()) + clhDevice := *chclient.NewDeviceConfig(device.SysfsDev) pciInfo, _, err := 
cl.VmAddDevicePut(ctx, clhDevice) if err != nil { return fmt.Errorf("Failed to hotplug device %+v %s", device, openAPIClientError(err)) } - clh.devicesIds[*(*device).GetID()] = pciInfo.GetId() + clh.devicesIds[device.ID] = pciInfo.GetId() // clh doesn't use bridges, so the PCI path is simply the slot // number of the device. This will break if clh starts using @@ -882,14 +882,11 @@ func (clh *cloudHypervisor) hotPlugVFIODevice(device *config.VFIODev) error { return fmt.Errorf("Unexpected PCI address %q from clh hotplug", pciInfo.Bdf) } - guestPciPath, err := types.PciPathFromString(tokens[0]) - - pciDevice, ok := (*device).(config.VFIOPCIDev) - if !ok { + if device.Type == config.VFIOAPDeviceMediatedType { return fmt.Errorf("VFIO device %+v is not PCI, only PCI is supported in Cloud Hypervisor", device) } - pciDevice.GuestPciPath = guestPciPath - *device = pciDevice + + device.GuestPciPath, err = types.PciPathFromString(tokens[0]) return err } @@ -933,7 +930,7 @@ func (clh *cloudHypervisor) HotplugRemoveDevice(ctx context.Context, devInfo int case BlockDev: deviceID = clhDriveIndexToID(devInfo.(*config.BlockDrive).Index) case VfioDev: - deviceID = *devInfo.(config.VFIODev).GetID() + deviceID = devInfo.(*config.VFIODev).ID default: clh.Logger().WithFields(log.Fields{"devInfo": devInfo, "deviceType": devType}).Error("HotplugRemoveDevice: unsupported device") diff --git a/src/runtime/virtcontainers/clh_test.go b/src/runtime/virtcontainers/clh_test.go index b5c800e956..0ab7ef5b96 100644 --- a/src/runtime/virtcontainers/clh_test.go +++ b/src/runtime/virtcontainers/clh_test.go @@ -188,13 +188,13 @@ func TestCloudHypervisorAddNetCheckEnpointTypes(t *testing.T) { } // nolint: govet tests := []struct { - name string args args + name string wantErr bool }{ - {"TapEndpoint", args{e: &TapEndpoint{}}, true}, - {"Empty VethEndpoint", args{e: &VethEndpoint{}}, true}, - {"Valid VethEndpoint", args{e: validVeth}, false}, + {name: "TapEndpoint", args: args{e: &TapEndpoint{}}, 
wantErr: true}, + {name: "Empty VethEndpoint", args: args{e: &VethEndpoint{}}, wantErr: true}, + {name: "Valid VethEndpoint", args: args{e: validVeth}, wantErr: false}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -673,7 +673,7 @@ func TestCloudHypervisorHotplugRemoveDevice(t *testing.T) { _, err = clh.HotplugRemoveDevice(context.Background(), &config.BlockDrive{}, BlockDev) assert.NoError(err, "Hotplug remove block device expected no error") - _, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIOPCIDev{}, VfioDev) + _, err = clh.HotplugRemoveDevice(context.Background(), &config.VFIODev{}, VfioDev) assert.NoError(err, "Hotplug remove vfio block device expected no error") _, err = clh.HotplugRemoveDevice(context.Background(), nil, NetDev) diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index 0a490ef577..73dbc6656b 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -505,13 +505,24 @@ type HypervisorConfig struct { // MemOffset specifies memory space for nvdimm device MemOffset uint64 - // PCIeRootPort is used to indicate the number of PCIe Root Port devices - // The PCIe Root Port device is used to hot-plug the PCIe device - PCIeRootPort uint32 + // RawDevices are used to get PCIe device info early before the sandbox + // is started to make better PCIe topology decisions + RawDevices []config.DeviceInfo + + // HotPlugVFIO is used to indicate if devices need to be hotplugged on the + // root port or a switch + HotPlugVFIO hv.PCIePort // ColdPlugVFIO is used to indicate if devices need to be coldplugged on the // root port, switch or no port ColdPlugVFIO hv.PCIePort + // PCIeSwitchPort is used to indicate the number of PCIe Switch devices + // The PCIe Switch Port device is used to hot-plug PCIe devices + PCIeSwitchPort uint32 + + // PCIeRootPort is used to indicate the number of PCIe Root Port devices + // The PCIe Root Port device 
is used to hot-plug the PCIe device + PCIeRootPort uint32 // NumVCPUs specifies default number of vCPUs for the VM. NumVCPUs uint32 diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index 327f573436..ed2be337c9 100644 --- a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -284,23 +284,16 @@ type KataAgentState struct { // nolint: govet type kataAgent struct { - ctx context.Context - vmSocket interface{} - - client *kataclient.AgentClient - - // lock protects the client pointer - sync.Mutex - - state KataAgentState - + ctx context.Context + vmSocket interface{} + client *kataclient.AgentClient reqHandlers map[string]reqFunc + state KataAgentState kmodules []string - + sync.Mutex dialTimout uint32 - - keepConn bool - dead bool + keepConn bool + dead bool } func (k *kataAgent) Logger() *logrus.Entry { @@ -1137,7 +1130,7 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c * ContainerPath: dev.ContainerPath, Type: kataVfioPciDevType, Id: groupNum, - Options: nil, + Options: make([]string, len(devList)), } // We always pass the device information to the agent, since @@ -1147,16 +1140,14 @@ func (k *kataAgent) appendVfioDevice(dev ContainerDevice, device api.Device, c * if c.sandbox.config.VfioMode == config.VFIOModeGuestKernel { kataDevice.Type = kataVfioPciGuestKernelDevType } - - if (*devList[0]).GetType() == config.VFIOAPDeviceMediatedType { - kataDevice.Type = kataVfioApDevType - kataDevice.Options = (*devList[0]).(config.VFIOAPDev).APDevices - } else { - kataDevice.Options = make([]string, len(devList)) - for i, device := range devList { - pciDevice := (*device).(config.VFIOPCIDev) - kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", pciDevice.BDF, pciDevice.GuestPciPath) + for i, dev := range devList { + if dev.Type == config.VFIOAPDeviceMediatedType { + kataDevice.Type = kataVfioApDevType + kataDevice.Options = dev.APDevices + } else { + 
kataDevice.Options[i] = fmt.Sprintf("0000:%s=%s", dev.BDF, dev.GuestPciPath) } + } return kataDevice @@ -1342,7 +1333,6 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co if _, err = k.sendReq(ctx, req); err != nil { return nil, err } - return buildProcessFromExecID(req.ExecId) } diff --git a/src/runtime/virtcontainers/kata_agent_test.go b/src/runtime/virtcontainers/kata_agent_test.go index c7fa059dcb..8fba818376 100644 --- a/src/runtime/virtcontainers/kata_agent_test.go +++ b/src/runtime/virtcontainers/kata_agent_test.go @@ -235,10 +235,10 @@ func TestHandleDeviceBlockVolume(t *testing.T) { // nolint: govet tests := []struct { - BlockDeviceDriver string - inputMount Mount inputDev *drivers.BlockDevice resultVol *pb.Storage + BlockDeviceDriver string + inputMount Mount }{ { inputDev: &drivers.BlockDevice{ @@ -1024,10 +1024,10 @@ func TestKataAgentKernelParams(t *testing.T) { // nolint: govet type testData struct { + expectedParams []Param + containerPipeSize uint32 debug bool trace bool - containerPipeSize uint32 - expectedParams []Param } debugParam := Param{Key: "agent.log", Value: "debug"} @@ -1036,28 +1036,28 @@ func TestKataAgentKernelParams(t *testing.T) { containerPipeSizeParam := Param{Key: vcAnnotations.ContainerPipeSizeKernelParam, Value: "2097152"} data := []testData{ - {false, false, 0, []Param{}}, + {debug: false, trace: false, containerPipeSize: 0, expectedParams: []Param{}}, // Debug - {true, false, 0, []Param{debugParam}}, + {debug: true, trace: false, containerPipeSize: 0, expectedParams: []Param{debugParam}}, // Tracing - {false, true, 0, []Param{traceParam}}, + {debug: false, trace: true, containerPipeSize: 0, expectedParams: []Param{traceParam}}, // Debug + Tracing - {true, true, 0, []Param{debugParam, traceParam}}, + {debug: true, trace: true, containerPipeSize: 0, expectedParams: []Param{debugParam, traceParam}}, // pipesize - {false, false, 2097152, []Param{containerPipeSizeParam}}, + {debug: false, trace: 
false, containerPipeSize: 2097152, expectedParams: []Param{containerPipeSizeParam}}, // Debug + pipesize - {true, false, 2097152, []Param{debugParam, containerPipeSizeParam}}, + {debug: true, trace: false, containerPipeSize: 2097152, expectedParams: []Param{debugParam, containerPipeSizeParam}}, // Tracing + pipesize - {false, true, 2097152, []Param{traceParam, containerPipeSizeParam}}, + {debug: false, trace: true, containerPipeSize: 2097152, expectedParams: []Param{traceParam, containerPipeSizeParam}}, // Debug + Tracing + pipesize - {true, true, 2097152, []Param{debugParam, traceParam, containerPipeSizeParam}}, + {debug: true, trace: true, containerPipeSize: 2097152, expectedParams: []Param{debugParam, traceParam, containerPipeSizeParam}}, } for i, d := range data { diff --git a/src/runtime/virtcontainers/monitor.go b/src/runtime/virtcontainers/monitor.go index ae7843beae..75e06fc8f4 100644 --- a/src/runtime/virtcontainers/monitor.go +++ b/src/runtime/virtcontainers/monitor.go @@ -22,15 +22,12 @@ var monitorLog = virtLog.WithField("subsystem", "virtcontainers/monitor") // nolint: govet type monitor struct { - watchers []chan error - sandbox *Sandbox - - wg sync.WaitGroup - sync.Mutex - + sandbox *Sandbox stopCh chan bool + watchers []chan error checkInterval time.Duration - + wg sync.WaitGroup + sync.Mutex running bool } diff --git a/src/runtime/virtcontainers/mount.go b/src/runtime/virtcontainers/mount.go index 6c2e204208..acf4f05f6f 100644 --- a/src/runtime/virtcontainers/mount.go +++ b/src/runtime/virtcontainers/mount.go @@ -239,40 +239,32 @@ func evalMountPath(source, destination string) (string, string, error) { // Mount describes a container mount. // nolint: govet type Mount struct { + // FSGroup a group ID that the group ownership of the files for the mounted volume + // will need to be changed when set. + FSGroup *int // Source is the source of the mount. Source string // Destination is the destination of the mount (within the container). 
Destination string - - // Type specifies the type of filesystem to mount. - Type string - // HostPath used to store host side bind mount path HostPath string - // GuestDeviceMount represents the path within the VM that the device // is mounted. Only relevant for block devices. This is tracked in the event // runtime wants to query the agent for mount stats. GuestDeviceMount string - // BlockDeviceID represents block device that is attached to the // VM in case this mount is a block device file or a directory // backed by a block device. BlockDeviceID string - - // Options list all the mount options of the filesystem. - Options []string - - // ReadOnly specifies if the mount should be read only or not - ReadOnly bool - - // FSGroup a group ID that the group ownership of the files for the mounted volume - // will need to be changed when set. - FSGroup *int - + // Type specifies the type of filesystem to mount. + Type string // FSGroupChangePolicy specifies the policy that will be used when applying // group id ownership change for a volume. FSGroupChangePolicy volume.FSGroupChangePolicy + // Options list all the mount options of the filesystem. 
+ Options []string + // ReadOnly specifies if the mount should be read only or not + ReadOnly bool } func isSymlink(path string) bool { diff --git a/src/runtime/virtcontainers/nydusd_test.go b/src/runtime/virtcontainers/nydusd_test.go index a8ec6dc9b1..712113015c 100644 --- a/src/runtime/virtcontainers/nydusd_test.go +++ b/src/runtime/virtcontainers/nydusd_test.go @@ -57,13 +57,13 @@ func TestNydusdStart(t *testing.T) { // nolint: govet tests := []struct { - name string fields fields + name string wantErr bool }{ - {"empty config", fields{}, true}, - {"directory source path not exist", SourcePathNoExist, true}, - {"valid config", validConfig, false}, + {name: "empty config", fields: fields{}, wantErr: true}, + {name: "directory source path not exist", fields: SourcePathNoExist, wantErr: true}, + {name: "valid config", fields: validConfig, wantErr: false}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/src/runtime/virtcontainers/persist.go b/src/runtime/virtcontainers/persist.go index cbba44e603..574270c80c 100644 --- a/src/runtime/virtcontainers/persist.go +++ b/src/runtime/virtcontainers/persist.go @@ -246,6 +246,7 @@ func (s *Sandbox) dumpConfig(ss *persistapi.SandboxState) { DisableImageNvdimm: sconfig.HypervisorConfig.DisableImageNvdimm, HotplugVFIOOnRootBus: sconfig.HypervisorConfig.HotplugVFIOOnRootBus, PCIeRootPort: sconfig.HypervisorConfig.PCIeRootPort, + PCIeSwitchPort: sconfig.HypervisorConfig.PCIeSwitchPort, BootToBeTemplate: sconfig.HypervisorConfig.BootToBeTemplate, BootFromTemplate: sconfig.HypervisorConfig.BootFromTemplate, DisableVhostNet: sconfig.HypervisorConfig.DisableVhostNet, diff --git a/src/runtime/virtcontainers/persist/api/config.go b/src/runtime/virtcontainers/persist/api/config.go index e4facc6b98..6457478c58 100644 --- a/src/runtime/virtcontainers/persist/api/config.go +++ b/src/runtime/virtcontainers/persist/api/config.go @@ -135,6 +135,10 @@ type HypervisorConfig struct { // The PCIe Root Port device is 
used to hot-plug the PCIe device PCIeRootPort uint32 + // PCIeSwitchPort is used to indicate the number of PCIe Switch Downstream Port + // devices. The PCIe Switch Downstream Port is used to hot-plug the PCIe devices. + PCIeSwitchPort uint32 + // NumVCPUs specifies default number of vCPUs for the VM. NumVCPUs uint32 diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index d856490e5b..05476ae85e 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -75,13 +75,14 @@ type qmpChannel struct { // QemuState keeps Qemu's state type QemuState struct { - UUID string - Bridges []types.Bridge - // HotpluggedCPUs is the list of CPUs that were hot-added + UUID string + HotPlugVFIO hv.PCIePort + Bridges []types.Bridge HotpluggedVCPUs []hv.CPUDevice HotpluggedMemory int VirtiofsDaemonPid int PCIeRootPort int + PCIeSwitchPort int HotplugVFIOOnRootBus bool ColdPlugVFIO hv.PCIePort } @@ -286,6 +287,8 @@ func (q *qemu) setup(ctx context.Context, id string, hypervisorConfig *Hyperviso q.state.ColdPlugVFIO = q.config.ColdPlugVFIO q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus q.state.PCIeRootPort = int(q.config.PCIeRootPort) + q.state.PCIeSwitchPort = int(q.config.PCIeSwitchPort) + q.state.HotPlugVFIO = q.config.HotPlugVFIO // The path might already exist, but in case of VM templating, // we have to create it since the sandbox has not created it yet. @@ -701,25 +704,11 @@ func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervi } } - // Add PCIe Root Port devices to hypervisor - // The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged into PCIe Root Port. 
- // For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt - memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory() - - if hypervisorConfig.PCIeRootPort > 0 { - qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort, memSize32bit, memSize64bit) - } - - // The default OVMF MMIO aperture is too small for some PCIe devices - // with huge BARs so we need to increase it. - // memSize64bit is in bytes, convert to MB, OVMF expects MB as a string - if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") { - pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024)) - fwCfg := govmmQemu.FwCfg{ - Name: "opt/ovmf/X-PciMmio64Mb", - Str: pciMmio64Mb, + if machine.Type == QemuQ35 { + if err := q.createPCIeTopology(&qemuConfig, hypervisorConfig); err != nil { + q.Logger().WithError(err).Errorf("Cannot create PCIe topology") + return err } - qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg) } q.qemuConfig = qemuConfig @@ -747,6 +736,104 @@ func (q *qemu) checkBpfEnabled() { } } +// If a user uses 8 GPUs with 4 devices in each IOMMU Group that means we need +// to hotplug 32 devices. We do not have enough PCIe root bus slots to +// accomplish this task. Kata will use already some slots for vfio-xxxx-pci +// devices. +// Max PCI slots per root bus is 32 +// Max PCIe root ports is 16 +// Max PCIe switch ports is 16 +// There is only 64kB of IO memory each root,switch port will consume 4k hence +// only 16 ports possible. 
+func (q *qemu) createPCIeTopology(qemuConfig *govmmQemu.Config, hypervisorConfig *HypervisorConfig) error { + // We do not need to do anything if we want to hotplug a VFIO to a + // pcie-pci-bridge, just return + if hypervisorConfig.HotPlugVFIO == hv.BridgePort { + return nil + } + // Add PCIe Root Port or PCIe Switches to the hypervisor + // The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged + // into a PCIe Root Port or PCIe Switch. + // For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt + + // Deduce the right values for mem-reserve and pref-64-reserve memory regions + memSize32bit, memSize64bit := q.arch.getBARsMaxAddressableMemory() + + // The default OVMF MMIO aperture is too small for some PCIe devices + // with huge BARs so we need to increase it. + // memSize64bit is in bytes, convert to MB, OVMF expects MB as a string + if strings.Contains(strings.ToLower(hypervisorConfig.FirmwarePath), "ovmf") { + pciMmio64Mb := fmt.Sprintf("%d", (memSize64bit / 1024 / 1024)) + fwCfg := govmmQemu.FwCfg{ + Name: "opt/ovmf/X-PciMmio64Mb", + Str: pciMmio64Mb, + } + qemuConfig.FwCfg = append(qemuConfig.FwCfg, fwCfg) + } + + // Get the number of hotpluggable ports needed from the provided devices + var numOfHotPluggablePorts uint32 = 0 + for _, dev := range hypervisorConfig.RawDevices { + hostPath, _ := config.GetHostPath(dev, false, "") + if hostPath == "" { + continue + } + + vfioNumberOrContainer := filepath.Base(hostPath) + // If we want to have VFIO inside of the VM we need to passthrough + // additionally /dev/vfio/vfio which is the VFIO "container". + // Ignore it and handle the remaining devices. 
+ if strings.Compare(vfioNumberOrContainer, "vfio") == 0 { + continue + } + iommuDevicesPath := filepath.Join(config.SysIOMMUGroupPath, vfioNumberOrContainer, "devices") + deviceFiles, err := os.ReadDir(iommuDevicesPath) + if err != nil { + return err + } + + for _, deviceFile := range deviceFiles { + deviceBDF, _, _, err := drivers.GetVFIODetails(deviceFile.Name(), iommuDevicesPath) + if err != nil { + return err + } + if drivers.IsPCIeDevice(deviceBDF) { + numOfHotPluggablePorts = numOfHotPluggablePorts + 1 + } + } + } + + // If number of PCIe root ports > 16 then bail out otherwise we may + // use up all slots or IO memory on the root bus and vfio-XXX-pci devices + // cannot be added which are crucial for Kata max slots on root bus is 32 + // max slots on the complete pci(e) topology is 256 in QEMU + if hypervisorConfig.PCIeRootPort > maxPCIeRootPort { + return fmt.Errorf("Number of PCIe Root Ports exceeed allowed max of %d", maxPCIeRootPort) + } + if hypervisorConfig.PCIeSwitchPort > maxPCIeSwitchPort { + return fmt.Errorf("Number of PCIe Switch Ports exceeed allowed max of %d", maxPCIeSwitchPort) + } + + // If the user provided more root ports than we have detected + // use the user provided number of PCIe root ports + if numOfHotPluggablePorts < hypervisorConfig.PCIeRootPort { + numOfHotPluggablePorts = hypervisorConfig.PCIeRootPort + } + // If the user provided more switch ports than we have detected + // use the user provided number of PCIe switch ports + if numOfHotPluggablePorts < hypervisorConfig.PCIeSwitchPort { + numOfHotPluggablePorts = hypervisorConfig.PCIeSwitchPort + } + + if q.state.HotPlugVFIO == hv.RootPort || q.state.HotplugVFIOOnRootBus { + qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, numOfHotPluggablePorts, memSize32bit, memSize64bit) + } + if q.state.HotPlugVFIO == hv.SwitchPort { + qemuConfig.Devices = q.arch.appendPCIeSwitchPortDevice(qemuConfig.Devices, numOfHotPluggablePorts, memSize32bit, memSize64bit) + } +
return nil +} + func (q *qemu) vhostFSSocketPath(id string) (string, error) { return utils.BuildSocketPath(q.config.VMStorePath, id, vhostFSSocket) } @@ -1552,10 +1639,10 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V //Since the dev is the first and only one on this bus(root port), it should be 0. addr := "00" - bridgeId := fmt.Sprintf("%s%d", pcieRootPortPrefix, len(drivers.AllPCIeDevs)) + bridgeID := fmt.Sprintf("%s%d", pcieRootPortPrefix, len(drivers.AllPCIeDevs)) drivers.AllPCIeDevs[devID] = true - bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeId) + bridgeQomPath := fmt.Sprintf("%s%s", qomPathPrefix, bridgeID) bridgeSlot, err := q.qomGetSlot(bridgeQomPath) if err != nil { return err @@ -1571,7 +1658,7 @@ func (q *qemu) hotplugAddVhostUserBlkDevice(ctx context.Context, vAttr *config.V return err } - if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeId); err != nil { + if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridgeID); err != nil { return err } @@ -1685,41 +1772,122 @@ func (q *qemu) qomGetSlot(qomPath string) (types.PciSlot, error) { // Query QMP to find a device's PCI path given its QOM path or ID func (q *qemu) qomGetPciPath(qemuID string) (types.PciPath, error) { - // XXX: For now we assume there's exactly one bridge, since - // that's always how we configure qemu from Kata for now. 
It - // would be good to generalize this to different PCI - // topologies + + var slots []types.PciSlot + devSlot, err := q.qomGetSlot(qemuID) if err != nil { return types.PciPath{}, err } + slots = append(slots, devSlot) - busq, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, qemuID, "parent_bus") + var parentPath = qemuID + // We do not want to use a forever loop here, a deeper PCIe topology + // than 5 is already not advisable just for the sake of having enough + // buffer we limit ourselves to 10 and leave the loop early if we hit + // the root bus. + for i := 1; i <= 10; i++ { + parenBusQOM, err := q.qmpMonitorCh.qmp.ExecQomGet(q.qmpMonitorCh.ctx, parentPath, "parent_bus") + if err != nil { + return types.PciPath{}, err + } + + busQOM, ok := parenBusQOM.(string) + if !ok { + return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %T not a string", qemuID, parenBusQOM) + } + + // If we hit /machine/q35/pcie.0 we're done this is the root bus + // we climbed the complete hierarchy + if strings.Contains(busQOM, "/machine/q35/pcie.0") { + break + } + + // `bus` is the QOM path of the QOM bus object, but we need + // the PCI parent_bus which manages that bus. There doesn't seem + // to be a way to get that other than to simply drop the last + // path component. + idx := strings.LastIndex(busQOM, "/") + if idx == -1 { + return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", busQOM) + } + parentBus := busQOM[:idx] + + parentSlot, err := q.qomGetSlot(parentBus) + if err != nil { + return types.PciPath{}, err + } + + // Prepend the slots, since we're climbing the hierarchy + slots = append([]types.PciSlot{parentSlot}, slots...) + parentPath = parentBus + } + return types.PciPathFromSlots(slots...) +} + +func (q *qemu) hotplugVFIODeviceRootPort(ctx context.Context, device *config.VFIODev) (err error) { + + if device.IsPCIe && (q.state.PCIeRootPort <= 0) { + q.Logger().WithField("dev-id", device.ID).Warn("VFIO device is a PCIe device."
+ + " It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35") + return fmt.Errorf("VFIO device is a PCIe device. Hotplug (%v) only supported on PCIe Root (%d) or PCIe Switch Ports (%v)", + q.state.HotPlugVFIO, q.state.PCIeRootPort, q.state.PCIeSwitchPort) + } + + device.Bus = fmt.Sprintf("%s%d", pcieRootPortPrefix, device.Rank) + return q.executeVFIODeviceAdd(device) +} + +func (q *qemu) hotplugVFIODeviceSwitchPort(ctx context.Context, device *config.VFIODev) (err error) { + + if device.IsPCIe && (q.state.PCIeSwitchPort <= 0) { + q.Logger().WithField("dev-id", device.ID).Warn("VFIO device is a PCIe device." + + " It's recommended to add the PCIe Switch Port by setting the pcie_switch_port parameter in the configuration for q35") + return fmt.Errorf("VFIO device is a PCIe device. Hotplug (%v) only supported on PCIe Root (%d) or PCIe Switch Ports (%v)", + q.state.HotPlugVFIO, q.state.PCIeRootPort, q.state.PCIeSwitchPort) + } + device.Bus = fmt.Sprintf("%s%d", pcieSwitchDownstreamPortPrefix, device.Rank) + return q.executeVFIODeviceAdd(device) +} + +func (q *qemu) hotplugVFIODeviceBridgePort(ctx context.Context, device *config.VFIODev) (err error) { + addr, bridge, err := q.arch.addDeviceToBridge(ctx, device.ID, types.PCI) if err != nil { - return types.PciPath{}, err + return err } - bus, ok := busq.(string) - if !ok { - return types.PciPath{}, fmt.Errorf("parent_bus QOM property of %s is %t not a string", qemuID, busq) - } + defer func() { + if err != nil { + q.arch.removeDeviceFromBridge(device.ID) + } + }() + return q.executePCIVFIODeviceAdd(device, addr, bridge.ID) +} - // `bus` is the QOM path of the QOM bus object, but we need - // the PCI bridge which manages that bus. There doesn't seem - // to be a way to get that other than to simply drop the last - // path component.
- idx := strings.LastIndex(bus, "/") - if idx == -1 { - return types.PciPath{}, fmt.Errorf("Bus has unexpected QOM path %s", bus) +func (q *qemu) executePCIVFIODeviceAdd(device *config.VFIODev, addr string, bridgeID string) error { + switch device.Type { + case config.VFIOPCIDeviceNormalType: + return q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, addr, bridgeID, romFile) + case config.VFIOPCIDeviceMediatedType: + return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, addr, bridgeID, romFile) + case config.VFIOAPDeviceMediatedType: + return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + default: + return fmt.Errorf("Incorrect VFIO device type found") } - bridge := bus[:idx] +} - bridgeSlot, err := q.qomGetSlot(bridge) - if err != nil { - return types.PciPath{}, err +func (q *qemu) executeVFIODeviceAdd(device *config.VFIODev) error { + switch device.Type { + case config.VFIOPCIDeviceNormalType: + return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.BDF, device.Bus, romFile) + case config.VFIOPCIDeviceMediatedType: + return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.ID, device.SysfsDev, "", device.Bus, romFile) + case config.VFIOAPDeviceMediatedType: + return q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, device.SysfsDev) + default: + return fmt.Errorf("Incorrect VFIO device type found") } - - return types.PciPathFromSlots(bridgeSlot, devSlot) } func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op Operation) (err error) { @@ -1727,109 +1895,56 @@ func (q *qemu) hotplugVFIODevice(ctx context.Context, device *config.VFIODev, op return err } - devID := *(*device).GetID() - machineType := q.HypervisorConfig().HypervisorMachineType - if op == AddDevice { buf, _ := json.Marshal(device) q.Logger().WithFields(logrus.Fields{ - 
"machine-type": machineType, - "hotplug-vfio-on-root-bus": q.state.HotplugVFIOOnRootBus, - "pcie-root-port": q.state.PCIeRootPort, - "device-info": string(buf), + "machine-type": q.HypervisorConfig().HypervisorMachineType, + "hotplug-vfio": q.state.HotPlugVFIO, + "pcie-root-port": q.state.PCIeRootPort, + "pcie-switch-port": q.state.PCIeSwitchPort, + "device-info": string(buf), }).Info("Start hot-plug VFIO device") - - // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus - // for pc machine type instead of bridge. This is useful for devices that require - // a large PCI BAR which is a currently a limitation with PCI bridges. - if q.state.HotplugVFIOOnRootBus { - switch (*device).GetType() { - case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: - // In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port. - pciDevice, ok := (*device).(config.VFIOPCIDev) - if !ok { - return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device) - } - switch machineType { - case QemuQ35: - if pciDevice.IsPCIe && q.state.PCIeRootPort <= 0 { - q.Logger().WithField("dev-id", (*device).GetID()).Warn("VFIO device is a PCIe device. 
It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35") - pciDevice.Bus = "" - } - default: - pciDevice.Bus = "" - } - *device = pciDevice - - if pciDevice.Type == config.VFIOPCIDeviceNormalType { - err = q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, pciDevice.Bus, romFile) - } else { - err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), "", pciDevice.Bus, romFile) - } - case config.VFIOAPDeviceMediatedType: - err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev()) - } + // In case MachineType is q35, a PCIe device is hotplugged on + // a PCIe Root Port or alternatively on a PCIe Switch Port + if q.HypervisorConfig().HypervisorMachineType != QemuQ35 { + device.Bus = "" } else { - addr, bridge, err := q.arch.addDeviceToBridge(ctx, devID, types.PCI) + var err error + // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus + // for pc machine type instead of bridge. This is useful for devices that require + // a large PCI BAR which is a currently a limitation with PCI bridges. 
+ if q.state.HotPlugVFIO == hv.RootPort || q.state.HotplugVFIOOnRootBus { + err = q.hotplugVFIODeviceRootPort(ctx, device) + } else if q.state.HotPlugVFIO == hv.SwitchPort { + err = q.hotplugVFIODeviceSwitchPort(ctx, device) + } else { + err = q.hotplugVFIODeviceBridgePort(ctx, device) + } if err != nil { return err } - - defer func() { - if err != nil { - q.arch.removeDeviceFromBridge(devID) - } - }() - - switch (*device).GetType() { - case config.VFIOPCIDeviceNormalType: - pciDevice, ok := (*device).(config.VFIOPCIDev) - if !ok { - return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device) - } - err = q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, pciDevice.BDF, addr, bridge.ID, romFile) - case config.VFIOPCIDeviceMediatedType: - err = q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, *(*device).GetSysfsDev(), addr, bridge.ID, romFile) - case config.VFIOAPDeviceMediatedType: - err = q.qmpMonitorCh.qmp.ExecuteAPVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, *(*device).GetSysfsDev()) - default: - return fmt.Errorf("Incorrect VFIO device type found") - } - } - if err != nil { - return err - } - - switch (*device).GetType() { - case config.VFIOPCIDeviceNormalType, config.VFIOPCIDeviceMediatedType: - pciDevice, ok := (*device).(config.VFIOPCIDev) - if !ok { - return fmt.Errorf("VFIO device %+v is not PCI, but its Type said otherwise", device) - } - // XXX: Depending on whether we're doing root port or - // bridge hotplug, and how the bridge is set up in - // other parts of the code, we may or may not already - // have information about the slot number of the - // bridge and or the device. 
For simplicity, just - // query both of them back from qemu - guestPciPath, err := q.qomGetPciPath(devID) - pciDevice.GuestPciPath = guestPciPath - *device = pciDevice - return err } + // XXX: Depending on whether we're doing root port or + // bridge hotplug, and how the bridge is set up in + // other parts of the code, we may or may not already + // have information about the slot number of the + // bridge and or the device. For simplicity, just + // query both of them back from qemu + device.GuestPciPath, err = q.qomGetPciPath(device.ID) return err - } else { - q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device") - - if !q.state.HotplugVFIOOnRootBus { - if err := q.arch.removeDeviceFromBridge(devID); err != nil { - return err - } - } - - return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID) } + + q.Logger().WithField("dev-id", device.ID).Info("Start hot-unplug VFIO device") + + if !q.state.HotplugVFIOOnRootBus && q.state.HotPlugVFIO != hv.RootPort && q.state.HotPlugVFIO != hv.SwitchPort { + if err := q.arch.removeDeviceFromBridge(device.ID); err != nil { + return err + } + } + + return q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, device.ID) + } func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error { @@ -2541,6 +2656,79 @@ func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machin return devices } +// golangci-lint enforces multi-line comments to be a block comment +// not multiple single line comments ...
+/* pcie.0 bus +// ------------------------------------------------- +// | +// ------------- +// | Root Port | +// ------------- +// -------------------------|------------------------ +// | ----------------- | +// | PCI Express | Upstream Port | | +// | Switch ----------------- | +// | | | | +// | ------------------- ------------------- | +// | | Downstream Port | | Downstream Port | | +// | ------------------- ------------------- | +// -------------|-----------------------|------------ +// ------------- -------------- +// | GPU/ACCEL | | IB/ETH NIC | +// ------------- -------------- +*/ +// genericAppendPCIeSwitchPort adds a PCIe Switch +func genericAppendPCIeSwitchPort(devices []govmmQemu.Device, number uint32, machineType string, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device { + + // Q35 has the correct PCIe support, + // hence ignore all other machines + if machineType != QemuQ35 { + return devices + } + + // Using an own ID for the root port, so we do not clash with already + // existing root ports adding "sw" for switch prefix + pcieRootPort := govmmQemu.PCIeRootPortDevice{ + ID: fmt.Sprintf("%s%s%d", pcieSwitchPrefix, pcieRootPortPrefix, 0), + Bus: defaultBridgeBus, + Chassis: "0", + Slot: strconv.FormatUint(uint64(0), 10), + Multifunction: false, + Addr: "0", + MemReserve: fmt.Sprintf("%dB", memSize32bit), + Pref64Reserve: fmt.Sprintf("%dB", memSize64bit), + } + + devices = append(devices, pcieRootPort) + + pcieSwitchUpstreamPort := govmmQemu.PCIeSwitchUpstreamPortDevice{ + ID: fmt.Sprintf("%s%d", pcieSwitchUpstreamPortPrefix, 0), + Bus: pcieRootPort.ID, + } + devices = append(devices, pcieSwitchUpstreamPort) + + currentChassis, err := strconv.Atoi(pcieRootPort.Chassis) + if err != nil { + return devices + } + nextChassis := currentChassis + 1 + + for i := uint32(0); i < number; i++ { + + pcieSwitchDownstreamPort := govmmQemu.PCIeSwitchDownstreamPortDevice{ + ID: fmt.Sprintf("%s%d", pcieSwitchDownstreamPortPrefix, i), + Bus:
pcieSwitchUpstreamPort.ID, + Chassis: fmt.Sprintf("%d", nextChassis), + Slot: strconv.FormatUint(uint64(i), 10), + // TODO: MemReserve: fmt.Sprintf("%dB", memSize32bit), + // TODO: Pref64Reserve: fmt.Sprintf("%dB", memSize64bit), + } + devices = append(devices, pcieSwitchDownstreamPort) + } + + return devices +} + func (q *qemu) GetThreadIDs(ctx context.Context) (VcpuThreadIDs, error) { span, _ := katatrace.Trace(ctx, q.Logger(), "GetThreadIDs", qemuTracingTags, map[string]string{"sandbox_id": q.id}) defer span.End() @@ -2717,6 +2905,7 @@ func (q *qemu) Save() (s hv.HypervisorState) { s.HotpluggedMemory = q.state.HotpluggedMemory s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus s.PCIeRootPort = q.state.PCIeRootPort + s.PCIeSwitchPort = q.state.PCIeSwitchPort for _, bridge := range q.arch.getBridges() { s.Bridges = append(s.Bridges, hv.Bridge{ @@ -2741,6 +2930,7 @@ func (q *qemu) Load(s hv.HypervisorState) { q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus q.state.VirtiofsDaemonPid = s.VirtiofsDaemonPid q.state.PCIeRootPort = s.PCIeRootPort + q.state.PCIeSwitchPort = s.PCIeSwitchPort for _, bridge := range s.Bridges { q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr)) diff --git a/src/runtime/virtcontainers/qemu_amd64.go b/src/runtime/virtcontainers/qemu_amd64.go index 4ecf0804a4..cb99ec1ace 100644 --- a/src/runtime/virtcontainers/qemu_amd64.go +++ b/src/runtime/virtcontainers/qemu_amd64.go @@ -123,7 +123,7 @@ func newQemuArch(config HypervisorConfig) (qemuArch, error) { legacySerial: config.LegacySerial, }, vmFactory: factory, - snpGuest: config.SevSnpGuest, + snpGuest: config.ConfidentialGuest, } if config.ConfidentialGuest { diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go index 9aff1e76c2..9a288d85a7 100644 --- a/src/runtime/virtcontainers/qemu_arch_base.go +++ b/src/runtime/virtcontainers/qemu_arch_base.go @@ -140,6 
+140,9 @@ type qemuArch interface { // appendPCIeRootPortDevice appends a pcie-root-port device to pcie.0 bus appendPCIeRootPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device + // appendPCIeSwitch appends a ioh3420 device to a pcie-root-port + appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device + // append vIOMMU device appendIOMMU(devices []govmmQemu.Device) ([]govmmQemu.Device, error) @@ -177,13 +180,18 @@ type qemuArchBase struct { } const ( - defaultCores uint32 = 1 - defaultThreads uint32 = 1 - defaultCPUModel = "host" - defaultBridgeBus = "pcie.0" - defaultPCBridgeBus = "pci.0" - maxDevIDSize = 31 - pcieRootPortPrefix = "rp" + defaultCores uint32 = 1 + defaultThreads uint32 = 1 + defaultCPUModel = "host" + defaultBridgeBus = "pcie.0" + defaultPCBridgeBus = "pci.0" + maxDevIDSize = 31 + maxPCIeRootPort = 16 // Limitation from QEMU + maxPCIeSwitchPort = 16 // Limitation from QEMU + pcieRootPortPrefix = "rp" + pcieSwitchPrefix = "sw" + pcieSwitchUpstreamPortPrefix = "swup" + pcieSwitchDownstreamPortPrefix = "swdp" ) // This is the PCI start address assigned to the first bridge that @@ -675,17 +683,17 @@ func (q *qemuArchBase) appendVhostUserDevice(ctx context.Context, devices []govm } func (q *qemuArchBase) appendVFIODevice(devices []govmmQemu.Device, vfioDev config.VFIODev) []govmmQemu.Device { - pciDevice := vfioDev.(config.VFIOPCIDev) - if pciDevice.BDF == "" { + + if vfioDev.BDF == "" { return devices } devices = append(devices, govmmQemu.VFIODevice{ - BDF: pciDevice.BDF, - VendorID: pciDevice.VendorID, - DeviceID: pciDevice.DeviceID, - Bus: pciDevice.Bus, + BDF: vfioDev.BDF, + VendorID: vfioDev.VendorID, + DeviceID: vfioDev.DeviceID, + Bus: vfioDev.Bus, }, ) @@ -801,6 +809,13 @@ func (q *qemuArchBase) appendPCIeRootPortDevice(devices []govmmQemu.Device, numb return genericAppendPCIeRootPort(devices, number, 
q.qemuMachine.Type, memSize32bit, memSize64bit) } +// appendPCIeSwitchPortDevice appends a PCIe Switch with ports +func (q *qemuArchBase) appendPCIeSwitchPortDevice(devices []govmmQemu.Device, number uint32, memSize32bit uint64, memSize64bit uint64) []govmmQemu.Device { + return genericAppendPCIeSwitchPort(devices, number, q.qemuMachine.Type, memSize32bit, memSize64bit) +} + +// getBARsMaxAddressableMemory we need to know the BAR sizes to configure the +// PCIe Root Port or PCIe Downstream Port attaching a device with huge BARs. func (q *qemuArchBase) getBARsMaxAddressableMemory() (uint64, uint64) { pci := nvpci.New() diff --git a/src/runtime/virtcontainers/qemu_arch_base_test.go b/src/runtime/virtcontainers/qemu_arch_base_test.go index 51c11bd91d..37611bb5bb 100644 --- a/src/runtime/virtcontainers/qemu_arch_base_test.go +++ b/src/runtime/virtcontainers/qemu_arch_base_test.go @@ -463,7 +463,7 @@ func TestQemuArchBaseAppendVFIODevice(t *testing.T) { }, } - vfDevice := config.VFIOPCIDev{ + vfDevice := config.VFIODev{ BDF: bdf, } @@ -483,7 +483,7 @@ func TestQemuArchBaseAppendVFIODeviceWithVendorDeviceID(t *testing.T) { }, } - vfDevice := config.VFIOPCIDev{ + vfDevice := config.VFIODev{ BDF: bdf, VendorID: vendorID, DeviceID: deviceID, diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 9bd5f402eb..bda931b435 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -101,15 +101,10 @@ type HypervisorPidKey struct{} // SandboxStatus describes a sandbox status. type SandboxStatus struct { - ContainersStatus []ContainerStatus - - // Annotations allow clients to store arbitrary values, - // for example to add additional status values required - // to support particular specifications. 
- Annotations map[string]string - + Annotations map[string]string ID string Hypervisor HypervisorType + ContainersStatus []ContainerStatus State types.SandboxState HypervisorConfig HypervisorConfig } @@ -176,10 +171,8 @@ type SandboxConfig struct { // SharePidNs sets all containers to share the same sandbox level pid namespace. SharePidNs bool - // SystemdCgroup enables systemd cgroup support SystemdCgroup bool - // SandboxCgroupOnly enables cgroup only at podlevel in the host SandboxCgroupOnly bool @@ -620,6 +613,12 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor if err := validateHypervisorConfig(&sandboxConfig.HypervisorConfig); err != nil { return nil, err } + // Aggregate all the container devices and update the HV config + var devices []config.DeviceInfo + for _, ct := range sandboxConfig.Containers { + devices = append(devices, ct.DeviceInfos...) + } + sandboxConfig.HypervisorConfig.RawDevices = devices // If we have a confidential guest we need to cold-plug the PCIe VFIO devices // until we have TDISP/IDE PCIe support. 
@@ -1709,7 +1708,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error { defer span.End() for i := range s.config.Containers { - c, err := newContainer(ctx, s, &s.config.Containers[i]) if err != nil { return err @@ -1728,7 +1726,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error { if err := s.updateResources(ctx); err != nil { return err } - if err := s.resourceControllerUpdate(ctx); err != nil { return err } @@ -1740,7 +1737,6 @@ func (s *Sandbox) createContainers(ctx context.Context) error { if err := s.storeSandbox(ctx); err != nil { return err } - return nil } @@ -1904,15 +1900,11 @@ func (s *Sandbox) HotplugAddDevice(ctx context.Context, device api.Device, devTy // adding a group of VFIO devices for _, dev := range vfioDevices { if _, err := s.hypervisor.HotplugAddDevice(ctx, dev, VfioDev); err != nil { - bdf := "" - if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok { - bdf = pciDevice.BDF - } s.Logger(). WithFields(logrus.Fields{ "sandbox": s.id, - "vfio-device-ID": (*dev).GetID(), - "vfio-device-BDF": bdf, + "vfio-device-ID": dev.ID, + "vfio-device-BDF": dev.BDF, }).WithError(err).Error("failed to hotplug VFIO device") return err } @@ -1961,15 +1953,11 @@ func (s *Sandbox) HotplugRemoveDevice(ctx context.Context, device api.Device, de // remove a group of VFIO devices for _, dev := range vfioDevices { if _, err := s.hypervisor.HotplugRemoveDevice(ctx, dev, VfioDev); err != nil { - bdf := "" - if pciDevice, ok := (*dev).(config.VFIOPCIDev); ok { - bdf = pciDevice.BDF - } s.Logger().WithError(err). 
WithFields(logrus.Fields{ "sandbox": s.id, - "vfio-device-ID": (*dev).GetID(), - "vfio-device-BDF": bdf, + "vfio-device-ID": dev.ID, + "vfio-device-BDF": dev.BDF, }).Error("failed to hot unplug VFIO device") return err } diff --git a/src/runtime/virtcontainers/sandbox_test.go b/src/runtime/virtcontainers/sandbox_test.go index de3b1885c6..db1aa7f84e 100644 --- a/src/runtime/virtcontainers/sandbox_test.go +++ b/src/runtime/virtcontainers/sandbox_test.go @@ -593,11 +593,11 @@ func TestSandboxAttachDevicesVFIO(t *testing.T) { _, err = os.Create(deviceFile) assert.Nil(t, err) - savedIOMMUPath := config.SysIOMMUPath - config.SysIOMMUPath = tmpDir + savedIOMMUPath := config.SysIOMMUGroupPath + config.SysIOMMUGroupPath = tmpDir defer func() { - config.SysIOMMUPath = savedIOMMUPath + config.SysIOMMUGroupPath = savedIOMMUPath }() dm := manager.NewDeviceManager(config.VirtioSCSI, false, "", 0, nil) @@ -650,8 +650,8 @@ func TestSandboxCreateAssets(t *testing.T) { // nolint: govet type testData struct { - assetType types.AssetType annotations map[string]string + assetType types.AssetType } tmpfile, err := os.CreateTemp("", "virtcontainers-test-") @@ -687,50 +687,50 @@ func TestSandboxCreateAssets(t *testing.T) { data := []testData{ { - types.FirmwareAsset, - map[string]string{ + assetType: types.FirmwareAsset, + annotations: map[string]string{ annotations.FirmwarePath: filename, annotations.FirmwareHash: assetContentHash, }, }, { - types.HypervisorAsset, - map[string]string{ + assetType: types.HypervisorAsset, + annotations: map[string]string{ annotations.HypervisorPath: filename, annotations.HypervisorHash: assetContentHash, }, }, { - types.HypervisorCtlAsset, - map[string]string{ + assetType: types.HypervisorCtlAsset, + annotations: map[string]string{ annotations.HypervisorCtlPath: filename, annotations.HypervisorCtlHash: assetContentHash, }, }, { - types.ImageAsset, - map[string]string{ + assetType: types.ImageAsset, + annotations: map[string]string{ annotations.ImagePath: 
filename, annotations.ImageHash: assetContentHash, }, }, { - types.InitrdAsset, - map[string]string{ + assetType: types.InitrdAsset, + annotations: map[string]string{ annotations.InitrdPath: filename, annotations.InitrdHash: assetContentHash, }, }, { - types.JailerAsset, - map[string]string{ + assetType: types.JailerAsset, + annotations: map[string]string{ annotations.JailerPath: filename, annotations.JailerHash: assetContentHash, }, }, { - types.KernelAsset, - map[string]string{ + assetType: types.KernelAsset, + annotations: map[string]string{ annotations.KernelPath: filename, annotations.KernelHash: assetContentHash, }, @@ -1407,58 +1407,58 @@ func TestSandbox_Cgroups(t *testing.T) { // nolint: govet tests := []struct { - name string s *Sandbox + name string wantErr bool needRoot bool }{ { - "New sandbox", - &Sandbox{}, - true, - false, + name: "New sandbox", + s: &Sandbox{}, + wantErr: true, + needRoot: false, }, { - "New sandbox, new config", - &Sandbox{config: &SandboxConfig{}}, - false, - true, + name: "New sandbox, new config", + s: &Sandbox{config: &SandboxConfig{}}, + wantErr: false, + needRoot: true, }, { - "sandbox, container no sandbox type", - &Sandbox{ + name: "sandbox, container no sandbox type", + s: &Sandbox{ config: &SandboxConfig{Containers: []ContainerConfig{ {}, }}}, - false, - true, + wantErr: false, + needRoot: true, }, { - "sandbox, container sandbox type", - &Sandbox{ + name: "sandbox, container sandbox type", + s: &Sandbox{ config: &SandboxConfig{Containers: []ContainerConfig{ sandboxContainer, }}}, - false, - true, + wantErr: false, + needRoot: true, }, { - "sandbox, empty linux json", - &Sandbox{ + name: "sandbox, empty linux json", + s: &Sandbox{ config: &SandboxConfig{Containers: []ContainerConfig{ emptyJSONLinux, }}}, - false, - true, + wantErr: false, + needRoot: true, }, { - "sandbox, successful config", - &Sandbox{ + name: "sandbox, successful config", + s: &Sandbox{ config: &SandboxConfig{Containers: []ContainerConfig{ 
successfulContainer, }}}, - false, - true, + wantErr: false, + needRoot: true, }, } for _, tt := range tests { diff --git a/src/runtime/virtcontainers/veth_endpoint_test.go b/src/runtime/virtcontainers/veth_endpoint_test.go index dc6f03270a..341185b006 100644 --- a/src/runtime/virtcontainers/veth_endpoint_test.go +++ b/src/runtime/virtcontainers/veth_endpoint_test.go @@ -85,16 +85,16 @@ func TestCreateVethNetworkEndpointChooseIfaceName(t *testing.T) { func TestCreateVethNetworkEndpointInvalidArgs(t *testing.T) { // nolint: govet type endpointValues struct { - idx int ifName string + idx int } assert := assert.New(t) // all elements are expected to result in failure failingValues := []endpointValues{ - {-1, "bar"}, - {-1, ""}, + {idx: -1, ifName: "bar"}, + {idx: -1, ifName: ""}, } for _, d := range failingValues { diff --git a/src/runtime/virtcontainers/virtiofsd_test.go b/src/runtime/virtcontainers/virtiofsd_test.go index a4252a2ba9..55eb3fb1a3 100644 --- a/src/runtime/virtcontainers/virtiofsd_test.go +++ b/src/runtime/virtcontainers/virtiofsd_test.go @@ -17,13 +17,13 @@ import ( func TestVirtiofsdStart(t *testing.T) { // nolint: govet type fields struct { - path string + ctx context.Context socketPath string cache string - extraArgs []string sourcePath string + path string + extraArgs []string PID int - ctx context.Context } sourcePath := t.TempDir() @@ -41,13 +41,13 @@ func TestVirtiofsdStart(t *testing.T) { // nolint: govet tests := []struct { - name string fields fields + name string wantErr bool }{ - {"empty config", fields{}, true}, - {"Directory socket does not exist", NoDirectorySocket, true}, - {"valid config", validConfig, false}, + {name: "empty config", fields: fields{}, wantErr: true}, + {name: "Directory socket does not exist", fields: NoDirectorySocket, wantErr: true}, + {name: "valid config", fields: validConfig, wantErr: false}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git 
a/tools/osbuilder/image-builder/image_builder.sh b/tools/osbuilder/image-builder/image_builder.sh index 3e7f0babc0..f88f3aac04 100755 --- a/tools/osbuilder/image-builder/image_builder.sh +++ b/tools/osbuilder/image-builder/image_builder.sh @@ -271,7 +271,7 @@ calculate_required_disk_size() { readonly image="$(mktemp)" readonly mount_dir="$(mktemp -d)" readonly max_tries=20 - readonly increment=10 + readonly increment=100 for i in $(seq 1 $max_tries); do local img_size="$((rootfs_size_mb + (i * increment)))" diff --git a/tools/packaging/scripts/configure-hypervisor.sh b/tools/packaging/scripts/configure-hypervisor.sh index 8bdfc94de1..9dfc9bb321 100755 --- a/tools/packaging/scripts/configure-hypervisor.sh +++ b/tools/packaging/scripts/configure-hypervisor.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env bash # # Copyright (c) 2018 Intel Corporation # @@ -14,6 +14,8 @@ # been specified. #--------------------------------------------------------------------- +set -x + script_name=${0##*/} arch="${3:-$(uname -m)}"