mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-07-05 19:47:53 +00:00
qemu: Add CDI device handling for any container type
We need special handling for pod_sandbox, pod_container and single_container how and when to inject CDI devices Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
This commit is contained in:
parent
c7b41361b2
commit
4c93bb2d61
@ -19,11 +19,11 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
|
||||||
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
|
||||||
taskAPI "github.com/containerd/containerd/api/runtime/task/v2"
|
taskAPI "github.com/containerd/containerd/api/runtime/task/v2"
|
||||||
containerd_types "github.com/containerd/containerd/api/types"
|
containerd_types "github.com/containerd/containerd/api/types"
|
||||||
"github.com/containerd/containerd/mount"
|
"github.com/containerd/containerd/mount"
|
||||||
"github.com/containerd/typeurl/v2"
|
"github.com/containerd/typeurl/v2"
|
||||||
|
"github.com/kata-containers/kata-containers/src/runtime/pkg/device/config"
|
||||||
"github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
|
"github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
|
||||||
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
|
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
|
||||||
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
|
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/annotations"
|
||||||
@ -74,63 +74,6 @@ func copyLayersToMounts(rootFs *vc.RootFs, spec *specs.Spec) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// CDI (Container Device Interface), is a specification, for container- runtimes,
|
|
||||||
// to support third-party devices.
|
|
||||||
// It introduces an abstract notion of a device as a resource. Such devices are
|
|
||||||
// uniquely specified by a fully-qualified name that is constructed from a
|
|
||||||
// vendor ID, a device class, and a name that is unique per vendor ID-device
|
|
||||||
// class pair.
|
|
||||||
//
|
|
||||||
// vendor.com/class=unique_name
|
|
||||||
//
|
|
||||||
// The combination of vendor ID and device class (vendor.com/class in the
|
|
||||||
// above example) is referred to as the device kind.
|
|
||||||
// CDI concerns itself only with enabling containers to be device aware.
|
|
||||||
// Areas like resource management are explicitly left out of CDI (and are
|
|
||||||
// expected to be handled by the orchestrator). Because of this focus, the CDI
|
|
||||||
// specification is simple to implement and allows great flexibility for
|
|
||||||
// runtimes and orchestrators.
|
|
||||||
func withCDI(annotations map[string]string, cdiSpecDirs []string, spec *specs.Spec) (*specs.Spec, error) {
|
|
||||||
// Add devices from CDI annotations
|
|
||||||
_, devsFromAnnotations, err := cdi.ParseAnnotations(annotations)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to parse CDI device annotations: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(devsFromAnnotations) == 0 {
|
|
||||||
// No devices found, skip device injection
|
|
||||||
return spec, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
var registry cdi.Registry
|
|
||||||
if len(cdiSpecDirs) > 0 {
|
|
||||||
// We can override the directories where to search for CDI specs
|
|
||||||
// if needed, the default is /etc/cdi /var/run/cdi
|
|
||||||
registry = cdi.GetRegistry(cdi.WithSpecDirs(cdiSpecDirs...))
|
|
||||||
} else {
|
|
||||||
registry = cdi.GetRegistry()
|
|
||||||
}
|
|
||||||
|
|
||||||
if err = registry.Refresh(); err != nil {
|
|
||||||
// We don't consider registry refresh failure a fatal error.
|
|
||||||
// For instance, a dynamically generated invalid CDI Spec file for
|
|
||||||
// any particular vendor shouldn't prevent injection of devices of
|
|
||||||
// different vendors. CDI itself knows better and it will fail the
|
|
||||||
// injection if necessary.
|
|
||||||
return nil, fmt.Errorf("CDI registry refresh failed: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if _, err := registry.InjectDevices(spec, devsFromAnnotations...); err != nil {
|
|
||||||
return nil, fmt.Errorf("CDI device injection failed: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// One crucial thing to keep in mind is that CDI device injection
|
|
||||||
// might add OCI Spec environment variables, hooks, and mounts as
|
|
||||||
// well. Therefore it is important that none of the corresponding
|
|
||||||
// OCI Spec fields are reset up in the call stack once we return.
|
|
||||||
return spec, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func create(ctx context.Context, s *service, r *taskAPI.CreateTaskRequest) (*container, error) {
|
func create(ctx context.Context, s *service, r *taskAPI.CreateTaskRequest) (*container, error) {
|
||||||
rootFs := vc.RootFs{}
|
rootFs := vc.RootFs{}
|
||||||
if len(r.Rootfs) == 1 {
|
if len(r.Rootfs) == 1 {
|
||||||
@ -175,9 +118,13 @@ func create(ctx context.Context, s *service, r *taskAPI.CreateTaskRequest) (*con
|
|||||||
//
|
//
|
||||||
// _, err = withCDI(ociSpec.Annotations, []string{"/opt/cdi"}, ociSpec)
|
// _, err = withCDI(ociSpec.Annotations, []string{"/opt/cdi"}, ociSpec)
|
||||||
//
|
//
|
||||||
_, err = withCDI(ociSpec.Annotations, []string{}, ociSpec)
|
// Only inject CDI devices if single_container we do not want
|
||||||
if err != nil {
|
// CDI devices in the pod_sandbox
|
||||||
return nil, fmt.Errorf("adding CDI devices failed")
|
if containerType == vc.SingleContainer {
|
||||||
|
_, err = config.WithCDI(ociSpec.Annotations, []string{}, ociSpec)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("adding CDI devices failed")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
s.config = runtimeConfig
|
s.config = runtimeConfig
|
||||||
|
@ -13,8 +13,10 @@ import (
|
|||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
||||||
"github.com/go-ini/ini"
|
"github.com/go-ini/ini"
|
||||||
vcTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
|
vcTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
|
||||||
|
"github.com/opencontainers/runtime-spec/specs-go"
|
||||||
"golang.org/x/sys/unix"
|
"golang.org/x/sys/unix"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -642,3 +644,60 @@ type DeviceState struct {
|
|||||||
// or hot plugged (false).
|
// or hot plugged (false).
|
||||||
ColdPlug bool
|
ColdPlug bool
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CDI (Container Device Interface), is a specification, for container- runtimes,
|
||||||
|
// to support third-party devices.
|
||||||
|
// It introduces an abstract notion of a device as a resource. Such devices are
|
||||||
|
// uniquely specified by a fully-qualified name that is constructed from a
|
||||||
|
// vendor ID, a device class, and a name that is unique per vendor ID-device
|
||||||
|
// class pair.
|
||||||
|
//
|
||||||
|
// vendor.com/class=unique_name
|
||||||
|
//
|
||||||
|
// The combination of vendor ID and device class (vendor.com/class in the
|
||||||
|
// above example) is referred to as the device kind.
|
||||||
|
// CDI concerns itself only with enabling containers to be device aware.
|
||||||
|
// Areas like resource management are explicitly left out of CDI (and are
|
||||||
|
// expected to be handled by the orchestrator). Because of this focus, the CDI
|
||||||
|
// specification is simple to implement and allows great flexibility for
|
||||||
|
// runtimes and orchestrators.
|
||||||
|
func WithCDI(annotations map[string]string, cdiSpecDirs []string, spec *specs.Spec) (*specs.Spec, error) {
|
||||||
|
// Add devices from CDI annotations
|
||||||
|
_, devsFromAnnotations, err := cdi.ParseAnnotations(annotations)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to parse CDI device annotations: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(devsFromAnnotations) == 0 {
|
||||||
|
// No devices found, skip device injection
|
||||||
|
return spec, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var registry cdi.Registry
|
||||||
|
if len(cdiSpecDirs) > 0 {
|
||||||
|
// We can override the directories where to search for CDI specs
|
||||||
|
// if needed, the default is /etc/cdi /var/run/cdi
|
||||||
|
registry = cdi.GetRegistry(cdi.WithSpecDirs(cdiSpecDirs...))
|
||||||
|
} else {
|
||||||
|
registry = cdi.GetRegistry()
|
||||||
|
}
|
||||||
|
|
||||||
|
if err = registry.Refresh(); err != nil {
|
||||||
|
// We don't consider registry refresh failure a fatal error.
|
||||||
|
// For instance, a dynamically generated invalid CDI Spec file for
|
||||||
|
// any particular vendor shouldn't prevent injection of devices of
|
||||||
|
// different vendors. CDI itself knows better and it will fail the
|
||||||
|
// injection if necessary.
|
||||||
|
return nil, fmt.Errorf("CDI registry refresh failed: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, err := registry.InjectDevices(spec, devsFromAnnotations...); err != nil {
|
||||||
|
return nil, fmt.Errorf("CDI device injection failed: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// One crucial thing to keep in mind is that CDI device injection
|
||||||
|
// might add OCI Spec environment variables, hooks, and mounts as
|
||||||
|
// well. Therefore it is important that none of the corresponding
|
||||||
|
// OCI Spec fields are reset up in the call stack once we return.
|
||||||
|
return spec, nil
|
||||||
|
}
|
||||||
|
@ -57,7 +57,7 @@ const (
|
|||||||
|
|
||||||
// the maximum amount of PCI bridges that can be cold plugged in a VM
|
// the maximum amount of PCI bridges that can be cold plugged in a VM
|
||||||
maxPCIBridges uint32 = 5
|
maxPCIBridges uint32 = 5
|
||||||
// For mor info why these values, see:
|
// For more info on why these values were chosen, see:
|
||||||
// https://github.com/kata-containers/kata-containers/blob/main/docs/design/kata-vra.md#hypervisor-resource-limits
|
// https://github.com/kata-containers/kata-containers/blob/main/docs/design/kata-vra.md#hypervisor-resource-limits
|
||||||
maxPCIeRootPorts uint32 = 16
|
maxPCIeRootPorts uint32 = 16
|
||||||
maxPCIeSwitchPorts uint32 = 16
|
maxPCIeSwitchPorts uint32 = 16
|
||||||
|
@ -837,9 +837,9 @@ func (c *Container) createDevices(contConfig *ContainerConfig) error {
|
|||||||
// Aggregate all the containner devices for hot-plug and use them to dedcue
|
// Aggregate all the containner devices for hot-plug and use them to dedcue
|
||||||
// the correct amount of ports to reserve for the hypervisor.
|
// the correct amount of ports to reserve for the hypervisor.
|
||||||
hotPlugVFIO := (c.sandbox.config.HypervisorConfig.HotPlugVFIO != config.NoPort)
|
hotPlugVFIO := (c.sandbox.config.HypervisorConfig.HotPlugVFIO != config.NoPort)
|
||||||
modeIsVFIO := (c.sandbox.config.VfioMode == config.VFIOModeVFIO)
|
|
||||||
|
|
||||||
updatedDeviceInfos := []config.DeviceInfo{}
|
hotPlugDevices := []config.DeviceInfo{}
|
||||||
|
coldPlugDevices := []config.DeviceInfo{}
|
||||||
|
|
||||||
for i, vfio := range deviceInfos {
|
for i, vfio := range deviceInfos {
|
||||||
// Only considering VFIO updates for Port and ColdPlug or
|
// Only considering VFIO updates for Port and ColdPlug or
|
||||||
@ -848,28 +848,30 @@ func (c *Container) createDevices(contConfig *ContainerConfig) error {
|
|||||||
if hotPlugVFIO && isVFIODevice {
|
if hotPlugVFIO && isVFIODevice {
|
||||||
deviceInfos[i].ColdPlug = false
|
deviceInfos[i].ColdPlug = false
|
||||||
deviceInfos[i].Port = c.sandbox.config.HypervisorConfig.HotPlugVFIO
|
deviceInfos[i].Port = c.sandbox.config.HypervisorConfig.HotPlugVFIO
|
||||||
|
hotPlugDevices = append(hotPlugDevices, deviceInfos[i])
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
// Device is already cold-plugged at sandbox creation time
|
// Device is already cold-plugged at sandbox creation time
|
||||||
// ignore it for the container creation
|
// ignore it for the container creation
|
||||||
if coldPlugVFIO && isVFIODevice {
|
if coldPlugVFIO && isVFIODevice {
|
||||||
|
coldPlugDevices = append(coldPlugDevices, deviceInfos[i])
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
hotPlugDevices = append(hotPlugDevices, deviceInfos[i])
|
||||||
updatedDeviceInfos = append(updatedDeviceInfos, deviceInfos[i])
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// If modeVFIO is enabled we need 1st to attach the VFIO control group
|
// If modeVFIO is enabled we need 1st to attach the VFIO control group
|
||||||
// device /dev/vfio/vfio an 2nd the actuall device(s) afterwards.
|
// device /dev/vfio/vfio an 2nd the actuall device(s) afterwards.
|
||||||
// Sort the devices starting with device #1 being the VFIO control group
|
// Sort the devices starting with device #1 being the VFIO control group
|
||||||
// device and the next the actuall device(s) /dev/vfio/<group>
|
// device and the next the actuall device(s) /dev/vfio/<group>
|
||||||
if modeIsVFIO {
|
deviceInfos = sortContainerVFIODevices(hotPlugDevices)
|
||||||
deviceInfos = sortContainerVFIODevices(updatedDeviceInfos)
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, info := range deviceInfos {
|
for _, info := range deviceInfos {
|
||||||
dev, err := c.sandbox.devManager.NewDevice(info)
|
dev, err := c.sandbox.devManager.NewDevice(info)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
storedDevices = append(storedDevices, ContainerDevice{
|
storedDevices = append(storedDevices, ContainerDevice{
|
||||||
ID: dev.DeviceID(),
|
ID: dev.DeviceID(),
|
||||||
ContainerPath: info.ContainerPath,
|
ContainerPath: info.ContainerPath,
|
||||||
@ -879,6 +881,11 @@ func (c *Container) createDevices(contConfig *ContainerConfig) error {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
c.devices = filterDevices(c, storedDevices)
|
c.devices = filterDevices(c, storedDevices)
|
||||||
|
|
||||||
|
// If we're hot-plugging this will be a no-op because at this stage
|
||||||
|
// no devices are attached to the root-port or switch-port
|
||||||
|
c.annotateContainerWithVFIOMetadata(coldPlugDevices)
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -936,24 +943,24 @@ func sortContainerVFIODevices(devices []config.DeviceInfo) []config.DeviceInfo {
|
|||||||
return vfioDevices
|
return vfioDevices
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type DeviceRelation struct {
|
||||||
|
Bus string
|
||||||
|
Path string
|
||||||
|
Index int
|
||||||
|
}
|
||||||
|
|
||||||
// Depending on the HW we might need to inject metadata into the container
|
// Depending on the HW we might need to inject metadata into the container
|
||||||
// In this case for the NV GPU we need to provide the correct mapping from
|
// In this case for the NV GPU we need to provide the correct mapping from
|
||||||
// VFIO-<NUM> to GPU index inside of the VM when vfio_mode="guest-kernel",
|
// VFIO-<NUM> to GPU index inside of the VM when vfio_mode="guest-kernel",
|
||||||
// otherwise we do not know which GPU is which.
|
// otherwise we do not know which GPU is which.
|
||||||
func (c *Container) annotateContainerWithVFIOMetadata() {
|
func (c *Container) annotateContainerWithVFIOMetadata(devices interface{}) {
|
||||||
|
|
||||||
type relation struct {
|
|
||||||
Bus string
|
|
||||||
Path string
|
|
||||||
Index int
|
|
||||||
}
|
|
||||||
|
|
||||||
modeIsGK := (c.sandbox.config.VfioMode == config.VFIOModeGuestKernel)
|
modeIsGK := (c.sandbox.config.VfioMode == config.VFIOModeGuestKernel)
|
||||||
|
|
||||||
if modeIsGK {
|
if modeIsGK {
|
||||||
// Hot plug is done let's update meta information about the
|
// Hot plug is done let's update meta information about the
|
||||||
// hot plugged devices especially VFIO devices in modeIsGK
|
// hot plugged devices especially VFIO devices in modeIsGK
|
||||||
siblings := make([]relation, 0)
|
siblings := make([]DeviceRelation, 0)
|
||||||
// In the sandbox we first create the root-ports and secondly
|
// In the sandbox we first create the root-ports and secondly
|
||||||
// the switch-ports. The range over map is not deterministic
|
// the switch-ports. The range over map is not deterministic
|
||||||
// so lets first iterate over all root-port devices and then
|
// so lets first iterate over all root-port devices and then
|
||||||
@ -961,13 +968,13 @@ func (c *Container) annotateContainerWithVFIOMetadata() {
|
|||||||
for _, dev := range config.PCIeDevicesPerPort["root-port"] {
|
for _, dev := range config.PCIeDevicesPerPort["root-port"] {
|
||||||
// For the NV GPU we need special handling let's use only those
|
// For the NV GPU we need special handling let's use only those
|
||||||
if dev.VendorID == "0x10de" && strings.Contains(dev.Class, "0x030") {
|
if dev.VendorID == "0x10de" && strings.Contains(dev.Class, "0x030") {
|
||||||
siblings = append(siblings, relation{Bus: dev.Bus, Path: dev.HostPath})
|
siblings = append(siblings, DeviceRelation{Bus: dev.Bus, Path: dev.HostPath})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for _, dev := range config.PCIeDevicesPerPort["switch-port"] {
|
for _, dev := range config.PCIeDevicesPerPort["switch-port"] {
|
||||||
// For the NV GPU we need special handling let's use only those
|
// For the NV GPU we need special handling let's use only those
|
||||||
if dev.VendorID == "0x10de" && strings.Contains(dev.Class, "0x030") {
|
if dev.VendorID == "0x10de" && strings.Contains(dev.Class, "0x030") {
|
||||||
siblings = append(siblings, relation{Bus: dev.Bus, Path: dev.HostPath})
|
siblings = append(siblings, DeviceRelation{Bus: dev.Bus, Path: dev.HostPath})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// We need to sort the VFIO devices by bus to get the correct
|
// We need to sort the VFIO devices by bus to get the correct
|
||||||
@ -979,20 +986,33 @@ func (c *Container) annotateContainerWithVFIOMetadata() {
|
|||||||
for i := range siblings {
|
for i := range siblings {
|
||||||
siblings[i].Index = i
|
siblings[i].Index = i
|
||||||
}
|
}
|
||||||
|
|
||||||
// Now that we have the index lets connect the /dev/vfio/<num>
|
// Now that we have the index lets connect the /dev/vfio/<num>
|
||||||
// to the correct index
|
// to the correct index
|
||||||
for _, dev := range c.devices {
|
if devices, ok := devices.([]ContainerDevice); ok {
|
||||||
for _, bdf := range siblings {
|
for _, dev := range devices {
|
||||||
if bdf.Path == dev.ContainerPath {
|
c.siblingAnnotation(dev.ContainerPath, siblings)
|
||||||
vfioNum := filepath.Base(dev.ContainerPath)
|
|
||||||
annoKey := fmt.Sprintf("cdi.k8s.io/vfio%s", vfioNum)
|
|
||||||
annoValue := fmt.Sprintf("nvidia.com/gpu=%d", bdf.Index)
|
|
||||||
c.config.CustomSpec.Annotations[annoKey] = annoValue
|
|
||||||
c.Logger().Infof("Annotated container with %s: %s", annoKey, annoValue)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if devices, ok := devices.([]config.DeviceInfo); ok {
|
||||||
|
for _, dev := range devices {
|
||||||
|
c.siblingAnnotation(dev.ContainerPath, siblings)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
func (c *Container) siblingAnnotation(devPath string, siblings []DeviceRelation) {
|
||||||
|
for _, sibling := range siblings {
|
||||||
|
if sibling.Path == devPath {
|
||||||
|
vfioNum := filepath.Base(devPath)
|
||||||
|
annoKey := fmt.Sprintf("cdi.k8s.io/vfio%s", vfioNum)
|
||||||
|
annoValue := fmt.Sprintf("nvidia.com/gpu=%d", sibling.Index)
|
||||||
|
c.config.CustomSpec.Annotations[annoKey] = annoValue
|
||||||
|
c.Logger().Infof("annotated container with %s: %s", annoKey, annoValue)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1022,7 +1042,7 @@ func (c *Container) create(ctx context.Context) (err error) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
c.annotateContainerWithVFIOMetadata()
|
c.annotateContainerWithVFIOMetadata(c.devices)
|
||||||
|
|
||||||
// Deduce additional system mount info that should be handled by the agent
|
// Deduce additional system mount info that should be handled by the agent
|
||||||
// inside the VM
|
// inside the VM
|
||||||
|
@ -672,64 +672,6 @@ func newSandbox(ctx context.Context, sandboxConfig SandboxConfig, factory Factor
|
|||||||
return s, nil
|
return s, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Sandbox) coldOrHotPlugVFIO(sandboxConfig *SandboxConfig) (bool, error) {
|
|
||||||
// If we have a confidential guest we need to cold-plug the PCIe VFIO devices
|
|
||||||
// until we have TDISP/IDE PCIe support.
|
|
||||||
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort)
|
|
||||||
// Aggregate all the containner devices for hot-plug and use them to dedcue
|
|
||||||
// the correct amount of ports to reserve for the hypervisor.
|
|
||||||
hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort)
|
|
||||||
|
|
||||||
modeIsGK := (sandboxConfig.VfioMode == config.VFIOModeGuestKernel)
|
|
||||||
// modeIsVFIO is needed at the container level not the sandbox level.
|
|
||||||
// modeIsVFIO := (sandboxConfig.VfioMode == config.VFIOModeVFIO)
|
|
||||||
|
|
||||||
var vfioDevices []config.DeviceInfo
|
|
||||||
// vhost-user-block device is a PCIe device in Virt, keep track of it
|
|
||||||
// for correct number of PCIe root ports.
|
|
||||||
var vhostUserBlkDevices []config.DeviceInfo
|
|
||||||
|
|
||||||
for cnt, containers := range sandboxConfig.Containers {
|
|
||||||
for dev, device := range containers.DeviceInfos {
|
|
||||||
|
|
||||||
if deviceManager.IsVhostUserBlk(device) {
|
|
||||||
vhostUserBlkDevices = append(vhostUserBlkDevices, device)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
isVFIODevice := deviceManager.IsVFIODevice(device.ContainerPath)
|
|
||||||
if hotPlugVFIO && isVFIODevice {
|
|
||||||
device.ColdPlug = false
|
|
||||||
device.Port = sandboxConfig.HypervisorConfig.HotPlugVFIO
|
|
||||||
vfioDevices = append(vfioDevices, device)
|
|
||||||
sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO
|
|
||||||
}
|
|
||||||
if coldPlugVFIO && isVFIODevice {
|
|
||||||
device.ColdPlug = true
|
|
||||||
device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO
|
|
||||||
vfioDevices = append(vfioDevices, device)
|
|
||||||
// We need to remove the devices marked for cold-plug
|
|
||||||
// otherwise at the container level the kata-agent
|
|
||||||
// will try to hot-plug them.
|
|
||||||
if modeIsGK {
|
|
||||||
sandboxConfig.Containers[cnt].DeviceInfos[dev].ID = "remove-we-are-cold-plugging"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
var filteredDevices []config.DeviceInfo
|
|
||||||
for _, device := range containers.DeviceInfos {
|
|
||||||
if device.ID != "remove-we-are-cold-plugging" {
|
|
||||||
filteredDevices = append(filteredDevices, device)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
sandboxConfig.Containers[cnt].DeviceInfos = filteredDevices
|
|
||||||
}
|
|
||||||
|
|
||||||
sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices
|
|
||||||
sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices
|
|
||||||
|
|
||||||
return coldPlugVFIO, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func setHypervisorConfigAnnotations(sandboxConfig *SandboxConfig) {
|
func setHypervisorConfigAnnotations(sandboxConfig *SandboxConfig) {
|
||||||
if len(sandboxConfig.Containers) > 0 {
|
if len(sandboxConfig.Containers) > 0 {
|
||||||
// These values are required by remote hypervisor
|
// These values are required by remote hypervisor
|
||||||
@ -747,6 +689,125 @@ func setHypervisorConfigAnnotations(sandboxConfig *SandboxConfig) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *Sandbox) coldOrHotPlugVFIO(sandboxConfig *SandboxConfig) (bool, error) {
|
||||||
|
// If we have a confidential guest we need to cold-plug the PCIe VFIO devices
|
||||||
|
// until we have TDISP/IDE PCIe support.
|
||||||
|
coldPlugVFIO := (sandboxConfig.HypervisorConfig.ColdPlugVFIO != config.NoPort)
|
||||||
|
// Aggregate all the containner devices for hot-plug and use them to dedcue
|
||||||
|
// the correct amount of ports to reserve for the hypervisor.
|
||||||
|
hotPlugVFIO := (sandboxConfig.HypervisorConfig.HotPlugVFIO != config.NoPort)
|
||||||
|
|
||||||
|
//modeIsGK := (sandboxConfig.VfioMode == config.VFIOModeGuestKernel)
|
||||||
|
// modeIsVFIO is needed at the container level not the sandbox level.
|
||||||
|
// modeIsVFIO := (sandboxConfig.VfioMode == config.VFIOModeVFIO)
|
||||||
|
|
||||||
|
var vfioDevices []config.DeviceInfo
|
||||||
|
// vhost-user-block device is a PCIe device in Virt, keep track of it
|
||||||
|
// for correct number of PCIe root ports.
|
||||||
|
var vhostUserBlkDevices []config.DeviceInfo
|
||||||
|
|
||||||
|
//io.katacontainers.pkg.oci.container_type:pod_sandbox
|
||||||
|
|
||||||
|
for cnt, container := range sandboxConfig.Containers {
|
||||||
|
// Do not alter the original spec, we do not want to inject
|
||||||
|
// CDI devices into the sandbox container, were using the CDI
|
||||||
|
// devices as additional information to determine the number of
|
||||||
|
// PCIe root ports to reserve for the hypervisor.
|
||||||
|
// A single_container type will have the CDI devices injected
|
||||||
|
// only do this if we're a pod_sandbox type.
|
||||||
|
if container.Annotations["io.katacontainers.pkg.oci.container_type"] == "pod_sandbox" && container.CustomSpec != nil {
|
||||||
|
cdiSpec := container.CustomSpec
|
||||||
|
// We can provide additional directories where to search for
|
||||||
|
// CDI specs if needed. immutable OS's only have specific
|
||||||
|
// directories where applications can write too. For instance /opt/cdi
|
||||||
|
//
|
||||||
|
// _, err = withCDI(ociSpec.Annotations, []string{"/opt/cdi"}, ociSpec)
|
||||||
|
//
|
||||||
|
_, err := config.WithCDI(cdiSpec.Annotations, []string{}, cdiSpec)
|
||||||
|
if err != nil {
|
||||||
|
return coldPlugVFIO, fmt.Errorf("adding CDI devices failed")
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, dev := range cdiSpec.Linux.Devices {
|
||||||
|
isVFIODevice := deviceManager.IsVFIODevice(dev.Path)
|
||||||
|
if hotPlugVFIO && isVFIODevice {
|
||||||
|
vfioDev := config.DeviceInfo{
|
||||||
|
ColdPlug: true,
|
||||||
|
ContainerPath: dev.Path,
|
||||||
|
Port: sandboxConfig.HypervisorConfig.HotPlugVFIO,
|
||||||
|
DevType: dev.Type,
|
||||||
|
Major: dev.Major,
|
||||||
|
Minor: dev.Minor,
|
||||||
|
}
|
||||||
|
if dev.FileMode != nil {
|
||||||
|
vfioDev.FileMode = *dev.FileMode
|
||||||
|
}
|
||||||
|
if dev.UID != nil {
|
||||||
|
vfioDev.UID = *dev.UID
|
||||||
|
}
|
||||||
|
if dev.GID != nil {
|
||||||
|
vfioDev.GID = *dev.GID
|
||||||
|
}
|
||||||
|
|
||||||
|
vfioDevices = append(vfioDevices, vfioDev)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if coldPlugVFIO && isVFIODevice {
|
||||||
|
vfioDev := config.DeviceInfo{
|
||||||
|
ColdPlug: true,
|
||||||
|
ContainerPath: dev.Path,
|
||||||
|
Port: sandboxConfig.HypervisorConfig.ColdPlugVFIO,
|
||||||
|
DevType: dev.Type,
|
||||||
|
Major: dev.Major,
|
||||||
|
Minor: dev.Minor,
|
||||||
|
}
|
||||||
|
if dev.FileMode != nil {
|
||||||
|
vfioDev.FileMode = *dev.FileMode
|
||||||
|
}
|
||||||
|
if dev.UID != nil {
|
||||||
|
vfioDev.UID = *dev.UID
|
||||||
|
}
|
||||||
|
if dev.GID != nil {
|
||||||
|
vfioDev.GID = *dev.GID
|
||||||
|
}
|
||||||
|
|
||||||
|
vfioDevices = append(vfioDevices, vfioDev)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// As stated before the single_container will have the CDI
|
||||||
|
// devices injected by the runtime. For the pod_container use-case
|
||||||
|
// see container.go how cold and hot-plug are handled.
|
||||||
|
for dev, device := range container.DeviceInfos {
|
||||||
|
if deviceManager.IsVhostUserBlk(device) {
|
||||||
|
vhostUserBlkDevices = append(vhostUserBlkDevices, device)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
isVFIODevice := deviceManager.IsVFIODevice(device.ContainerPath)
|
||||||
|
if hotPlugVFIO && isVFIODevice {
|
||||||
|
device.ColdPlug = false
|
||||||
|
device.Port = sandboxConfig.HypervisorConfig.HotPlugVFIO
|
||||||
|
vfioDevices = append(vfioDevices, device)
|
||||||
|
sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.HotPlugVFIO
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if coldPlugVFIO && isVFIODevice {
|
||||||
|
device.ColdPlug = true
|
||||||
|
device.Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO
|
||||||
|
vfioDevices = append(vfioDevices, device)
|
||||||
|
sandboxConfig.Containers[cnt].DeviceInfos[dev].Port = sandboxConfig.HypervisorConfig.ColdPlugVFIO
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sandboxConfig.HypervisorConfig.VFIODevices = vfioDevices
|
||||||
|
sandboxConfig.HypervisorConfig.VhostUserBlkDevices = vhostUserBlkDevices
|
||||||
|
|
||||||
|
return coldPlugVFIO, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (s *Sandbox) createResourceController() error {
|
func (s *Sandbox) createResourceController() error {
|
||||||
var err error
|
var err error
|
||||||
cgroupPath := ""
|
cgroupPath := ""
|
||||||
|
Loading…
Reference in New Issue
Block a user