diff --git a/src/runtime/virtcontainers/endpoint.go b/src/runtime/virtcontainers/endpoint.go index ef50a8eb7..d65cc8cf6 100644 --- a/src/runtime/virtcontainers/endpoint.go +++ b/src/runtime/virtcontainers/endpoint.go @@ -65,6 +65,13 @@ const ( // IPVlanEndpointType is ipvlan network interface. IPVlanEndpointType EndpointType = "ipvlan" + + // VfioEndpointType is a VFIO device that will be claimed as a network interface + // in the guest VM. Unlike PhysicalEndpointType, which requires a VF network interface + // with its network configured on the host before creating the sandbox, VfioEndpointType + // does not need a host network interface and instead has its network configured + // through DAN. + VfioEndpointType EndpointType = "vfio" ) // Set sets an endpoint type based on the input string. @@ -94,6 +101,9 @@ func (endpointType *EndpointType) Set(value string) error { case "ipvlan": *endpointType = IPVlanEndpointType return nil + case "vfio": + *endpointType = VfioEndpointType + return nil default: return fmt.Errorf("Unknown endpoint type %s", value) } @@ -118,6 +128,8 @@ func (endpointType *EndpointType) String() string { return string(TuntapEndpointType) case IPVlanEndpointType: return string(IPVlanEndpointType) + case VfioEndpointType: + return string(VfioEndpointType) default: return "" } diff --git a/src/runtime/virtcontainers/endpoint_test.go b/src/runtime/virtcontainers/endpoint_test.go index 36d1294be..c3949f6fd 100644 --- a/src/runtime/virtcontainers/endpoint_test.go +++ b/src/runtime/virtcontainers/endpoint_test.go @@ -42,6 +42,10 @@ func TestMacvtapEndpointTypeSet(t *testing.T) { testEndpointTypeSet(t, "macvtap", MacvtapEndpointType) } +func TestVfioEndpointTypeSet(t *testing.T) { + testEndpointTypeSet(t, "vfio", VfioEndpointType) +} + func TestEndpointTypeSetFailure(t *testing.T) { var endpointType EndpointType diff --git a/src/runtime/virtcontainers/kata_agent.go b/src/runtime/virtcontainers/kata_agent.go index 6f60dc997..9a794392b 100644 --- 
a/src/runtime/virtcontainers/kata_agent.go +++ b/src/runtime/virtcontainers/kata_agent.go @@ -809,20 +809,10 @@ func (k *kataAgent) startSandbox(ctx context.Context, sandbox *Sandbox) error { if sandbox.config.HypervisorType != RemoteHypervisor { // Setup network interfaces and routes - interfaces, routes, neighs, err := generateVCNetworkStructures(ctx, sandbox.network) + err = k.setupNetworks(ctx, sandbox, nil) if err != nil { return err } - if err = k.updateInterfaces(ctx, interfaces); err != nil { - return err - } - if _, err = k.updateRoutes(ctx, routes); err != nil { - return err - } - if err = k.addARPNeighbors(ctx, neighs); err != nil { - return err - } - kmodules = setupKernelModules(k.kmodules) } @@ -1282,6 +1272,67 @@ func (k *kataAgent) rollbackFailingContainerCreation(ctx context.Context, c *Con } } +func (k *kataAgent) setupNetworks(ctx context.Context, sandbox *Sandbox, c *Container) error { + if sandbox.network.NetworkID() == "" { + return nil + } + + var err error + var endpoints []Endpoint + if c == nil || c.id == sandbox.id { + // TODO: VFIO network device has not been hotplugged when creating the Sandbox, + // so need to skip VFIO endpoint here. + // After KEP #4113(https://github.com/kubernetes/enhancements/pull/4113) + // is implemented, the VFIO network devices will be attached before container + // creation, so no need to skip them here anymore. + for _, ep := range sandbox.network.Endpoints() { + if ep.Type() != VfioEndpointType { + endpoints = append(endpoints, ep) + } + } + } else if !sandbox.hotplugNetworkConfigApplied { + // Apply VFIO network devices' configuration after they are hot-plugged. 
+ for _, ep := range sandbox.network.Endpoints() { + if ep.Type() == VfioEndpointType { + hostBDF := ep.(*VfioEndpoint).HostBDF + pciPath := sandbox.GetVfioDeviceGuestPciPath(hostBDF) + if pciPath.IsNil() { + return fmt.Errorf("PCI path for VFIO interface '%s' not found", ep.Name()) + } + ep.SetPciPath(pciPath) + endpoints = append(endpoints, ep) + } + } + + defer func() { + if err == nil { + sandbox.hotplugNetworkConfigApplied = true + } + }() + } + + if len(endpoints) == 0 { + return nil + } + + interfaces, routes, neighs, err := generateVCNetworkStructures(ctx, endpoints) + if err != nil { + return err + } + + if err = k.updateInterfaces(ctx, interfaces); err != nil { + return err + } + if _, err = k.updateRoutes(ctx, routes); err != nil { + return err + } + if err = k.addARPNeighbors(ctx, neighs); err != nil { + return err + } + + return nil +} + func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Container) (p *Process, err error) { span, ctx := katatrace.Trace(ctx, k.Logger(), "createContainer", kataAgentTracingTags) defer span.End() @@ -1429,6 +1480,11 @@ func (k *kataAgent) createContainer(ctx context.Context, sandbox *Sandbox, c *Co } return nil, err } + + if err = k.setupNetworks(ctx, sandbox, c); err != nil { + return nil, err + } + return buildProcessFromExecID(req.ExecId) } diff --git a/src/runtime/virtcontainers/network.go b/src/runtime/virtcontainers/network.go index 15b021a51..42b456cc6 100644 --- a/src/runtime/virtcontainers/network.go +++ b/src/runtime/virtcontainers/network.go @@ -233,10 +233,8 @@ type Network interface { GetEndpointsNum() (int, error) } -func generateVCNetworkStructures(ctx context.Context, network Network) ([]*pbTypes.Interface, []*pbTypes.Route, []*pbTypes.ARPNeighbor, error) { - if network.NetworkID() == "" { - return nil, nil, nil, nil - } +func generateVCNetworkStructures(ctx context.Context, endpoints []Endpoint) ([]*pbTypes.Interface, []*pbTypes.Route, []*pbTypes.ARPNeighbor, error) { + span, _ 
:= networkTrace(ctx, "generateVCNetworkStructures", nil) defer span.End() @@ -244,7 +242,7 @@ func generateVCNetworkStructures(ctx context.Context, network Network) ([]*pbTyp var ifaces []*pbTypes.Interface var neighs []*pbTypes.ARPNeighbor - for _, endpoint := range network.Endpoints() { + for _, endpoint := range endpoints { var ipAddresses []*pbTypes.IPAddress for _, addr := range endpoint.Properties().Addrs { // Skip localhost interface @@ -270,6 +268,7 @@ func generateVCNetworkStructures(ctx context.Context, network Network) ([]*pbTyp Device: endpoint.Name(), Name: endpoint.Name(), Mtu: uint64(endpoint.Properties().Iface.MTU), + Type: string(endpoint.Type()), RawFlags: noarp, HwAddr: endpoint.HardwareAddr(), PciPath: endpoint.PciPath().String(), diff --git a/src/runtime/virtcontainers/network_linux.go b/src/runtime/virtcontainers/network_linux.go index e5d152a50..3f4b419d2 100644 --- a/src/runtime/virtcontainers/network_linux.go +++ b/src/runtime/virtcontainers/network_linux.go @@ -442,6 +442,10 @@ func convertDanDeviceToNetworkInfo(device *vctypes.DanDevice) (*NetworkInfo, err // Load network config in DAN config // Create the endpoints for the interfaces in Dan. 
func (n *LinuxNetwork) addDanEndpoints() error { + if len(n.eps) > 0 { + // only load DAN config once + return nil + } jsonData, err := os.ReadFile(n.danConfigPath) if err != nil { @@ -458,20 +462,21 @@ func (n *LinuxNetwork) addDanEndpoints() error { var endpoint Endpoint networkLogger().WithField("interface", device.Name).Info("DAN interface found") - _, err := convertDanDeviceToNetworkInfo(&device) + netInfo, err := convertDanDeviceToNetworkInfo(&device) if err != nil { return err } - // TODO: Add endpoints that are supported via DAN switch device.Device.Type { + case vctypes.VfioDanDeviceType: + endpoint, err = createVfioEndpoint(device.Device.PciDeviceID, netInfo) + if err != nil { + return err + } default: return fmt.Errorf("unknown DAN device type: '%s'", device.Device.Type) } - // TODO: remove below `nolink` directive after one `case` is added for - // above `switch` block. - //nolint: govet n.eps = append(n.eps, endpoint) } diff --git a/src/runtime/virtcontainers/network_linux_test.go b/src/runtime/virtcontainers/network_linux_test.go index d959f3f28..4ab36e04e 100644 --- a/src/runtime/virtcontainers/network_linux_test.go +++ b/src/runtime/virtcontainers/network_linux_test.go @@ -81,7 +81,7 @@ func TestGenerateInterfacesAndRoutes(t *testing.T) { assert.Nil(t, err) nns.SetEndpoints(endpoints) - resInterfaces, resRoutes, resNeighs, err := generateVCNetworkStructures(context.Background(), nns) + resInterfaces, resRoutes, resNeighs, err := generateVCNetworkStructures(context.Background(), nns.Endpoints()) // // Build expected results: @@ -371,7 +371,13 @@ func TestAddEndpoints_Dan(t *testing.T) { } ctx := context.TODO() - _, err := network.AddEndpoints(ctx, nil, nil, true) - // TODO: this will be updated after adding supported DAN device - assert.ErrorContains(t, err, "unknown DAN device type") + eps, err := network.AddEndpoints(ctx, nil, nil, true) + assert.NoError(t, err) + assert.Len(t, eps, 1) + + ep := eps[0] + assert.Equal(t, "eth0", ep.Name()) + 
assert.Equal(t, "0a:58:0a:0a:00:05", ep.HardwareAddr()) + assert.Equal(t, VfioEndpointType, ep.Type()) + assert.Equal(t, "", ep.PciPath().String()) } diff --git a/src/runtime/virtcontainers/persist/api/network.go b/src/runtime/virtcontainers/persist/api/network.go index a642fa578..256ce807b 100644 --- a/src/runtime/virtcontainers/persist/api/network.go +++ b/src/runtime/virtcontainers/persist/api/network.go @@ -79,6 +79,10 @@ type VhostUserEndpoint struct { PCIPath vcTypes.PciPath } +type VfioEndpoint struct { + IfaceName string +} + // NetworkEndpoint contains network interface information type NetworkEndpoint struct { // One and only one of these below are not nil according to Type. @@ -90,6 +94,7 @@ type NetworkEndpoint struct { Tap *TapEndpoint `json:",omitempty"` IPVlan *IPVlanEndpoint `json:",omitempty"` Tuntap *TuntapEndpoint `json:",omitempty"` + Vfio *VfioEndpoint `json:",omitempty"` Type string } diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 8df2b7cc5..ac0d35e9c 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -248,6 +248,11 @@ type Sandbox struct { seccompSupported bool disableVMShutdown bool isVCPUsPinningOn bool + + // hotplugNetworkConfigApplied prevents network config API being called + // multiple times for hot-plugged network device when Sandbox has multiple + // containers. + hotplugNetworkConfigApplied bool } // ID returns the sandbox identifier string. 
@@ -2247,6 +2252,29 @@ func (s *Sandbox) AddDevice(ctx context.Context, info config.DeviceInfo) (api.De return add, nil } +// GetVfioDeviceGuestPciPath returns a device's guest PCI path by its host BDF +func (s *Sandbox) GetVfioDeviceGuestPciPath(hostBDF string) types.PciPath { + devices := s.devManager.GetAllDevices() + for _, device := range devices { + switch device.DeviceType() { + case config.DeviceVFIO: + vfioDevices, ok := device.GetDeviceInfo().([]*config.VFIODev) + if !ok { + continue + } + for _, vfioDev := range vfioDevices { + if vfioDev.BDF == hostBDF { + return vfioDev.GuestPciPath + } + } + default: + continue + } + } + + return types.PciPath{} +} + // updateResources will: // - calculate the resources required for the virtual machine, and adjust the virtual machine // sizing accordingly. For a given sandbox, it will calculate the number of vCPUs required based diff --git a/src/runtime/virtcontainers/types/dan.go b/src/runtime/virtcontainers/types/dan.go index 073977f0c..2520779b2 100644 --- a/src/runtime/virtcontainers/types/dan.go +++ b/src/runtime/virtcontainers/types/dan.go @@ -20,6 +20,10 @@ type DanDevice struct { // DanDeviceType identifies the type of the network interface. 
type DanDeviceType string +const ( + VfioDanDeviceType DanDeviceType = "vfio" +) + type Device struct { Type DanDeviceType `json:"type"` Path string `json:"path,omitempty"` diff --git a/src/runtime/virtcontainers/vfio_endpoint.go b/src/runtime/virtcontainers/vfio_endpoint.go new file mode 100644 index 000000000..f21c6ecf9 --- /dev/null +++ b/src/runtime/virtcontainers/vfio_endpoint.go @@ -0,0 +1,130 @@ +// Copyright (c) 2024 NVIDIA Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +package virtcontainers + +import ( + "context" + "fmt" + + persistapi "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/persist/api" + vcTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" +) + +// VfioEndpoint represents a VFIO endpoint which is claimed by the guest kernel +type VfioEndpoint struct { + EndpointType EndpointType + HostBDF string + PCIPath vcTypes.PciPath + Iface NetworkInterface + EndpointProperties NetworkInfo +} + +// Implements Endpoint interface + +// Properties returns the properties of the interface. +func (endpoint *VfioEndpoint) Properties() NetworkInfo { + return endpoint.EndpointProperties +} + +// Name returns name of the interface. +func (endpoint *VfioEndpoint) Name() string { + return endpoint.Iface.Name +} + +// HardwareAddr returns the mac address of the network interface +func (endpoint *VfioEndpoint) HardwareAddr() string { + return endpoint.Iface.HardAddr +} + +// Type identifies the endpoint as a vfio endpoint. +func (endpoint *VfioEndpoint) Type() EndpointType { + return endpoint.EndpointType +} + +// PciPath returns the PCI path of the endpoint. +func (endpoint *VfioEndpoint) PciPath() vcTypes.PciPath { + return endpoint.PCIPath +} + +// NetworkPair always returns nil +func (endpoint *VfioEndpoint) NetworkPair() *NetworkInterfacePair { + return nil +} + +// SetProperties sets the properties of the endpoint. 
+func (endpoint *VfioEndpoint) SetProperties(info NetworkInfo) { + endpoint.EndpointProperties = info +} + +// SetPciPath sets the PCI path of the endpoint. +func (endpoint *VfioEndpoint) SetPciPath(path vcTypes.PciPath) { + endpoint.PCIPath = path +} + +// Attach for VFIO endpoint +func (endpoint *VfioEndpoint) Attach(ctx context.Context, s *Sandbox) error { + return fmt.Errorf("attach is unsupported for VFIO endpoint") +} + +// Detach for VFIO endpoint +func (endpoint *VfioEndpoint) Detach(ctx context.Context, netNsCreated bool, netNsPath string) error { + return fmt.Errorf("detach is unsupported for VFIO endpoint") +} + +func (endpoint *VfioEndpoint) HotAttach(context.Context, *Sandbox) error { + return fmt.Errorf("VfioEndpoint does not support Hot attach") +} + +func (endpoint *VfioEndpoint) HotDetach(ctx context.Context, s *Sandbox, netNsCreated bool, netNsPath string) error { + return fmt.Errorf("VfioEndpoint does not support Hot detach") +} + +func (endpoint *VfioEndpoint) save() persistapi.NetworkEndpoint { + return persistapi.NetworkEndpoint{ + Type: string(endpoint.Type()), + Vfio: &persistapi.VfioEndpoint{IfaceName: endpoint.Iface.Name}, // persist the interface name that load() reads back + } +} + +func (endpoint *VfioEndpoint) load(s persistapi.NetworkEndpoint) { + endpoint.EndpointType = VfioEndpointType + + if s.Vfio != nil { + endpoint.Iface.Name = s.Vfio.IfaceName + } +} + +func (endpoint *VfioEndpoint) GetRxRateLimiter() bool { + return false +} + +func (endpoint *VfioEndpoint) SetRxRateLimiter() error { + return fmt.Errorf("rx rate limiter is unsupported for VFIO endpoint") +} + +func (endpoint *VfioEndpoint) GetTxRateLimiter() bool { + return false +} + +func (endpoint *VfioEndpoint) SetTxRateLimiter() error { + return fmt.Errorf("tx rate limiter is unsupported for VFIO endpoint") +} + +// Create a VFIO endpoint +func createVfioEndpoint(hostBDF string, netInfo *NetworkInfo) (*VfioEndpoint, error) { + endpoint := &VfioEndpoint{ + EndpointType: VfioEndpointType, + HostBDF: hostBDF, + Iface: NetworkInterface{ + Name: 
netInfo.Iface.Name, + HardAddr: netInfo.Iface.HardwareAddr.String(), + Addrs: netInfo.Addrs, + }, + EndpointProperties: *netInfo, + } + + return endpoint, nil +}