// // Copyright (c) 2016 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // package virtcontainers import ( "encoding/hex" "encoding/json" "fmt" "io/ioutil" "math/rand" "net" "os" "path/filepath" "runtime" "strings" "time" "github.com/containernetworking/plugins/pkg/ns" "github.com/kata-containers/runtime/virtcontainers/pkg/uuid" "github.com/safchain/ethtool" "github.com/sirupsen/logrus" "github.com/vishvananda/netlink" "github.com/vishvananda/netns" "golang.org/x/sys/unix" ) // NetInterworkingModel defines the network model connecting // the network interface to the virtual machine. type NetInterworkingModel int const ( // NetXConnectDefaultModel Ask to use DefaultNetInterworkingModel NetXConnectDefaultModel NetInterworkingModel = iota // NetXConnectBridgedModel uses a linux bridge to interconnect // the container interface to the VM. 
This is the // safe default that works for most cases except // macvlan and ipvlan NetXConnectBridgedModel // NetXConnectMacVtapModel can be used when the Container network // interface can be bridged using macvtap NetXConnectMacVtapModel // NetXConnectEnlightenedModel can be used when the Network plugins // are enlightened to create VM native interfaces // when requested by the runtime // This will be used for vethtap, macvtap, ipvtap NetXConnectEnlightenedModel // NetXConnectInvalidModel is the last item to check valid values by IsValid() NetXConnectInvalidModel ) //IsValid checks if a model is valid func (n NetInterworkingModel) IsValid() bool { return 0 <= int(n) && int(n) < int(NetXConnectInvalidModel) } //SetModel change the model string value func (n *NetInterworkingModel) SetModel(modelName string) error { switch modelName { case "default": *n = DefaultNetInterworkingModel return nil case "bridged": *n = NetXConnectBridgedModel return nil case "macvtap": *n = NetXConnectMacVtapModel return nil case "enlightened": *n = NetXConnectEnlightenedModel return nil } return fmt.Errorf("Unknown type %s", modelName) } // DefaultNetInterworkingModel is a package level default // that determines how the VM should be connected to the // the container network interface var DefaultNetInterworkingModel = NetXConnectMacVtapModel // Introduces constants related to networking const ( defaultRouteDest = "0.0.0.0/0" defaultRouteLabel = "default" defaultFilePerms = 0600 defaultQlen = 1500 defaultQueues = 8 ) // DNSInfo describes the DNS setup related to a network interface. type DNSInfo struct { Servers []string Domain string Searches []string Options []string } // NetlinkIface describes fully a network interface. type NetlinkIface struct { netlink.LinkAttrs Type string } // NetworkInfo gathers all information related to a network interface. // It can be used to store the description of the underlying network. 
type NetworkInfo struct {
	// Iface holds the link attributes and the link type of the interface.
	Iface NetlinkIface
	// Addrs lists the addresses configured on the interface.
	Addrs []netlink.Addr
	// Routes lists the routes associated with the interface.
	Routes []netlink.Route
	// DNS holds the DNS setup related to the interface.
	DNS DNSInfo
}

// NetworkInterface defines a network interface.
type NetworkInterface struct {
	// Name is the interface name.
	Name string
	// HardAddr is the MAC address in its string form.
	HardAddr string
	// Addrs holds the IP addresses saved from the interface; the macvtap
	// path stores the veth IPv4 addresses here before clearing them so
	// they can be restored on teardown.
	Addrs []netlink.Addr
}

// NetworkInterfacePair defines a pair between VM and virtual network interfaces.
type NetworkInterfacePair struct {
	// ID is a unique identifier for the pair (a UUID string when built by
	// createVirtualNetworkEndpoint).
	ID string
	// Name is the name of the bridge created by the bridged model.
	Name string
	// VirtIface is the container side (veth) interface.
	VirtIface NetworkInterface
	// TAPIface is the tap/macvtap interface connected to the VM.
	TAPIface NetworkInterface
	// NetInterworkingModel selects how VirtIface and TAPIface are connected.
	NetInterworkingModel
	// VMFds are the tap queue file descriptors handed to the hypervisor.
	VMFds []*os.File
	// VhostFds are the file descriptors opened on /dev/vhost-net.
	VhostFds []*os.File
}

// NetworkConfig is the network configuration related to a network.
type NetworkConfig struct {
	// NetNSPath is the path of the network namespace to use. When empty,
	// initNetworkCommon creates a new namespace.
	NetNSPath string
	// NumInterfaces is not read anywhere in this file.
	// NOTE(review): presumably the number of interfaces to create - confirm against callers.
	NumInterfaces int
	// InterworkingModel is the model used for virtual endpoints created
	// while scanning the namespace.
	InterworkingModel NetInterworkingModel
}

// Endpoint represents a physical or virtual network interface.
type Endpoint interface {
	// Properties returns the network information stored for the endpoint.
	Properties() NetworkInfo
	// Name returns the name of the underlying interface.
	Name() string
	// HardwareAddr returns the MAC address of the endpoint as a string.
	HardwareAddr() string
	// Type returns the kind of endpoint (physical, virtual, vhost-user).
	Type() EndpointType

	// SetProperties stores the network information for the endpoint.
	SetProperties(NetworkInfo)
	// Attach makes the endpoint available to the given hypervisor.
	Attach(hypervisor) error
	// Detach undoes what Attach set up on the host side.
	Detach() error
}

// VirtualEndpoint gathers a network pair and its properties.
type VirtualEndpoint struct {
	NetPair            NetworkInterfacePair
	EndpointProperties NetworkInfo
	// Physical is not read anywhere in this file.
	// NOTE(review): purpose cannot be determined from here - confirm against callers.
	Physical     bool
	EndpointType EndpointType
}

// PhysicalEndpoint gathers a physical network interface and its properties
type PhysicalEndpoint struct {
	IfaceName          string
	HardAddr           string
	EndpointProperties NetworkInfo
	EndpointType       EndpointType
	// BDF is the bus info string reported by ethtool for the interface.
	BDF string
	// Driver is the host driver name reported by ethtool; it is passed to
	// the VFIO bind/unbind helpers.
	Driver string
	// VendorDeviceID is "<vendor> <device>" as read from the PCI sysfs
	// "vendor" and "device" files of the device.
	VendorDeviceID string
}

// VhostUserEndpoint represents a vhost-user socket based network interface
type VhostUserEndpoint struct {
	// Path to the vhost-user socket on the host system
	SocketPath string
	// MAC address of the interface
	HardAddr string
	// Name of the interface the endpoint was created from
	IfaceName          string
	EndpointProperties NetworkInfo
	EndpointType       EndpointType
}

// Properties returns properties for the veth interface in the network pair.
func (endpoint *VirtualEndpoint) Properties() NetworkInfo {
	return endpoint.EndpointProperties
}

// Name returns name of the veth interface in the network pair.
func (endpoint *VirtualEndpoint) Name() string {
	pair := &endpoint.NetPair
	return pair.VirtIface.Name
}

// HardwareAddr returns the mac address that is assigned to the tap interface
// in the network pair.
func (endpoint *VirtualEndpoint) HardwareAddr() string {
	pair := &endpoint.NetPair
	return pair.TAPIface.HardAddr
}

// Type identifies the endpoint as a virtual endpoint.
func (endpoint *VirtualEndpoint) Type() EndpointType {
	return endpoint.EndpointType
}

// SetProperties sets the properties for the endpoint.
func (endpoint *VirtualEndpoint) SetProperties(properties NetworkInfo) {
	endpoint.EndpointProperties = properties
}

// networkLogger returns the package logger tagged with the "network"
// subsystem field.
func networkLogger() *logrus.Entry {
	const subsystem = "network"
	return virtLog.WithField("subsystem", subsystem)
}

// Attach for virtual endpoint bridges the network pair and adds the
// tap interface of the network pair to the hypervisor.
func (endpoint *VirtualEndpoint) Attach(h hypervisor) error {
	networkLogger().Info("Attaching virtual endpoint")

	err := xconnectVMNetwork(&endpoint.NetPair, true)
	if err != nil {
		networkLogger().WithError(err).Error("Error bridging virtual ep")
		return err
	}

	return h.addDevice(endpoint, netDev)
}

// Detach for the virtual endpoint tears down the tap and bridge
// created for the veth interface.
func (endpoint *VirtualEndpoint) Detach() error {
	networkLogger().Info("Detaching virtual endpoint")

	return xconnectVMNetwork(&endpoint.NetPair, false)
}

// Properties returns the properties of the interface.
func (endpoint *VhostUserEndpoint) Properties() NetworkInfo {
	return endpoint.EndpointProperties
}

// Name returns name of the interface.
func (endpoint *VhostUserEndpoint) Name() string {
	return endpoint.IfaceName
}

// HardwareAddr returns the mac address of the vhostuser network interface
func (endpoint *VhostUserEndpoint) HardwareAddr() string {
	return endpoint.HardAddr
}

// Type identifies the endpoint as a vhostuser endpoint.
func (endpoint *VhostUserEndpoint) Type() EndpointType {
	return endpoint.EndpointType
}

// SetProperties sets the properties of the endpoint.
func (endpoint *VhostUserEndpoint) SetProperties(properties NetworkInfo) {
	endpoint.EndpointProperties = properties
}

// Attach for vhostuser endpoint adds a vhost-user network device, backed by
// the endpoint socket, to the hypervisor.
func (endpoint *VhostUserEndpoint) Attach(h hypervisor) error {
	networkLogger().Info("Attaching vhostuser based endpoint")

	// A random 8 byte value, hex encoded, serves as the unique ID used
	// for the hypervisor commandline fields.
	randBytes, err := generateRandomBytes(8)
	if err != nil {
		return err
	}

	d := VhostUserNetDevice{MacAddress: endpoint.HardAddr}
	d.ID = hex.EncodeToString(randBytes)
	d.SocketPath = endpoint.SocketPath

	return h.addDevice(d, vhostuserDev)
}

// Detach for vhostuser endpoint only logs; nothing is torn down on the host.
func (endpoint *VhostUserEndpoint) Detach() error {
	networkLogger().Info("Detaching vhostuser based endpoint")
	return nil
}

// createVhostUserEndpoint builds a vhost-user endpoint for the given socket,
// taking the MAC address and interface name from netInfo. The returned error
// is always nil.
func createVhostUserEndpoint(netInfo NetworkInfo, socket string) (*VhostUserEndpoint, error) {
	iface := netInfo.Iface

	return &VhostUserEndpoint{
		SocketPath:   socket,
		HardAddr:     iface.HardwareAddr.String(),
		IfaceName:    iface.Name,
		EndpointType: VhostUserEndpointType,
	}, nil
}

// Properties returns the properties of the physical interface.
func (endpoint *PhysicalEndpoint) Properties() NetworkInfo {
	return endpoint.EndpointProperties
}

// HardwareAddr returns the mac address of the physical network interface.
func (endpoint *PhysicalEndpoint) HardwareAddr() string {
	return endpoint.HardAddr
}

// Name returns name of the physical interface.
func (endpoint *PhysicalEndpoint) Name() string {
	return endpoint.IfaceName
}

// Type identifies the endpoint as a physical endpoint.
func (endpoint *PhysicalEndpoint) Type() EndpointType {
	return endpoint.EndpointType
}

// SetProperties sets the properties of the physical endpoint.
func (endpoint *PhysicalEndpoint) SetProperties(properties NetworkInfo) {
	endpoint.EndpointProperties = properties
}

// Attach for physical endpoint binds the physical network interface to
// vfio-pci and adds device to the hypervisor with vfio-passthrough.
func (endpoint *PhysicalEndpoint) Attach(h hypervisor) error {
	networkLogger().Info("Attaching physical endpoint")

	// The interface has to leave its host driver and be bound to vfio
	// before it can be passed to qemu.
	err := bindNICToVFIO(endpoint)
	if err != nil {
		return err
	}

	return h.addDevice(VFIODevice{BDF: endpoint.BDF}, vfioDev)
}

// Detach for physical endpoint unbinds the physical network interface from vfio-pci
// and binds it back to the saved host driver.
func (endpoint *PhysicalEndpoint) Detach() error {
	networkLogger().Info("Detaching physical endpoint")

	// Hand the physical network interface back to the host.
	return bindNICToHost(endpoint)
}

// EndpointType identifies the type of the network endpoint.
type EndpointType string

const (
	// PhysicalEndpointType is the physical network interface.
	PhysicalEndpointType EndpointType = "physical"

	// VirtualEndpointType is the virtual network interface.
	VirtualEndpointType EndpointType = "virtual"

	// VhostUserEndpointType is the vhostuser network interface.
	VhostUserEndpointType EndpointType = "vhost-user"
)

// Set sets an endpoint type based on the input string. Only the string
// values of the declared endpoint types are accepted; anything else leaves
// the receiver untouched and returns an error.
func (endpointType *EndpointType) Set(value string) error {
	switch t := EndpointType(value); t {
	case PhysicalEndpointType, VirtualEndpointType, VhostUserEndpointType:
		*endpointType = t
		return nil
	}

	return fmt.Errorf("Unknown endpoint type %s", value)
}

// String converts an endpoint type to a string.
func (endpointType *EndpointType) String() string { switch *endpointType { case PhysicalEndpointType: return string(PhysicalEndpointType) case VirtualEndpointType: return string(VirtualEndpointType) case VhostUserEndpointType: return string(VhostUserEndpointType) default: return "" } } // NetworkNamespace contains all data related to its network namespace. type NetworkNamespace struct { NetNsPath string NetNsCreated bool Endpoints []Endpoint } // TypedJSONEndpoint is used as an intermediate representation for // marshalling and unmarshalling Endpoint objects. type TypedJSONEndpoint struct { Type EndpointType Data json.RawMessage } // MarshalJSON is the custom NetworkNamespace JSON marshalling routine. // This is needed to properly marshall Endpoints array. func (n NetworkNamespace) MarshalJSON() ([]byte, error) { // We need a shadow structure in order to prevent json from // entering a recursive loop when only calling json.Marshal(). type shadow struct { NetNsPath string NetNsCreated bool Endpoints []TypedJSONEndpoint } s := &shadow{ NetNsPath: n.NetNsPath, NetNsCreated: n.NetNsCreated, } var typedEndpoints []TypedJSONEndpoint for _, endpoint := range n.Endpoints { tempJSON, _ := json.Marshal(endpoint) t := TypedJSONEndpoint{ Type: endpoint.Type(), Data: tempJSON, } typedEndpoints = append(typedEndpoints, t) } s.Endpoints = typedEndpoints b, err := json.Marshal(s) return b, err } // UnmarshalJSON is the custom NetworkNamespace unmarshalling routine. // This is needed for unmarshalling the Endpoints interfaces array. 
func (n *NetworkNamespace) UnmarshalJSON(b []byte) error { var s struct { NetNsPath string NetNsCreated bool Endpoints json.RawMessage } if err := json.Unmarshal(b, &s); err != nil { return err } (*n).NetNsPath = s.NetNsPath (*n).NetNsCreated = s.NetNsCreated var typedEndpoints []TypedJSONEndpoint if err := json.Unmarshal([]byte(string(s.Endpoints)), &typedEndpoints); err != nil { return err } var endpoints []Endpoint for _, e := range typedEndpoints { switch e.Type { case PhysicalEndpointType: var endpoint PhysicalEndpoint err := json.Unmarshal(e.Data, &endpoint) if err != nil { return err } endpoints = append(endpoints, &endpoint) virtLog.Infof("Physical endpoint unmarshalled [%v]", endpoint) case VirtualEndpointType: var endpoint VirtualEndpoint err := json.Unmarshal(e.Data, &endpoint) if err != nil { return err } endpoints = append(endpoints, &endpoint) virtLog.Infof("Virtual endpoint unmarshalled [%v]", endpoint) case VhostUserEndpointType: var endpoint VhostUserEndpoint err := json.Unmarshal(e.Data, &endpoint) if err != nil { return err } endpoints = append(endpoints, &endpoint) virtLog.Infof("VhostUser endpoint unmarshalled [%v]", endpoint) default: virtLog.Errorf("Unknown endpoint type received %s\n", e.Type) } } (*n).Endpoints = endpoints return nil } // NetworkModel describes the type of network specification. type NetworkModel string const ( // NoopNetworkModel is the No-Op network. NoopNetworkModel NetworkModel = "noop" // CNINetworkModel is the CNI network. CNINetworkModel NetworkModel = "CNI" // CNMNetworkModel is the CNM network. CNMNetworkModel NetworkModel = "CNM" ) // Set sets a network type based on the input string. 
func (networkType *NetworkModel) Set(value string) error { switch value { case "noop": *networkType = NoopNetworkModel return nil case "CNI": *networkType = CNINetworkModel return nil case "CNM": *networkType = CNMNetworkModel return nil default: return fmt.Errorf("Unknown network type %s", value) } } // String converts a network type to a string. func (networkType *NetworkModel) String() string { switch *networkType { case NoopNetworkModel: return string(NoopNetworkModel) case CNINetworkModel: return string(CNINetworkModel) case CNMNetworkModel: return string(CNMNetworkModel) default: return "" } } // newNetwork returns a network from a network type. func newNetwork(networkType NetworkModel) network { switch networkType { case NoopNetworkModel: return &noopNetwork{} case CNINetworkModel: return &cni{} case CNMNetworkModel: return &cnm{} default: return &noopNetwork{} } } func initNetworkCommon(config NetworkConfig) (string, bool, error) { if !config.InterworkingModel.IsValid() || config.InterworkingModel == NetXConnectDefaultModel { config.InterworkingModel = DefaultNetInterworkingModel } if config.NetNSPath == "" { path, err := createNetNS() if err != nil { return "", false, err } return path, true, nil } return config.NetNSPath, false, nil } func runNetworkCommon(networkNSPath string, cb func() error) error { if networkNSPath == "" { return fmt.Errorf("networkNSPath cannot be empty") } return doNetNS(networkNSPath, func(_ ns.NetNS) error { return cb() }) } func addNetworkCommon(pod Pod, networkNS *NetworkNamespace) error { err := doNetNS(networkNS.NetNsPath, func(_ ns.NetNS) error { for _, endpoint := range networkNS.Endpoints { if err := endpoint.Attach(pod.hypervisor); err != nil { return err } } return nil }) return err } func removeNetworkCommon(networkNS NetworkNamespace) error { return doNetNS(networkNS.NetNsPath, func(_ ns.NetNS) error { for _, endpoint := range networkNS.Endpoints { if err := endpoint.Detach(); err != nil { return err } } return nil }) } 
func createLink(netHandle *netlink.Handle, name string, expectedLink netlink.Link) (netlink.Link, []*os.File, error) { var newLink netlink.Link var fds []*os.File switch expectedLink.Type() { case (&netlink.Bridge{}).Type(): newLink = &netlink.Bridge{ LinkAttrs: netlink.LinkAttrs{Name: name}, MulticastSnooping: expectedLink.(*netlink.Bridge).MulticastSnooping, } case (&netlink.Tuntap{}).Type(): newLink = &netlink.Tuntap{ LinkAttrs: netlink.LinkAttrs{Name: name}, Mode: netlink.TUNTAP_MODE_TAP, Queues: defaultQueues, Flags: netlink.TUNTAP_MULTI_QUEUE_DEFAULTS | netlink.TUNTAP_VNET_HDR, } case (&netlink.Macvtap{}).Type(): qlen := expectedLink.Attrs().TxQLen if qlen <= 0 { qlen = defaultQlen } newLink = &netlink.Macvtap{ Macvlan: netlink.Macvlan{ Mode: netlink.MACVLAN_MODE_BRIDGE, LinkAttrs: netlink.LinkAttrs{ Index: expectedLink.Attrs().Index, Name: name, TxQLen: qlen, ParentIndex: expectedLink.Attrs().ParentIndex, }, }, } default: return nil, fds, fmt.Errorf("Unsupported link type %s", expectedLink.Type()) } if err := netHandle.LinkAdd(newLink); err != nil { return nil, fds, fmt.Errorf("LinkAdd() failed for %s name %s: %s", expectedLink.Type(), name, err) } tuntapLink, ok := newLink.(*netlink.Tuntap) if ok { fds = tuntapLink.Fds } newLink, err := getLinkByName(netHandle, name, expectedLink) return newLink, fds, err } func getLinkByName(netHandle *netlink.Handle, name string, expectedLink netlink.Link) (netlink.Link, error) { link, err := netHandle.LinkByName(name) if err != nil { return nil, fmt.Errorf("LinkByName() failed for %s name %s: %s", expectedLink.Type(), name, err) } switch expectedLink.Type() { case (&netlink.Bridge{}).Type(): if l, ok := link.(*netlink.Bridge); ok { return l, nil } case (&netlink.Tuntap{}).Type(): if l, ok := link.(*netlink.GenericLink); ok { return l, nil } case (&netlink.Veth{}).Type(): if l, ok := link.(*netlink.Veth); ok { return l, nil } case (&netlink.Macvtap{}).Type(): if l, ok := link.(*netlink.Macvtap); ok { return l, nil } 
default: return nil, fmt.Errorf("Unsupported link type %s", expectedLink.Type()) } return nil, fmt.Errorf("Incorrect link type %s, expecting %s", link.Type(), expectedLink.Type()) } // The endpoint type should dictate how the connection needs to be made func xconnectVMNetwork(netPair *NetworkInterfacePair, connect bool) error { if netPair.NetInterworkingModel == NetXConnectDefaultModel { netPair.NetInterworkingModel = DefaultNetInterworkingModel } switch netPair.NetInterworkingModel { case NetXConnectBridgedModel: netPair.NetInterworkingModel = NetXConnectBridgedModel if connect { return bridgeNetworkPair(netPair) } return unBridgeNetworkPair(*netPair) case NetXConnectMacVtapModel: netPair.NetInterworkingModel = NetXConnectMacVtapModel if connect { return tapNetworkPair(netPair) } return untapNetworkPair(*netPair) case NetXConnectEnlightenedModel: return fmt.Errorf("Unsupported networking model") default: return fmt.Errorf("Invalid internetworking model") } } func createMacvtapFds(linkIndex int, queues int) ([]*os.File, error) { tapDev := fmt.Sprintf("/dev/tap%d", linkIndex) return createFds(tapDev, queues) } func createVhostFds(numFds int) ([]*os.File, error) { vhostDev := "/dev/vhost-net" return createFds(vhostDev, numFds) } func createFds(device string, numFds int) ([]*os.File, error) { fds := make([]*os.File, numFds) for i := 0; i < numFds; i++ { f, err := os.OpenFile(device, os.O_RDWR, defaultFilePerms) if err != nil { cleanupFds(fds, i) return nil, err } fds[i] = f } return fds, nil } // There is a limitation in the linux kernel that prevents a macvtap/macvlan link // from getting the correct link index when created in a network namespace // https://github.com/clearcontainers/runtime/issues/708 // // Till that bug is fixed we need to pick a random non conflicting index and try to // create a link. If that fails, we need to try with another. 
// All the kernel does not check if the link id conflicts with a link id on the host // hence we need to offset the link id to prevent any overlaps with the host index // // Here the kernel will ensure that there is no race condition const hostLinkOffset = 8192 // Host should not have more than 8k interfaces const linkRange = 0xFFFF // This will allow upto 2^16 containers const linkRetries = 128 // The numbers of time we try to find a non conflicting index const macvtapWorkaround = true func createMacVtap(netHandle *netlink.Handle, name string, link netlink.Link) (taplink netlink.Link, err error) { if !macvtapWorkaround { taplink, _, err = createLink(netHandle, name, link) return } r := rand.New(rand.NewSource(time.Now().UnixNano())) for i := 0; i < linkRetries; i++ { index := hostLinkOffset + (r.Int() & linkRange) link.Attrs().Index = index taplink, _, err = createLink(netHandle, name, link) if err == nil { break } } return } func clearIPs(link netlink.Link, addrs []netlink.Addr) error { for _, addr := range addrs { if err := netlink.AddrDel(link, &addr); err != nil { return err } } return nil } func setIPs(link netlink.Link, addrs []netlink.Addr) error { for _, addr := range addrs { if err := netlink.AddrAdd(link, &addr); err != nil { return err } } return nil } func tapNetworkPair(netPair *NetworkInterfacePair) error { netHandle, err := netlink.NewHandle() if err != nil { return err } defer netHandle.Delete() vethLink, err := getLinkByName(netHandle, netPair.VirtIface.Name, &netlink.Veth{}) if err != nil { return fmt.Errorf("Could not get veth interface: %s: %s", netPair.VirtIface.Name, err) } vethLinkAttrs := vethLink.Attrs() // Attach the macvtap interface to the underlying container // interface. 
Also picks relevant attributes from the parent tapLink, err := createMacVtap(netHandle, netPair.TAPIface.Name, &netlink.Macvtap{ Macvlan: netlink.Macvlan{ LinkAttrs: netlink.LinkAttrs{ TxQLen: vethLinkAttrs.TxQLen, ParentIndex: vethLinkAttrs.Index, }, }, }) if err != nil { return fmt.Errorf("Could not create TAP interface: %s", err) } // Save the veth MAC address to the TAP so that it can later be used // to build the hypervisor command line. This MAC address has to be // the one inside the VM in order to avoid any firewall issues. The // bridge created by the network plugin on the host actually expects // to see traffic from this MAC address and not another one. tapHardAddr := vethLinkAttrs.HardwareAddr netPair.TAPIface.HardAddr = vethLinkAttrs.HardwareAddr.String() if err := netHandle.LinkSetMTU(tapLink, vethLinkAttrs.MTU); err != nil { return fmt.Errorf("Could not set TAP MTU %d: %s", vethLinkAttrs.MTU, err) } hardAddr, err := net.ParseMAC(netPair.VirtIface.HardAddr) if err != nil { return err } if err := netHandle.LinkSetHardwareAddr(vethLink, hardAddr); err != nil { return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s", netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err) } if err := netHandle.LinkSetHardwareAddr(tapLink, tapHardAddr); err != nil { return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s", netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err) } if err := netHandle.LinkSetUp(tapLink); err != nil { return fmt.Errorf("Could not enable TAP %s: %s", netPair.TAPIface.Name, err) } // Clear the IP addresses from the veth interface to prevent ARP conflict netPair.VirtIface.Addrs, err = netlink.AddrList(vethLink, netlink.FAMILY_V4) if err != nil { return fmt.Errorf("Unable to obtain veth IP addresses: %s", err) } if err := clearIPs(vethLink, netPair.VirtIface.Addrs); err != nil { return fmt.Errorf("Unable to clear veth IP addresses: %s", err) } if err := netHandle.LinkSetUp(vethLink); err != nil { return 
fmt.Errorf("Could not enable veth %s: %s", netPair.VirtIface.Name, err) } // Note: The underlying interfaces need to be up prior to fd creation. // Setup the multiqueue fds to be consumed by QEMU as macvtap cannot // be directly connected. // Ideally we want // netdev.FDs, err = createMacvtapFds(netdev.ID, int(config.SMP.CPUs)) // We do not have global context here, hence a manifest constant // that matches our minimum vCPU configuration // Another option is to defer this to ciao qemu library which does have // global context but cannot handle errors when setting up the network netPair.VMFds, err = createMacvtapFds(tapLink.Attrs().Index, defaultQueues) if err != nil { return fmt.Errorf("Could not setup macvtap fds %s: %s", netPair.TAPIface, err) } vhostFds, err := createVhostFds(defaultQueues) if err != nil { return fmt.Errorf("Could not setup vhost fds %s : %s", netPair.VirtIface.Name, err) } netPair.VhostFds = vhostFds return nil } func bridgeNetworkPair(netPair *NetworkInterfacePair) error { netHandle, err := netlink.NewHandle() if err != nil { return err } defer netHandle.Delete() tapLink, fds, err := createLink(netHandle, netPair.TAPIface.Name, &netlink.Tuntap{}) if err != nil { return fmt.Errorf("Could not create TAP interface: %s", err) } netPair.VMFds = fds vhostFds, err := createVhostFds(defaultQueues) if err != nil { return fmt.Errorf("Could not setup vhost fds %s : %s", netPair.VirtIface.Name, err) } netPair.VhostFds = vhostFds vethLink, err := getLinkByName(netHandle, netPair.VirtIface.Name, &netlink.Veth{}) if err != nil { return fmt.Errorf("Could not get veth interface %s : %s", netPair.VirtIface.Name, err) } vethLinkAttrs := vethLink.Attrs() // Save the veth MAC address to the TAP so that it can later be used // to build the hypervisor command line. This MAC address has to be // the one inside the VM in order to avoid any firewall issues. 
The // bridge created by the network plugin on the host actually expects // to see traffic from this MAC address and not another one. netPair.TAPIface.HardAddr = vethLinkAttrs.HardwareAddr.String() if err := netHandle.LinkSetMTU(tapLink, vethLinkAttrs.MTU); err != nil { return fmt.Errorf("Could not set TAP MTU %d: %s", vethLinkAttrs.MTU, err) } hardAddr, err := net.ParseMAC(netPair.VirtIface.HardAddr) if err != nil { return err } if err := netHandle.LinkSetHardwareAddr(vethLink, hardAddr); err != nil { return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s", netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err) } mcastSnoop := false bridgeLink, _, err := createLink(netHandle, netPair.Name, &netlink.Bridge{MulticastSnooping: &mcastSnoop}) if err != nil { return fmt.Errorf("Could not create bridge: %s", err) } if err := netHandle.LinkSetMaster(tapLink, bridgeLink.(*netlink.Bridge)); err != nil { return fmt.Errorf("Could not attach TAP %s to the bridge %s: %s", netPair.TAPIface.Name, netPair.Name, err) } if err := netHandle.LinkSetUp(tapLink); err != nil { return fmt.Errorf("Could not enable TAP %s: %s", netPair.TAPIface.Name, err) } if err := netHandle.LinkSetMaster(vethLink, bridgeLink.(*netlink.Bridge)); err != nil { return fmt.Errorf("Could not attach veth %s to the bridge %s: %s", netPair.VirtIface.Name, netPair.Name, err) } if err := netHandle.LinkSetUp(vethLink); err != nil { return fmt.Errorf("Could not enable veth %s: %s", netPair.VirtIface.Name, err) } if err := netHandle.LinkSetUp(bridgeLink); err != nil { return fmt.Errorf("Could not enable bridge %s: %s", netPair.Name, err) } return nil } func untapNetworkPair(netPair NetworkInterfacePair) error { netHandle, err := netlink.NewHandle() if err != nil { return err } defer netHandle.Delete() tapLink, err := getLinkByName(netHandle, netPair.TAPIface.Name, &netlink.Macvtap{}) if err != nil { return fmt.Errorf("Could not get TAP interface %s: %s", netPair.TAPIface.Name, err) } if err := 
netHandle.LinkDel(tapLink); err != nil { return fmt.Errorf("Could not remove TAP %s: %s", netPair.TAPIface.Name, err) } vethLink, err := getLinkByName(netHandle, netPair.VirtIface.Name, &netlink.Veth{}) if err != nil { // The veth pair is not totally managed by virtcontainers virtLog.Warnf("Could not get veth interface %s: %s", netPair.VirtIface.Name, err) } else { if err := netHandle.LinkSetDown(vethLink); err != nil { return fmt.Errorf("Could not disable veth %s: %s", netPair.VirtIface.Name, err) } } // Restore the IPs that were cleared err = setIPs(vethLink, netPair.VirtIface.Addrs) return err } func unBridgeNetworkPair(netPair NetworkInterfacePair) error { netHandle, err := netlink.NewHandle() if err != nil { return err } defer netHandle.Delete() tapLink, err := getLinkByName(netHandle, netPair.TAPIface.Name, &netlink.Tuntap{}) if err != nil { return fmt.Errorf("Could not get TAP interface: %s", err) } bridgeLink, err := getLinkByName(netHandle, netPair.Name, &netlink.Bridge{}) if err != nil { return fmt.Errorf("Could not get bridge interface: %s", err) } if err := netHandle.LinkSetDown(bridgeLink); err != nil { return fmt.Errorf("Could not disable bridge %s: %s", netPair.Name, err) } if err := netHandle.LinkSetDown(tapLink); err != nil { return fmt.Errorf("Could not disable TAP %s: %s", netPair.TAPIface.Name, err) } if err := netHandle.LinkSetNoMaster(tapLink); err != nil { return fmt.Errorf("Could not detach TAP %s: %s", netPair.TAPIface.Name, err) } if err := netHandle.LinkDel(bridgeLink); err != nil { return fmt.Errorf("Could not remove bridge %s: %s", netPair.Name, err) } if err := netHandle.LinkDel(tapLink); err != nil { return fmt.Errorf("Could not remove TAP %s: %s", netPair.TAPIface.Name, err) } vethLink, err := getLinkByName(netHandle, netPair.VirtIface.Name, &netlink.Veth{}) if err != nil { // The veth pair is not totally managed by virtcontainers virtLog.WithError(err).Warn("Could not get veth interface") } else { if err := 
netHandle.LinkSetDown(vethLink); err != nil { return fmt.Errorf("Could not disable veth %s: %s", netPair.VirtIface.Name, err) } if err := netHandle.LinkSetNoMaster(vethLink); err != nil { return fmt.Errorf("Could not detach veth %s: %s", netPair.VirtIface.Name, err) } } return nil } func createNetNS() (string, error) { n, err := ns.NewNS() if err != nil { return "", err } return n.Path(), nil } // doNetNS is free from any call to a go routine, and it calls // into runtime.LockOSThread(), meaning it won't be executed in a // different thread than the one expected by the caller. func doNetNS(netNSPath string, cb func(ns.NetNS) error) error { runtime.LockOSThread() defer runtime.UnlockOSThread() currentNS, err := ns.GetCurrentNS() if err != nil { return err } defer currentNS.Close() targetNS, err := ns.GetNS(netNSPath) if err != nil { return err } if err := targetNS.Set(); err != nil { return err } defer currentNS.Set() return cb(targetNS) } func deleteNetNS(netNSPath string, mounted bool) error { n, err := ns.GetNS(netNSPath) if err != nil { return err } err = n.Close() if err != nil { return err } // This unmount part is supposed to be done in the cni/ns package, but the "mounted" // flag is not updated when retrieving NetNs handler from GetNS(). if mounted { if err = unix.Unmount(netNSPath, unix.MNT_DETACH); err != nil { return fmt.Errorf("Failed to unmount namespace %s: %v", netNSPath, err) } if err := os.RemoveAll(netNSPath); err != nil { return fmt.Errorf("Failed to clean up namespace %s: %v", netNSPath, err) } } return nil } func createVirtualNetworkEndpoint(idx int, ifName string, interworkingModel NetInterworkingModel) (*VirtualEndpoint, error) { if idx < 0 { return &VirtualEndpoint{}, fmt.Errorf("invalid network endpoint index: %d", idx) } uniqueID := uuid.Generate().String() hardAddr := net.HardwareAddr{0x02, 0x00, 0xCA, 0xFE, byte(idx >> 8), byte(idx)} endpoint := &VirtualEndpoint{ // TODO This is too specific. 
We may need to create multiple // end point types here and then decide how to connect them // at the time of hypervisor attach and not here NetPair: NetworkInterfacePair{ ID: uniqueID, Name: fmt.Sprintf("br%d", idx), VirtIface: NetworkInterface{ Name: fmt.Sprintf("eth%d", idx), HardAddr: hardAddr.String(), }, TAPIface: NetworkInterface{ Name: fmt.Sprintf("tap%d", idx), }, NetInterworkingModel: interworkingModel, }, EndpointType: VirtualEndpointType, } if ifName != "" { endpoint.NetPair.VirtIface.Name = ifName } return endpoint, nil } func networkInfoFromLink(handle *netlink.Handle, link netlink.Link) (NetworkInfo, error) { addrs, err := handle.AddrList(link, netlink.FAMILY_ALL) if err != nil { return NetworkInfo{}, err } routes, err := handle.RouteList(link, netlink.FAMILY_ALL) if err != nil { return NetworkInfo{}, err } return NetworkInfo{ Iface: NetlinkIface{ LinkAttrs: *(link.Attrs()), Type: link.Type(), }, Addrs: addrs, Routes: routes, }, nil } func createEndpointsFromScan(networkNSPath string, config NetworkConfig) ([]Endpoint, error) { var endpoints []Endpoint netnsHandle, err := netns.GetFromPath(networkNSPath) if err != nil { return []Endpoint{}, err } defer netnsHandle.Close() netlinkHandle, err := netlink.NewHandleAt(netnsHandle) if err != nil { return []Endpoint{}, err } defer netlinkHandle.Delete() linkList, err := netlinkHandle.LinkList() if err != nil { return []Endpoint{}, err } idx := 0 for _, link := range linkList { var endpoint Endpoint netInfo, err := networkInfoFromLink(netlinkHandle, link) if err != nil { return []Endpoint{}, err } // Ignore unconfigured network interfaces. These are // either base tunnel devices that are not namespaced // like gre0, gretap0, sit0, ipip0, tunl0 or incorrectly // setup interfaces. 
if len(netInfo.Addrs) == 0 { continue } // Skip any loopback interfaces: if (netInfo.Iface.Flags & net.FlagLoopback) != 0 { continue } if err := doNetNS(networkNSPath, func(_ ns.NetNS) error { // TODO: This is the incoming interface // based on the incoming interface we should create // an appropriate EndPoint based on interface type // This should be a switch // Check if interface is a physical interface. Do not create // tap interface/bridge if it is. isPhysical, err := isPhysicalIface(netInfo.Iface.Name) if err != nil { return err } if isPhysical { cnmLogger().WithField("interface", netInfo.Iface.Name).Info("Physical network interface found") endpoint, err = createPhysicalEndpoint(netInfo) } else { var socketPath string // Check if this is a dummy interface which has a vhost-user socket associated with it socketPath, err = vhostUserSocketPath(netInfo) if err != nil { return err } if socketPath != "" { cnmLogger().WithField("interface", netInfo.Iface.Name).Info("VhostUser network interface found") endpoint, err = createVhostUserEndpoint(netInfo, socketPath) } else { endpoint, err = createVirtualNetworkEndpoint(idx, netInfo.Iface.Name, config.InterworkingModel) } } return err }); err != nil { return []Endpoint{}, err } endpoint.SetProperties(netInfo) endpoints = append(endpoints, endpoint) idx++ } return endpoints, nil } // isPhysicalIface checks if an interface is a physical device. // We use ethtool here to not rely on device sysfs inside the network namespace. 
func isPhysicalIface(ifaceName string) (bool, error) { if ifaceName == "lo" { return false, nil } ethHandle, err := ethtool.NewEthtool() if err != nil { return false, err } bus, err := ethHandle.BusInfo(ifaceName) if err != nil { return false, nil } // Check for a pci bus format tokens := strings.Split(bus, ":") if len(tokens) != 3 { return false, nil } return true, nil } var sysPCIDevicesPath = "/sys/bus/pci/devices" func createPhysicalEndpoint(netInfo NetworkInfo) (*PhysicalEndpoint, error) { // Get ethtool handle to derive driver and bus ethHandle, err := ethtool.NewEthtool() if err != nil { return nil, err } // Get BDF bdf, err := ethHandle.BusInfo(netInfo.Iface.Name) if err != nil { return nil, err } // Get Driver driver, err := ethHandle.DriverName(netInfo.Iface.Name) if err != nil { return nil, err } // Get vendor and device id from pci space (sys/bus/pci/devices/$bdf) ifaceDevicePath := filepath.Join(sysPCIDevicesPath, bdf, "device") contents, err := ioutil.ReadFile(ifaceDevicePath) if err != nil { return nil, err } deviceID := strings.TrimSpace(string(contents)) // Vendor id ifaceVendorPath := filepath.Join(sysPCIDevicesPath, bdf, "vendor") contents, err = ioutil.ReadFile(ifaceVendorPath) if err != nil { return nil, err } vendorID := strings.TrimSpace(string(contents)) vendorDeviceID := fmt.Sprintf("%s %s", vendorID, deviceID) vendorDeviceID = strings.TrimSpace(vendorDeviceID) physicalEndpoint := &PhysicalEndpoint{ IfaceName: netInfo.Iface.Name, HardAddr: netInfo.Iface.HardwareAddr.String(), VendorDeviceID: vendorDeviceID, EndpointType: PhysicalEndpointType, Driver: driver, BDF: bdf, } return physicalEndpoint, nil } func bindNICToVFIO(endpoint *PhysicalEndpoint) error { return bindDevicetoVFIO(endpoint.BDF, endpoint.Driver, endpoint.VendorDeviceID) } func bindNICToHost(endpoint *PhysicalEndpoint) error { return bindDevicetoHost(endpoint.BDF, endpoint.Driver, endpoint.VendorDeviceID) } // network is the virtcontainers network interface. 
// Container network plugins are used to set up the virtual network
// between VM netns and the host network physical interface.
type network interface {
	// init initializes the network, setting a new network namespace.
	// NOTE(review): the string and bool results presumably are the network
	// namespace path and whether it was newly created (compare add's
	// netNsPath and netNsCreated parameters) — confirm against the
	// implementations.
	init(config NetworkConfig) (string, bool, error)

	// run runs a callback function in a specified network namespace.
	run(networkNSPath string, cb func() error) error

	// add adds all needed interfaces inside the network namespace.
	add(pod Pod, config NetworkConfig, netNsPath string, netNsCreated bool) (NetworkNamespace, error)

	// remove unbridges and deletes TAP interfaces. It also removes virtual network
	// interfaces and deletes the network namespace.
	remove(pod Pod, networkNS NetworkNamespace) error
}