Files
kata-containers/virtcontainers/network.go
Samuel Ortiz be72b6bd31 virtcontainers: Change all import paths
Some virtcontainers pieces of code are importing virtcontainers
packages. We need to change those paths to point at
kata-containers/runtime/virtcontainers

Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
2018-03-13 01:00:52 +01:00

1395 lines
38 KiB
Go

//
// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package virtcontainers
import (
"encoding/hex"
"encoding/json"
"fmt"
"io/ioutil"
"math/rand"
"net"
"os"
"path/filepath"
"runtime"
"strings"
"time"
"github.com/containernetworking/plugins/pkg/ns"
"github.com/kata-containers/runtime/virtcontainers/pkg/ethtool"
"github.com/kata-containers/runtime/virtcontainers/pkg/uuid"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netns"
"golang.org/x/sys/unix"
)
// NetInterworkingModel identifies how a container network interface is
// connected to the virtual machine.
type NetInterworkingModel int

const (
	// NetXConnectDefaultModel asks for whatever DefaultNetInterworkingModel
	// currently selects.
	NetXConnectDefaultModel NetInterworkingModel = iota

	// NetXConnectBridgedModel uses a linux bridge to interconnect
	// the container interface to the VM. This is the
	// safe default that works for most cases except
	// macvlan and ipvlan.
	NetXConnectBridgedModel

	// NetXConnectMacVtapModel can be used when the container network
	// interface can be bridged using macvtap.
	NetXConnectMacVtapModel

	// NetXConnectEnlightenedModel can be used when the network plugins
	// are enlightened to create VM native interfaces when requested by
	// the runtime. This will be used for vethtap, macvtap, ipvtap.
	NetXConnectEnlightenedModel

	// NetXConnectInvalidModel is a sentinel: it is the first value that
	// IsValid() rejects and must stay last in this list.
	NetXConnectInvalidModel
)

// IsValid reports whether n is one of the declared models, i.e. lies in
// the half-open range [NetXConnectDefaultModel, NetXConnectInvalidModel).
func (n NetInterworkingModel) IsValid() bool {
	if n < NetXConnectDefaultModel {
		return false
	}
	return n < NetXConnectInvalidModel
}

// SetModel sets the model from its textual name ("default", "bridged",
// "macvtap" or "enlightened"). "default" resolves to the current value of
// DefaultNetInterworkingModel. For any other name an error is returned and
// the receiver is left untouched.
func (n *NetInterworkingModel) SetModel(modelName string) error {
	var selected NetInterworkingModel

	switch modelName {
	case "default":
		selected = DefaultNetInterworkingModel
	case "bridged":
		selected = NetXConnectBridgedModel
	case "macvtap":
		selected = NetXConnectMacVtapModel
	case "enlightened":
		selected = NetXConnectEnlightenedModel
	default:
		return fmt.Errorf("Unknown type %s", modelName)
	}

	*n = selected
	return nil
}

// DefaultNetInterworkingModel is the package level default that determines
// how the VM is connected to the container network interface when no
// explicit model is requested.
var DefaultNetInterworkingModel = NetXConnectMacVtapModel
// Constants related to networking.
const (
// defaultRouteDest and defaultRouteLabel are not referenced in the visible
// part of this file.
// NOTE(review): presumably the IPv4 catch-all destination and the label
// used for it - confirm at the use sites.
defaultRouteDest = "0.0.0.0/0"
defaultRouteLabel = "default"
// defaultFilePerms is the permission argument createFds passes to
// os.OpenFile.
defaultFilePerms = 0600
// defaultQlen is the TxQLen createLink falls back to when the expected
// macvtap link does not carry a positive one.
defaultQlen = 1500
// defaultQueues is the queue count requested for tuntap links and the
// number of macvtap and vhost-net descriptors opened per interface pair.
defaultQueues = 8
)
// DNSInfo describes the DNS setup related to a network interface.
// NOTE(review): the field names suggest resolv.conf style settings
// (name servers, domain, search list, options) - confirm against the
// code that fills this structure.
type DNSInfo struct {
Servers []string
Domain string
Searches []string
Options []string
}
// NetlinkIface fully describes a network interface: the netlink link
// attributes plus the link type string (networkInfoFromLink fills Type
// from link.Type()).
type NetlinkIface struct {
netlink.LinkAttrs
Type string
}
// NetworkInfo gathers all information related to a network interface.
// It can be used to store the description of the underlying network.
// networkInfoFromLink populates Iface, Addrs and Routes from netlink; it
// leaves DNS at its zero value.
type NetworkInfo struct {
Iface NetlinkIface
Addrs []netlink.Addr
Routes []netlink.Route
DNS DNSInfo
}
// NetworkInterface defines a network interface.
type NetworkInterface struct {
// Name is the interface name used for netlink lookups.
Name string
// HardAddr is the MAC address in textual form.
HardAddr string
// Addrs holds addresses saved for the interface; tapNetworkPair stores
// the IPv4 addresses it removes from the veth here so that
// untapNetworkPair can restore them.
Addrs []netlink.Addr
}
// NetworkInterfacePair defines a pair between VM and virtual network interfaces.
type NetworkInterfacePair struct {
ID string
// Name is used as the bridge name by bridgeNetworkPair/unBridgeNetworkPair.
Name string
// VirtIface is the container side (veth) interface.
VirtIface NetworkInterface
// TAPIface is the tap/macvtap interface handed to the VM.
TAPIface NetworkInterface
// The embedded model selects how xconnectVMNetwork wires the pair.
NetInterworkingModel
// VMFds are the tap (bridged model) or /dev/tapN (macvtap model) queue
// descriptors.
VMFds []*os.File
// VhostFds are descriptors opened on /dev/vhost-net.
VhostFds []*os.File
}
// NetworkConfig is the network configuration related to a network.
type NetworkConfig struct {
// NetNSPath is the path of an existing network namespace; when empty,
// initNetworkCommon creates a new one.
NetNSPath string
NumInterfaces int
InterworkingModel NetInterworkingModel
}
// Endpoint represents a physical or virtual network interface.
// It is implemented in this file by *VirtualEndpoint, *PhysicalEndpoint
// and *VhostUserEndpoint.
type Endpoint interface {
// Properties returns the stored network description of the endpoint.
Properties() NetworkInfo
// Name returns the interface name of the endpoint.
Name() string
// HardwareAddr returns the MAC address of the endpoint as a string.
HardwareAddr() string
// Type returns the concrete endpoint kind.
Type() EndpointType
// SetProperties replaces the stored network description.
SetProperties(NetworkInfo)
// Attach prepares the endpoint and adds it to the given hypervisor.
Attach(hypervisor) error
// Detach undoes what Attach set up on the host side.
Detach() error
}
// VirtualEndpoint gathers a network pair and its properties.
type VirtualEndpoint struct {
// NetPair is the veth/tap pair cross-connected on Attach.
NetPair NetworkInterfacePair
EndpointProperties NetworkInfo
Physical bool
EndpointType EndpointType
}
// PhysicalEndpoint gathers a physical network interface and its properties.
type PhysicalEndpoint struct {
IfaceName string
HardAddr string
EndpointProperties NetworkInfo
EndpointType EndpointType
// BDF is copied into the VFIODevice given to the hypervisor on Attach.
// NOTE(review): presumably the PCI bus:device.function address - confirm.
BDF string
Driver string
VendorDeviceID string
}
// VhostUserEndpoint represents a vhost-user socket based network interface.
type VhostUserEndpoint struct {
// Path to the vhost-user socket on the host system
SocketPath string
// MAC address of the interface
HardAddr string
// Name of the interface
IfaceName string
EndpointProperties NetworkInfo
EndpointType EndpointType
}
// Properties returns the network description stored for the endpoint.
func (endpoint *VirtualEndpoint) Properties() NetworkInfo {
return endpoint.EndpointProperties
}
// Name returns the name of the veth interface in the network pair.
func (endpoint *VirtualEndpoint) Name() string {
return endpoint.NetPair.VirtIface.Name
}
// HardwareAddr returns the MAC address that is assigned to the tap interface
// in the network pair.
func (endpoint *VirtualEndpoint) HardwareAddr() string {
return endpoint.NetPair.TAPIface.HardAddr
}
// Type returns the endpoint type stored in the endpoint.
func (endpoint *VirtualEndpoint) Type() EndpointType {
return endpoint.EndpointType
}
// SetProperties replaces the network description stored for the endpoint.
func (endpoint *VirtualEndpoint) SetProperties(properties NetworkInfo) {
endpoint.EndpointProperties = properties
}
// networkLogger returns the package logger tagged with the "network"
// subsystem field.
func networkLogger() *logrus.Entry {
return virtLog.WithField("subsystem", "network")
}
// Attach cross-connects the network pair of the virtual endpoint (bridge or
// macvtap, depending on the pair's interworking model) and then adds the
// endpoint to the hypervisor as a network device.
func (endpoint *VirtualEndpoint) Attach(h hypervisor) error {
	networkLogger().Info("Attaching virtual endpoint")

	err := xconnectVMNetwork(&endpoint.NetPair, true)
	if err != nil {
		networkLogger().WithError(err).Error("Error bridging virtual ep")
		return err
	}

	return h.addDevice(endpoint, netDev)
}
// Detach tears down the cross-connection (tap and, for the bridged model,
// the bridge) that Attach created for the veth interface.
func (endpoint *VirtualEndpoint) Detach() error {
	networkLogger().Info("Detaching virtual endpoint")

	pair := &endpoint.NetPair
	return xconnectVMNetwork(pair, false)
}
// Properties returns the network description stored for the endpoint.
func (endpoint *VhostUserEndpoint) Properties() NetworkInfo {
return endpoint.EndpointProperties
}
// Name returns the name of the interface.
func (endpoint *VhostUserEndpoint) Name() string {
return endpoint.IfaceName
}
// HardwareAddr returns the MAC address of the vhostuser network interface.
func (endpoint *VhostUserEndpoint) HardwareAddr() string {
return endpoint.HardAddr
}
// Type returns the endpoint type stored in the endpoint.
func (endpoint *VhostUserEndpoint) Type() EndpointType {
return endpoint.EndpointType
}
// SetProperties replaces the network description stored for the endpoint.
func (endpoint *VhostUserEndpoint) SetProperties(properties NetworkInfo) {
endpoint.EndpointProperties = properties
}
// Attach adds the vhost-user endpoint to the hypervisor. A random 8 byte
// identifier, hex encoded, is generated for the device; the endpoint's MAC
// address and socket path are passed along with it.
func (endpoint *VhostUserEndpoint) Attach(h hypervisor) error {
	networkLogger().Info("Attaching vhostuser based endpoint")

	// The ID must be unique as it ends up in hypervisor command line fields.
	idBytes, err := generateRandomBytes(8)
	if err != nil {
		return err
	}

	dev := VhostUserNetDevice{
		MacAddress: endpoint.HardAddr,
	}
	dev.ID = hex.EncodeToString(idBytes)
	dev.SocketPath = endpoint.SocketPath

	return h.addDevice(dev, vhostuserDev)
}
// Detach for vhostuser endpoint only logs the request; there is nothing
// to undo on the host side and it always returns nil.
func (endpoint *VhostUserEndpoint) Detach() error {
networkLogger().Info("Detaching vhostuser based endpoint")
return nil
}
// createVhostUserEndpoint builds a vhost-user endpoint for the given socket
// path, taking the interface name and MAC address from netInfo. The
// returned error is always nil.
func createVhostUserEndpoint(netInfo NetworkInfo, socket string) (*VhostUserEndpoint, error) {
	iface := netInfo.Iface

	return &VhostUserEndpoint{
		SocketPath:   socket,
		HardAddr:     iface.HardwareAddr.String(),
		IfaceName:    iface.Name,
		EndpointType: VhostUserEndpointType,
	}, nil
}
// Properties returns the network description stored for the physical interface.
func (endpoint *PhysicalEndpoint) Properties() NetworkInfo {
return endpoint.EndpointProperties
}
// HardwareAddr returns the MAC address of the physical network interface.
func (endpoint *PhysicalEndpoint) HardwareAddr() string {
return endpoint.HardAddr
}
// Name returns the name of the physical interface.
func (endpoint *PhysicalEndpoint) Name() string {
return endpoint.IfaceName
}
// Type returns the endpoint type stored in the endpoint.
func (endpoint *PhysicalEndpoint) Type() EndpointType {
return endpoint.EndpointType
}
// SetProperties replaces the network description stored for the physical endpoint.
func (endpoint *PhysicalEndpoint) SetProperties(properties NetworkInfo) {
endpoint.EndpointProperties = properties
}
// Attach binds the physical network interface to vfio-pci and adds it to
// the hypervisor as a VFIO passthrough device identified by its BDF.
func (endpoint *PhysicalEndpoint) Attach(h hypervisor) error {
	networkLogger().Info("Attaching physical endpoint")

	// The interface has to leave its host driver and be bound to vfio
	// before it can be handed over to qemu.
	err := bindNICToVFIO(endpoint)
	if err != nil {
		return err
	}

	return h.addDevice(VFIODevice{BDF: endpoint.BDF}, vfioDev)
}
// Detach for physical endpoint unbinds the physical network interface from vfio-pci
// and binds it back to the saved host driver. The work is delegated to
// bindNICToHost, whose error is returned unchanged.
func (endpoint *PhysicalEndpoint) Detach() error {
// Bind back the physical network interface to host.
networkLogger().Info("Detaching physical endpoint")
return bindNICToHost(endpoint)
}
// EndpointType identifies the type of the network endpoint.
type EndpointType string

const (
	// PhysicalEndpointType is the physical network interface.
	PhysicalEndpointType EndpointType = "physical"

	// VirtualEndpointType is the virtual network interface.
	VirtualEndpointType EndpointType = "virtual"

	// VhostUserEndpointType is the vhostuser network interface.
	VhostUserEndpointType EndpointType = "vhost-user"
)

// Set assigns the endpoint type named by value. Values other than
// "physical", "virtual" and "vhost-user" yield an error and leave the
// receiver unchanged.
func (endpointType *EndpointType) Set(value string) error {
	switch candidate := EndpointType(value); candidate {
	case PhysicalEndpointType, VirtualEndpointType, VhostUserEndpointType:
		*endpointType = candidate
		return nil
	}

	return fmt.Errorf("Unknown endpoint type %s", value)
}

// String returns the textual form of a known endpoint type and the empty
// string for anything else.
func (endpointType *EndpointType) String() string {
	switch current := *endpointType; current {
	case PhysicalEndpointType, VirtualEndpointType, VhostUserEndpointType:
		return string(current)
	}

	return ""
}
// NetworkNamespace contains all data related to its network namespace.
// It has custom JSON (un)marshalling because Endpoints holds interface
// values.
type NetworkNamespace struct {
// NetNsPath is the path of the network namespace.
NetNsPath string
NetNsCreated bool
// Endpoints are the interfaces attached or detached as a group by
// addNetworkCommon/removeNetworkCommon.
Endpoints []Endpoint
}
// TypedJSONEndpoint is used as an intermediate representation for
// marshalling and unmarshalling Endpoint objects.
type TypedJSONEndpoint struct {
// Type tells the decoder which concrete endpoint struct Data holds.
Type EndpointType
// Data is the JSON encoding of the concrete endpoint.
Data json.RawMessage
}
// MarshalJSON is the custom NetworkNamespace JSON marshalling routine.
// Each endpoint is encoded together with its type (see TypedJSONEndpoint)
// so that UnmarshalJSON can rebuild the right concrete type behind the
// Endpoint interface.
//
// An error encoding any endpoint aborts the whole operation instead of
// silently storing an endpoint with empty data.
func (n NetworkNamespace) MarshalJSON() ([]byte, error) {
	// We need a shadow structure in order to prevent json from
	// entering a recursive loop when only calling json.Marshal().
	type shadow struct {
		NetNsPath    string
		NetNsCreated bool
		Endpoints    []TypedJSONEndpoint
	}

	s := &shadow{
		NetNsPath:    n.NetNsPath,
		NetNsCreated: n.NetNsCreated,
	}

	// Left nil when there are no endpoints so that the encoded form
	// ("Endpoints": null) stays what it has always been.
	var typedEndpoints []TypedJSONEndpoint
	for _, endpoint := range n.Endpoints {
		tempJSON, err := json.Marshal(endpoint)
		if err != nil {
			return nil, err
		}

		typedEndpoints = append(typedEndpoints, TypedJSONEndpoint{
			Type: endpoint.Type(),
			Data: tempJSON,
		})
	}
	s.Endpoints = typedEndpoints

	return json.Marshal(s)
}
// UnmarshalJSON is the custom NetworkNamespace unmarshalling routine.
// This is needed for unmarshalling the Endpoints interfaces array: every
// element is first decoded as a TypedJSONEndpoint and then, based on its
// type, into the matching concrete endpoint structure.
//
// A document without an "Endpoints" key (or with a null one) yields a
// namespace with no endpoints. Elements of unknown type are logged and
// skipped.
func (n *NetworkNamespace) UnmarshalJSON(b []byte) error {
	var s struct {
		NetNsPath    string
		NetNsCreated bool
		Endpoints    json.RawMessage
	}

	if err := json.Unmarshal(b, &s); err != nil {
		return err
	}

	n.NetNsPath = s.NetNsPath
	n.NetNsCreated = s.NetNsCreated

	var typedEndpoints []TypedJSONEndpoint
	// The raw message is empty when the key is absent; decoding an empty
	// buffer would fail with "unexpected end of JSON input".
	if len(s.Endpoints) > 0 {
		if err := json.Unmarshal(s.Endpoints, &typedEndpoints); err != nil {
			return err
		}
	}

	var endpoints []Endpoint

	for _, e := range typedEndpoints {
		switch e.Type {
		case PhysicalEndpointType:
			var endpoint PhysicalEndpoint
			if err := json.Unmarshal(e.Data, &endpoint); err != nil {
				return err
			}

			endpoints = append(endpoints, &endpoint)
			virtLog.Infof("Physical endpoint unmarshalled [%v]", endpoint)

		case VirtualEndpointType:
			var endpoint VirtualEndpoint
			if err := json.Unmarshal(e.Data, &endpoint); err != nil {
				return err
			}

			endpoints = append(endpoints, &endpoint)
			virtLog.Infof("Virtual endpoint unmarshalled [%v]", endpoint)

		case VhostUserEndpointType:
			var endpoint VhostUserEndpoint
			if err := json.Unmarshal(e.Data, &endpoint); err != nil {
				return err
			}

			endpoints = append(endpoints, &endpoint)
			virtLog.Infof("VhostUser endpoint unmarshalled [%v]", endpoint)

		default:
			virtLog.Errorf("Unknown endpoint type received %s\n", e.Type)
		}
	}

	n.Endpoints = endpoints
	return nil
}
// NetworkModel describes the type of network specification.
type NetworkModel string

const (
	// NoopNetworkModel is the No-Op network.
	NoopNetworkModel NetworkModel = "noop"

	// CNINetworkModel is the CNI network.
	CNINetworkModel NetworkModel = "CNI"

	// CNMNetworkModel is the CNM network.
	CNMNetworkModel NetworkModel = "CNM"
)

// Set assigns the network model named by value. Values other than "noop",
// "CNI" and "CNM" yield an error and leave the receiver unchanged.
func (networkType *NetworkModel) Set(value string) error {
	switch candidate := NetworkModel(value); candidate {
	case NoopNetworkModel, CNINetworkModel, CNMNetworkModel:
		*networkType = candidate
		return nil
	}

	return fmt.Errorf("Unknown network type %s", value)
}

// String returns the textual form of a known network model and the empty
// string for anything else.
func (networkType *NetworkModel) String() string {
	switch current := *networkType; current {
	case NoopNetworkModel, CNINetworkModel, CNMNetworkModel:
		return string(current)
	}

	return ""
}
// newNetwork returns the network implementation matching networkType.
// Anything that is not the CNI or CNM model, including unknown values,
// gets the no-op network.
func newNetwork(networkType NetworkModel) network {
	if networkType == CNINetworkModel {
		return &cni{}
	}

	if networkType == CNMNetworkModel {
		return &cnm{}
	}

	return &noopNetwork{}
}
// initNetworkCommon returns the network namespace path to use for config
// and whether that namespace was created by this call.
//
// When config.NetNSPath is set it is returned as is with created == false;
// otherwise a new namespace is created and its path returned with
// created == true.
//
// config is received by value, so nothing assigned to it here can reach the
// caller. The previous body reset an invalid or default InterworkingModel
// on that local copy, which had no effect and has been removed. The
// NetXConnectDefaultModel case is resolved by xconnectVMNetwork when the
// pair is connected; making the normalisation of invalid models effective
// would require passing config by pointer.
func initNetworkCommon(config NetworkConfig) (string, bool, error) {
	if config.NetNSPath != "" {
		return config.NetNSPath, false, nil
	}

	path, err := createNetNS()
	if err != nil {
		return "", false, err
	}

	return path, true, nil
}
func runNetworkCommon(networkNSPath string, cb func() error) error {
if networkNSPath == "" {
return fmt.Errorf("networkNSPath cannot be empty")
}
return doNetNS(networkNSPath, func(_ ns.NetNS) error {
return cb()
})
}
// addNetworkCommon attaches every endpoint of networkNS to the pod's
// hypervisor, from within the network namespace. It stops at the first
// endpoint that fails to attach and returns that error.
func addNetworkCommon(pod Pod, networkNS *NetworkNamespace) error {
	attachAll := func(_ ns.NetNS) error {
		for _, ep := range networkNS.Endpoints {
			err := ep.Attach(pod.hypervisor)
			if err != nil {
				return err
			}
		}

		return nil
	}

	return doNetNS(networkNS.NetNsPath, attachAll)
}
// removeNetworkCommon detaches every endpoint of networkNS from within the
// network namespace. It stops at the first endpoint that fails to detach
// and returns that error.
func removeNetworkCommon(networkNS NetworkNamespace) error {
	detachAll := func(_ ns.NetNS) error {
		for _, ep := range networkNS.Endpoints {
			err := ep.Detach()
			if err != nil {
				return err
			}
		}

		return nil
	}

	return doNetNS(networkNS.NetNsPath, detachAll)
}
// createLink adds a link called name whose kind is taken from
// expectedLink (bridge, tuntap or macvtap) and returns the link as read
// back from the kernel.
//
//   - bridge: copies the MulticastSnooping setting of expectedLink.
//   - tuntap: a multi-queue TAP with defaultQueues queues and a vnet header;
//     the queue descriptors are returned as the second result.
//   - macvtap: bridge mode, taking Index, ParentIndex and TxQLen from
//     expectedLink (TxQLen falls back to defaultQlen when not positive).
//
// The descriptor slice is nil for every kind but tuntap.
func createLink(netHandle *netlink.Handle, name string, expectedLink netlink.Link) (netlink.Link, []*os.File, error) {
	var request netlink.Link

	linkType := expectedLink.Type()

	switch linkType {
	case (&netlink.Bridge{}).Type():
		request = &netlink.Bridge{
			LinkAttrs:         netlink.LinkAttrs{Name: name},
			MulticastSnooping: expectedLink.(*netlink.Bridge).MulticastSnooping,
		}

	case (&netlink.Tuntap{}).Type():
		request = &netlink.Tuntap{
			LinkAttrs: netlink.LinkAttrs{Name: name},
			Mode:      netlink.TUNTAP_MODE_TAP,
			Queues:    defaultQueues,
			Flags:     netlink.TUNTAP_MULTI_QUEUE_DEFAULTS | netlink.TUNTAP_VNET_HDR,
		}

	case (&netlink.Macvtap{}).Type():
		wanted := expectedLink.Attrs()

		txQLen := wanted.TxQLen
		if txQLen <= 0 {
			txQLen = defaultQlen
		}

		attrs := netlink.LinkAttrs{
			Index:       wanted.Index,
			Name:        name,
			TxQLen:      txQLen,
			ParentIndex: wanted.ParentIndex,
		}

		request = &netlink.Macvtap{
			Macvlan: netlink.Macvlan{
				Mode:      netlink.MACVLAN_MODE_BRIDGE,
				LinkAttrs: attrs,
			},
		}

	default:
		return nil, nil, fmt.Errorf("Unsupported link type %s", linkType)
	}

	if err := netHandle.LinkAdd(request); err != nil {
		return nil, nil, fmt.Errorf("LinkAdd() failed for %s name %s: %s", linkType, name, err)
	}

	// Only tuntap links come back from LinkAdd with queue descriptors.
	var fds []*os.File
	if tap, isTap := request.(*netlink.Tuntap); isTap {
		fds = tap.Fds
	}

	created, err := getLinkByName(netHandle, name, expectedLink)
	return created, fds, err
}
// getLinkByName looks up the link called name and checks that its concrete
// type matches the kind of expectedLink. Supported kinds are bridge, tuntap
// (which netlink reports as a GenericLink), veth and macvtap. A lookup
// failure, an unsupported kind or a type mismatch each produce an error.
func getLinkByName(netHandle *netlink.Handle, name string, expectedLink netlink.Link) (netlink.Link, error) {
	wanted := expectedLink.Type()

	found, err := netHandle.LinkByName(name)
	if err != nil {
		return nil, fmt.Errorf("LinkByName() failed for %s name %s: %s", wanted, name, err)
	}

	matches := false

	switch wanted {
	case (&netlink.Bridge{}).Type():
		_, matches = found.(*netlink.Bridge)
	case (&netlink.Tuntap{}).Type():
		_, matches = found.(*netlink.GenericLink)
	case (&netlink.Veth{}).Type():
		_, matches = found.(*netlink.Veth)
	case (&netlink.Macvtap{}).Type():
		_, matches = found.(*netlink.Macvtap)
	default:
		return nil, fmt.Errorf("Unsupported link type %s", wanted)
	}

	if !matches {
		return nil, fmt.Errorf("Incorrect link type %s, expecting %s", found.Type(), wanted)
	}

	return found, nil
}
// xconnectVMNetwork connects (connect == true) or disconnects the network
// pair according to its interworking model. A pair still set to
// NetXConnectDefaultModel is first switched, in place, to
// DefaultNetInterworkingModel. The enlightened model and any value outside
// the known models are rejected with an error.
func xconnectVMNetwork(netPair *NetworkInterfacePair, connect bool) error {
	if netPair.NetInterworkingModel == NetXConnectDefaultModel {
		netPair.NetInterworkingModel = DefaultNetInterworkingModel
	}

	switch model := netPair.NetInterworkingModel; model {
	case NetXConnectBridgedModel:
		if !connect {
			return unBridgeNetworkPair(*netPair)
		}
		return bridgeNetworkPair(netPair)

	case NetXConnectMacVtapModel:
		if !connect {
			return untapNetworkPair(*netPair)
		}
		return tapNetworkPair(netPair)

	case NetXConnectEnlightenedModel:
		return fmt.Errorf("Unsupported networking model")
	}

	return fmt.Errorf("Invalid internetworking model")
}
// createMacvtapFds opens the character device /dev/tap<linkIndex> queues
// times and returns the resulting descriptors.
func createMacvtapFds(linkIndex int, queues int) ([]*os.File, error) {
	return createFds(fmt.Sprintf("/dev/tap%d", linkIndex), queues)
}
// createVhostFds opens /dev/vhost-net numFds times and returns the
// resulting descriptors.
func createVhostFds(numFds int) ([]*os.File, error) {
	const vhostNetDevice = "/dev/vhost-net"

	return createFds(vhostNetDevice, numFds)
}
// createFds opens device read-write numFds times. If any open fails, the
// descriptors opened so far are handed to cleanupFds and the error is
// returned with a nil slice.
func createFds(device string, numFds int) ([]*os.File, error) {
	fds := make([]*os.File, numFds)

	for idx := range fds {
		file, err := os.OpenFile(device, os.O_RDWR, defaultFilePerms)
		if err != nil {
			// idx is the number of descriptors opened before the failure.
			cleanupFds(fds, idx)
			return nil, err
		}

		fds[idx] = file
	}

	return fds, nil
}
// There is a limitation in the linux kernel that prevents a macvtap/macvlan link
// from getting the correct link index when created in a network namespace
// https://github.com/clearcontainers/runtime/issues/708
//
// Until that bug is fixed we need to pick a random non conflicting index and try to
// create a link. If that fails, we need to try with another one.
// Also, the kernel does not check if the link id conflicts with a link id on the host,
// hence we need to offset the link id to prevent any overlaps with the host index.
//
// Here the kernel will ensure that there is no race condition
const hostLinkOffset = 8192 // Host should not have more than 8k interfaces
const linkRange = 0xFFFF // Mask applied to the random index: allows up to 2^16 containers
const linkRetries = 128 // The number of times we try to find a non conflicting index
const macvtapWorkaround = true // When false, createMacVtap lets the kernel choose the index
// createMacVtap creates a macvtap link called name from the template link.
//
// With macvtapWorkaround enabled, the link index is not left to the
// kernel: a random index in [hostLinkOffset, hostLinkOffset+linkRange] is
// written into the template and creation is retried with a fresh index up
// to linkRetries times. The result of the last attempt is returned.
func createMacVtap(netHandle *netlink.Handle, name string, link netlink.Link) (taplink netlink.Link, err error) {
	if !macvtapWorkaround {
		taplink, _, err = createLink(netHandle, name, link)
		return taplink, err
	}

	rng := rand.New(rand.NewSource(time.Now().UnixNano()))

	for attempt := 0; attempt < linkRetries; attempt++ {
		link.Attrs().Index = hostLinkOffset + (rng.Int() & linkRange)

		if taplink, _, err = createLink(netHandle, name, link); err == nil {
			return taplink, nil
		}
	}

	return taplink, err
}
// clearIPs removes every address in addrs from link, stopping at and
// returning the first error.
func clearIPs(link netlink.Link, addrs []netlink.Addr) error {
	for i := range addrs {
		// Work on a copy so the caller's slice element is never handed out.
		current := addrs[i]

		err := netlink.AddrDel(link, &current)
		if err != nil {
			return err
		}
	}

	return nil
}
// setIPs adds every address in addrs to link, stopping at and returning
// the first error.
func setIPs(link netlink.Link, addrs []netlink.Addr) error {
	for i := range addrs {
		// Work on a copy so the caller's slice element is never handed out.
		current := addrs[i]

		err := netlink.AddrAdd(link, &current)
		if err != nil {
			return err
		}
	}

	return nil
}
// tapNetworkPair connects the veth interface of netPair to the VM through
// a macvtap link stacked on top of it.
//
// In order, it:
//   - creates the macvtap link with the veth as parent;
//   - records the veth's current MAC address as the TAP (VM side) address
//     and applies the veth's MTU to the macvtap;
//   - gives the veth the address from VirtIface.HardAddr and the macvtap
//     the original veth address;
//   - saves and removes the veth's IPv4 addresses (restored by
//     untapNetworkPair);
//   - brings both links up and opens the macvtap and vhost-net descriptors,
//     stored in netPair.VMFds and netPair.VhostFds.
//
// Links and descriptors created before a failing step are left in place.
func tapNetworkPair(netPair *NetworkInterfacePair) error {
	netHandle, err := netlink.NewHandle()
	if err != nil {
		return err
	}
	defer netHandle.Delete()

	vethLink, err := getLinkByName(netHandle, netPair.VirtIface.Name, &netlink.Veth{})
	if err != nil {
		return fmt.Errorf("Could not get veth interface: %s: %s", netPair.VirtIface.Name, err)
	}
	vethLinkAttrs := vethLink.Attrs()

	// Attach the macvtap interface to the underlying container
	// interface. Also picks relevant attributes from the parent
	tapLink, err := createMacVtap(netHandle, netPair.TAPIface.Name,
		&netlink.Macvtap{
			Macvlan: netlink.Macvlan{
				LinkAttrs: netlink.LinkAttrs{
					TxQLen:      vethLinkAttrs.TxQLen,
					ParentIndex: vethLinkAttrs.Index,
				},
			},
		})
	if err != nil {
		return fmt.Errorf("Could not create TAP interface: %s", err)
	}

	// Save the veth MAC address to the TAP so that it can later be used
	// to build the hypervisor command line. This MAC address has to be
	// the one inside the VM in order to avoid any firewall issues. The
	// bridge created by the network plugin on the host actually expects
	// to see traffic from this MAC address and not another one.
	tapHardAddr := vethLinkAttrs.HardwareAddr
	netPair.TAPIface.HardAddr = vethLinkAttrs.HardwareAddr.String()

	if err := netHandle.LinkSetMTU(tapLink, vethLinkAttrs.MTU); err != nil {
		return fmt.Errorf("Could not set TAP MTU %d: %s", vethLinkAttrs.MTU, err)
	}

	hardAddr, err := net.ParseMAC(netPair.VirtIface.HardAddr)
	if err != nil {
		return err
	}
	if err := netHandle.LinkSetHardwareAddr(vethLink, hardAddr); err != nil {
		return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s",
			netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err)
	}

	// Report the TAP interface and the address actually being applied;
	// this message used to name the veth interface and its address.
	if err := netHandle.LinkSetHardwareAddr(tapLink, tapHardAddr); err != nil {
		return fmt.Errorf("Could not set MAC address %s for TAP interface %s: %s",
			tapHardAddr, netPair.TAPIface.Name, err)
	}

	if err := netHandle.LinkSetUp(tapLink); err != nil {
		return fmt.Errorf("Could not enable TAP %s: %s", netPair.TAPIface.Name, err)
	}

	// Clear the IP addresses from the veth interface to prevent ARP conflict
	netPair.VirtIface.Addrs, err = netlink.AddrList(vethLink, netlink.FAMILY_V4)
	if err != nil {
		return fmt.Errorf("Unable to obtain veth IP addresses: %s", err)
	}

	if err := clearIPs(vethLink, netPair.VirtIface.Addrs); err != nil {
		return fmt.Errorf("Unable to clear veth IP addresses: %s", err)
	}

	if err := netHandle.LinkSetUp(vethLink); err != nil {
		return fmt.Errorf("Could not enable veth %s: %s", netPair.VirtIface.Name, err)
	}

	// Note: The underlying interfaces need to be up prior to fd creation.

	// Setup the multiqueue fds to be consumed by QEMU as macvtap cannot
	// be directly connected.
	// Ideally we want
	// netdev.FDs, err = createMacvtapFds(netdev.ID, int(config.SMP.CPUs))

	// We do not have global context here, hence a manifest constant
	// that matches our minimum vCPU configuration
	// Another option is to defer this to ciao qemu library which does have
	// global context but cannot handle errors when setting up the network
	netPair.VMFds, err = createMacvtapFds(tapLink.Attrs().Index, defaultQueues)
	if err != nil {
		// Name the interface rather than formatting the whole structure.
		return fmt.Errorf("Could not setup macvtap fds %s: %s", netPair.TAPIface.Name, err)
	}

	vhostFds, err := createVhostFds(defaultQueues)
	if err != nil {
		return fmt.Errorf("Could not setup vhost fds %s : %s", netPair.VirtIface.Name, err)
	}
	netPair.VhostFds = vhostFds

	return nil
}
// bridgeNetworkPair connects the veth interface of netPair to the VM
// through a TAP interface, both enslaved to a newly created bridge named
// netPair.Name.
//
// The TAP queue descriptors are stored in netPair.VMFds and the vhost-net
// descriptors in netPair.VhostFds. The veth's current MAC address is
// recorded as the TAP (VM side) address and the veth itself is given the
// address from VirtIface.HardAddr. Links and descriptors created before a
// failing step are left in place.
func bridgeNetworkPair(netPair *NetworkInterfacePair) error {
netHandle, err := netlink.NewHandle()
if err != nil {
return err
}
defer netHandle.Delete()
// Multi-queue TAP; the returned descriptors are its queues.
tapLink, fds, err := createLink(netHandle, netPair.TAPIface.Name, &netlink.Tuntap{})
if err != nil {
return fmt.Errorf("Could not create TAP interface: %s", err)
}
netPair.VMFds = fds
vhostFds, err := createVhostFds(defaultQueues)
if err != nil {
return fmt.Errorf("Could not setup vhost fds %s : %s", netPair.VirtIface.Name, err)
}
netPair.VhostFds = vhostFds
vethLink, err := getLinkByName(netHandle, netPair.VirtIface.Name, &netlink.Veth{})
if err != nil {
return fmt.Errorf("Could not get veth interface %s : %s", netPair.VirtIface.Name, err)
}
vethLinkAttrs := vethLink.Attrs()
// Save the veth MAC address to the TAP so that it can later be used
// to build the hypervisor command line. This MAC address has to be
// the one inside the VM in order to avoid any firewall issues. The
// bridge created by the network plugin on the host actually expects
// to see traffic from this MAC address and not another one.
netPair.TAPIface.HardAddr = vethLinkAttrs.HardwareAddr.String()
if err := netHandle.LinkSetMTU(tapLink, vethLinkAttrs.MTU); err != nil {
return fmt.Errorf("Could not set TAP MTU %d: %s", vethLinkAttrs.MTU, err)
}
// The veth takes over the address configured for the pair.
hardAddr, err := net.ParseMAC(netPair.VirtIface.HardAddr)
if err != nil {
return err
}
if err := netHandle.LinkSetHardwareAddr(vethLink, hardAddr); err != nil {
return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s",
netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err)
}
// The bridge is created with multicast snooping disabled.
mcastSnoop := false
bridgeLink, _, err := createLink(netHandle, netPair.Name, &netlink.Bridge{MulticastSnooping: &mcastSnoop})
if err != nil {
return fmt.Errorf("Could not create bridge: %s", err)
}
// Enslave the TAP and the veth to the bridge, bringing each one up.
if err := netHandle.LinkSetMaster(tapLink, bridgeLink.(*netlink.Bridge)); err != nil {
return fmt.Errorf("Could not attach TAP %s to the bridge %s: %s",
netPair.TAPIface.Name, netPair.Name, err)
}
if err := netHandle.LinkSetUp(tapLink); err != nil {
return fmt.Errorf("Could not enable TAP %s: %s", netPair.TAPIface.Name, err)
}
if err := netHandle.LinkSetMaster(vethLink, bridgeLink.(*netlink.Bridge)); err != nil {
return fmt.Errorf("Could not attach veth %s to the bridge %s: %s",
netPair.VirtIface.Name, netPair.Name, err)
}
if err := netHandle.LinkSetUp(vethLink); err != nil {
return fmt.Errorf("Could not enable veth %s: %s", netPair.VirtIface.Name, err)
}
if err := netHandle.LinkSetUp(bridgeLink); err != nil {
return fmt.Errorf("Could not enable bridge %s: %s", netPair.Name, err)
}
return nil
}
// untapNetworkPair undoes tapNetworkPair: it deletes the macvtap link,
// sets the veth interface down and puts back the IP addresses that were
// removed from it.
//
// The veth pair is not totally managed by virtcontainers, so a veth that
// can no longer be found is only logged. In that case there is no link to
// restore addresses on and the function returns nil; handing the missing
// (nil) link to setIPs would dereference a nil netlink.Link.
func untapNetworkPair(netPair NetworkInterfacePair) error {
	netHandle, err := netlink.NewHandle()
	if err != nil {
		return err
	}
	defer netHandle.Delete()

	tapLink, err := getLinkByName(netHandle, netPair.TAPIface.Name, &netlink.Macvtap{})
	if err != nil {
		return fmt.Errorf("Could not get TAP interface %s: %s", netPair.TAPIface.Name, err)
	}

	if err := netHandle.LinkDel(tapLink); err != nil {
		return fmt.Errorf("Could not remove TAP %s: %s", netPair.TAPIface.Name, err)
	}

	vethLink, err := getLinkByName(netHandle, netPair.VirtIface.Name, &netlink.Veth{})
	if err != nil {
		// The veth pair is not totally managed by virtcontainers
		virtLog.Warnf("Could not get veth interface %s: %s", netPair.VirtIface.Name, err)
		return nil
	}

	if err := netHandle.LinkSetDown(vethLink); err != nil {
		return fmt.Errorf("Could not disable veth %s: %s", netPair.VirtIface.Name, err)
	}

	// Restore the IPs that were cleared
	return setIPs(vethLink, netPair.VirtIface.Addrs)
}
// unBridgeNetworkPair undoes bridgeNetworkPair: it sets the bridge and the
// TAP down, detaches the TAP from the bridge, deletes both, and finally
// sets the veth down and detaches it from its master.
//
// A missing TAP or bridge is an error. A missing veth is only logged, as
// the veth pair is not totally managed by virtcontainers.
func unBridgeNetworkPair(netPair NetworkInterfacePair) error {
netHandle, err := netlink.NewHandle()
if err != nil {
return err
}
defer netHandle.Delete()
tapLink, err := getLinkByName(netHandle, netPair.TAPIface.Name, &netlink.Tuntap{})
if err != nil {
return fmt.Errorf("Could not get TAP interface: %s", err)
}
bridgeLink, err := getLinkByName(netHandle, netPair.Name, &netlink.Bridge{})
if err != nil {
return fmt.Errorf("Could not get bridge interface: %s", err)
}
// Quiesce both links before taking the topology apart.
if err := netHandle.LinkSetDown(bridgeLink); err != nil {
return fmt.Errorf("Could not disable bridge %s: %s", netPair.Name, err)
}
if err := netHandle.LinkSetDown(tapLink); err != nil {
return fmt.Errorf("Could not disable TAP %s: %s", netPair.TAPIface.Name, err)
}
if err := netHandle.LinkSetNoMaster(tapLink); err != nil {
return fmt.Errorf("Could not detach TAP %s: %s", netPair.TAPIface.Name, err)
}
if err := netHandle.LinkDel(bridgeLink); err != nil {
return fmt.Errorf("Could not remove bridge %s: %s", netPair.Name, err)
}
if err := netHandle.LinkDel(tapLink); err != nil {
return fmt.Errorf("Could not remove TAP %s: %s", netPair.TAPIface.Name, err)
}
vethLink, err := getLinkByName(netHandle, netPair.VirtIface.Name, &netlink.Veth{})
if err != nil {
// The veth pair is not totally managed by virtcontainers
virtLog.WithError(err).Warn("Could not get veth interface")
} else {
if err := netHandle.LinkSetDown(vethLink); err != nil {
return fmt.Errorf("Could not disable veth %s: %s", netPair.VirtIface.Name, err)
}
if err := netHandle.LinkSetNoMaster(vethLink); err != nil {
return fmt.Errorf("Could not detach veth %s: %s", netPair.VirtIface.Name, err)
}
}
return nil
}
// createNetNS creates a new network namespace and returns its path.
// The namespace handle is not closed here.
// NOTE(review): closing a handle obtained from ns.NewNS presumably also
// unmounts the namespace, which would explain keeping it open - confirm
// against the cni ns package.
func createNetNS() (string, error) {
	created, err := ns.NewNS()
	if err != nil {
		return "", err
	}

	path := created.Path()
	return path, nil
}
// setNetNS moves the calling thread into the network namespace found at
// netNSPath.
//
// The handle opened on the namespace is only needed for the switch itself
// and is closed before returning, so that no file descriptor is left
// behind on every call. The close error is not reported: it cannot affect
// the result of the namespace switch.
func setNetNS(netNSPath string) error {
	n, err := ns.GetNS(netNSPath)
	if err != nil {
		return err
	}
	defer n.Close()

	return n.Set()
}
// doNetNS runs cb inside the network namespace found at netNSPath and
// switches back to the original namespace afterwards.
//
// doNetNS is free from any call to a go routine, and it calls
// into runtime.LockOSThread(), meaning it won't be executed in a
// different thread than the one expected by the caller.
//
// Both namespace handles (the original and the target one) are closed
// before returning; the target handle used to be left open, leaking a file
// descriptor per call. cb must therefore not keep the ns.NetNS it receives
// beyond its own execution.
func doNetNS(netNSPath string, cb func(ns.NetNS) error) error {
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	currentNS, err := ns.GetCurrentNS()
	if err != nil {
		return err
	}
	defer currentNS.Close()

	targetNS, err := ns.GetNS(netNSPath)
	if err != nil {
		return err
	}
	defer targetNS.Close()

	if err := targetNS.Set(); err != nil {
		return err
	}
	// Deferred calls run in reverse order: the thread goes back to the
	// original namespace first, then the handles are closed.
	defer currentNS.Set()

	return cb(targetNS)
}
// deleteNetNS closes a handle on the network namespace at netNSPath and,
// when mounted is true, lazily unmounts the namespace and removes its path.
func deleteNetNS(netNSPath string, mounted bool) error {
	handle, err := ns.GetNS(netNSPath)
	if err != nil {
		return err
	}

	if err := handle.Close(); err != nil {
		return err
	}

	if !mounted {
		return nil
	}

	// This unmount part is supposed to be done in the cni/ns package, but the "mounted"
	// flag is not updated when retrieving NetNs handler from GetNS().
	if err := unix.Unmount(netNSPath, unix.MNT_DETACH); err != nil {
		return fmt.Errorf("Failed to unmount namespace %s: %v", netNSPath, err)
	}

	if err := os.RemoveAll(netNSPath); err != nil {
		return fmt.Errorf("Failed to clean up namespace %s: %v", netNSPath, err)
	}

	return nil
}
// createVirtualNetworkEndpoint builds the virtual endpoint number idx.
//
// The bridge and tap names are derived from idx ("br<idx>", "tap<idx>").
// The veth name is ifName, or "eth<idx>" when ifName is empty. The veth
// MAC address is 02:00:CA:FE followed by the two low-order bytes of idx.
// A negative idx yields an error together with an empty endpoint.
func createVirtualNetworkEndpoint(idx int, ifName string, interworkingModel NetInterworkingModel) (*VirtualEndpoint, error) {
	if idx < 0 {
		return &VirtualEndpoint{}, fmt.Errorf("invalid network endpoint index: %d", idx)
	}

	pairID := uuid.Generate().String()

	mac := net.HardwareAddr{0x02, 0x00, 0xCA, 0xFE, byte(idx >> 8), byte(idx)}

	vethName := ifName
	if vethName == "" {
		vethName = fmt.Sprintf("eth%d", idx)
	}

	// TODO This is too specific. We may need to create multiple
	// end point types here and then decide how to connect them
	// at the time of hypervisor attach and not here
	pair := NetworkInterfacePair{
		ID:   pairID,
		Name: fmt.Sprintf("br%d", idx),
		VirtIface: NetworkInterface{
			Name:     vethName,
			HardAddr: mac.String(),
		},
		TAPIface: NetworkInterface{
			Name: fmt.Sprintf("tap%d", idx),
		},
		NetInterworkingModel: interworkingModel,
	}

	return &VirtualEndpoint{
		NetPair:      pair,
		EndpointType: VirtualEndpointType,
	}, nil
}
// networkInfoFromLink gathers, through handle, the addresses and routes of
// link for every address family and bundles them with the link attributes
// and type into a NetworkInfo.
//
// An empty NetworkInfo and the netlink error are returned if either
// listing fails.
func networkInfoFromLink(handle *netlink.Handle, link netlink.Link) (NetworkInfo, error) {
	linkAddrs, err := handle.AddrList(link, netlink.FAMILY_ALL)
	if err != nil {
		return NetworkInfo{}, err
	}

	linkRoutes, err := handle.RouteList(link, netlink.FAMILY_ALL)
	if err != nil {
		return NetworkInfo{}, err
	}

	var info NetworkInfo
	info.Iface = NetlinkIface{
		LinkAttrs: *(link.Attrs()),
		Type:      link.Type(),
	}
	info.Addrs = linkAddrs
	info.Routes = linkRoutes

	return info, nil
}
// createEndpointsFromScan lists the links of the network namespace at
// networkNSPath and creates one Endpoint for each of them that has at
// least one address and is not a loopback interface.
//
// For every retained link the endpoint kind is chosen from inside the
// namespace (through doNetNS): a physical endpoint when isPhysicalIface
// says so, a vhost-user endpoint when vhostUserSocketPath returns a
// non-empty path, and a virtual endpoint otherwise. Virtual endpoints
// receive the running count of endpoints created so far as their index and
// config.InterworkingModel as their interworking model.
//
// On any failure an empty slice and the error are returned.
func createEndpointsFromScan(networkNSPath string, config NetworkConfig) ([]Endpoint, error) {
	nsHandle, err := netns.GetFromPath(networkNSPath)
	if err != nil {
		return []Endpoint{}, err
	}
	defer nsHandle.Close()

	nlHandle, err := netlink.NewHandleAt(nsHandle)
	if err != nil {
		return []Endpoint{}, err
	}
	defer nlHandle.Delete()

	links, err := nlHandle.LinkList()
	if err != nil {
		return []Endpoint{}, err
	}

	var (
		endpoints []Endpoint
		idx       int
	)

	for _, link := range links {
		netInfo, err := networkInfoFromLink(nlHandle, link)
		if err != nil {
			return []Endpoint{}, err
		}

		// Ignore unconfigured network interfaces (these are either base
		// tunnel devices that are not namespaced like gre0, gretap0,
		// sit0, ipip0, tunl0 or incorrectly setup interfaces) as well
		// as any loopback interface.
		if len(netInfo.Addrs) == 0 || (netInfo.Iface.Flags&net.FlagLoopback) != 0 {
			continue
		}

		var endpoint Endpoint

		// TODO: This is the incoming interface
		// based on the incoming interface we should create
		// an appropriate EndPoint based on interface type
		// This should be a switch
		buildEndpoint := func(_ ns.NetNS) error {
			// Check if interface is a physical interface. Do not
			// create tap interface/bridge if it is.
			isPhysical, err := isPhysicalIface(netInfo.Iface.Name)
			if err != nil {
				return err
			}

			if isPhysical {
				cnmLogger().WithField("interface", netInfo.Iface.Name).Info("Physical network interface found")
				endpoint, err = createPhysicalEndpoint(netInfo)
				return err
			}

			// Check if this is a dummy interface which has a
			// vhost-user socket associated with it
			socketPath, err := vhostUserSocketPath(netInfo)
			if err != nil {
				return err
			}

			if socketPath == "" {
				endpoint, err = createVirtualNetworkEndpoint(idx, netInfo.Iface.Name, config.InterworkingModel)
				return err
			}

			cnmLogger().WithField("interface", netInfo.Iface.Name).Info("VhostUser network interface found")
			endpoint, err = createVhostUserEndpoint(netInfo, socketPath)
			return err
		}

		if err := doNetNS(networkNSPath, buildEndpoint); err != nil {
			return []Endpoint{}, err
		}

		endpoint.SetProperties(netInfo)
		endpoints = append(endpoints, endpoint)
		idx++
	}

	return endpoints, nil
}
// isPhysicalIface checks if an interface is a physical device.
// We use ethtool here to not rely on device sysfs inside the network namespace.
//
// An interface is reported as physical when the bus information returned
// by ethtool splits into exactly three ':' separated tokens, which is what
// a PCI address looks like. "lo" is never physical.
//
// An error is only returned when the ethtool handle cannot be created.
func isPhysicalIface(ifaceName string) (bool, error) {
	if ifaceName == "lo" {
		return false, nil
	}

	ethHandle, err := ethtool.NewEthtool()
	if err != nil {
		return false, err
	}
	// This function runs once per scanned interface; release the handle
	// so that each call does not leave a descriptor behind.
	defer ethHandle.Close()

	bus, err := ethHandle.BusInfo(ifaceName)
	if err != nil {
		// A failed bus lookup is deliberately reported as "not
		// physical" rather than as an error.
		return false, nil
	}

	// Check for a pci bus format
	tokens := strings.Split(bus, ":")
	if len(tokens) != 3 {
		return false, nil
	}

	return true, nil
}
// sysPCIDevicesPath is the sysfs directory under which
// createPhysicalEndpoint looks up the "device" and "vendor" files of a PCI
// device, keyed by its bus address.
var sysPCIDevicesPath = "/sys/bus/pci/devices"
// createPhysicalEndpoint builds a PhysicalEndpoint describing the
// interface named in netInfo.
//
// The bus address (BDF) and driver name are queried through ethtool. The
// device and vendor IDs are read from the "device" and "vendor" files
// under sysPCIDevicesPath/<BDF> and stored as "<vendor> <device>" in
// VendorDeviceID.
//
// It returns nil and the underlying error if the ethtool handle cannot be
// created, if either ethtool query fails, or if either sysfs file cannot
// be read.
func createPhysicalEndpoint(netInfo NetworkInfo) (*PhysicalEndpoint, error) {
	// Get ethtool handle to derive driver and bus
	ethHandle, err := ethtool.NewEthtool()
	if err != nil {
		return nil, err
	}
	// Release the handle on every return path so it is not leaked.
	defer ethHandle.Close()

	// Get BDF
	bdf, err := ethHandle.BusInfo(netInfo.Iface.Name)
	if err != nil {
		return nil, err
	}

	// Get Driver
	driver, err := ethHandle.DriverName(netInfo.Iface.Name)
	if err != nil {
		return nil, err
	}

	// Get vendor and device id from pci space (sys/bus/pci/devices/$bdf)
	ifaceDevicePath := filepath.Join(sysPCIDevicesPath, bdf, "device")
	contents, err := ioutil.ReadFile(ifaceDevicePath)
	if err != nil {
		return nil, err
	}
	deviceID := strings.TrimSpace(string(contents))

	// Vendor id
	ifaceVendorPath := filepath.Join(sysPCIDevicesPath, bdf, "vendor")
	contents, err = ioutil.ReadFile(ifaceVendorPath)
	if err != nil {
		return nil, err
	}
	vendorID := strings.TrimSpace(string(contents))

	// Trim again so that an empty vendor or device file does not leave a
	// stray leading or trailing space in the combined identifier.
	vendorDeviceID := fmt.Sprintf("%s %s", vendorID, deviceID)
	vendorDeviceID = strings.TrimSpace(vendorDeviceID)

	physicalEndpoint := &PhysicalEndpoint{
		IfaceName:      netInfo.Iface.Name,
		HardAddr:       netInfo.Iface.HardwareAddr.String(),
		VendorDeviceID: vendorDeviceID,
		EndpointType:   PhysicalEndpointType,
		Driver:         driver,
		BDF:            bdf,
	}

	return physicalEndpoint, nil
}
// bindNICToVFIO forwards the endpoint's BDF, driver and vendor/device ID
// to bindDevicetoVFIO and returns its result.
func bindNICToVFIO(endpoint *PhysicalEndpoint) error {
	bdf, driver, vendorDeviceID := endpoint.BDF, endpoint.Driver, endpoint.VendorDeviceID

	return bindDevicetoVFIO(bdf, driver, vendorDeviceID)
}
// bindNICToHost forwards the endpoint's BDF, driver and vendor/device ID
// to bindDevicetoHost and returns its result.
func bindNICToHost(endpoint *PhysicalEndpoint) error {
	bdf, driver, vendorDeviceID := endpoint.BDF, endpoint.Driver, endpoint.VendorDeviceID

	return bindDevicetoHost(bdf, driver, vendorDeviceID)
}
// network is the virtcontainers network interface.
// Container network plugins are used to set up the virtual network
// between the VM netns and the host network physical interface.
type network interface {
// init initializes the network, setting a new network namespace.
// NOTE(review): the string and bool results presumably are the namespace
// path and whether it was created by this call, matching the netNsPath
// and netNsCreated parameters of add - confirm against implementations.
init(config NetworkConfig) (string, bool, error)
// run runs a callback function in a specified network namespace.
run(networkNSPath string, cb func() error) error
// add adds all needed interfaces inside the network namespace and
// returns the resulting NetworkNamespace description.
add(pod Pod, config NetworkConfig, netNsPath string, netNsCreated bool) (NetworkNamespace, error)
// remove unbridges and deletes TAP interfaces. It also removes virtual network
// interfaces and deletes the network namespace.
remove(pod Pod, networkNS NetworkNamespace) error
}