Files
kata-containers/virtcontainers/network.go
Archana Shinde 929c4e7e3d network: Change NewNS() call
NewNS() api has been moved to testutils package in the cni
plugin repo.

Signed-off-by: Archana Shinde <archana.m.shinde@intel.com>
2019-10-08 16:16:35 -07:00

1537 lines
41 KiB
Go

// Copyright (c) 2016 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
package virtcontainers
import (
"context"
cryptoRand "crypto/rand"
"encoding/json"
"fmt"
"math/rand"
"net"
"os"
"runtime"
"sort"
"time"
"github.com/containernetworking/plugins/pkg/ns"
"github.com/containernetworking/plugins/pkg/testutils"
opentracing "github.com/opentracing/opentracing-go"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netns"
"golang.org/x/sys/unix"
vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types"
"github.com/kata-containers/runtime/virtcontainers/pkg/uuid"
"github.com/kata-containers/runtime/virtcontainers/utils"
)
// NetInterworkingModel defines the network model connecting
// the network interface to the virtual machine.
//
// It is an integer enumeration whose defined values are the NetXConnect*
// constants; IsValid() reports whether a value is within the defined range.
type NetInterworkingModel int
// Interworking models. Values are assigned sequentially by iota, starting
// at zero with NetXConnectDefaultModel.
const (
// NetXConnectDefaultModel asks to use DefaultNetInterworkingModel.
NetXConnectDefaultModel NetInterworkingModel = iota
// NetXConnectBridgedModel uses a linux bridge to interconnect
// the container interface to the VM. This is the
// safe default that works for most cases except
// macvlan and ipvlan.
NetXConnectBridgedModel
// NetXConnectMacVtapModel can be used when the Container network
// interface can be bridged using macvtap.
NetXConnectMacVtapModel
// NetXConnectEnlightenedModel can be used when the Network plugins
// are enlightened to create VM native interfaces
// when requested by the runtime.
// This will be used for vethtap, macvtap, ipvtap.
NetXConnectEnlightenedModel
// NetXConnectTCFilterModel redirects traffic from the network interface
// provided by the network plugin to a tap interface.
// This works for ipvlan and macvlan as well.
NetXConnectTCFilterModel
// NetXConnectNoneModel can be used when the VM is in the host network namespace.
NetXConnectNoneModel
// NetXConnectInvalidModel is the last item; it is the exclusive upper
// bound used by IsValid() to check valid values.
NetXConnectInvalidModel
)
// IsValid reports whether n lies within the range of defined models: at or
// above zero and strictly below the NetXConnectInvalidModel sentinel.
func (n NetInterworkingModel) IsValid() bool {
	if n < 0 {
		return false
	}
	return n < NetXConnectInvalidModel
}
// Model names accepted by NetInterworkingModel.SetModel().
const (
// defaultNetModelStr selects DefaultNetInterworkingModel.
defaultNetModelStr = "default"
// bridgedNetModelStr selects NetXConnectBridgedModel.
bridgedNetModelStr = "bridged"
// macvtapNetModelStr selects NetXConnectMacVtapModel.
macvtapNetModelStr = "macvtap"
// enlightenedNetModelStr selects NetXConnectEnlightenedModel.
enlightenedNetModelStr = "enlightened"
// tcFilterNetModelStr selects NetXConnectTCFilterModel.
tcFilterNetModelStr = "tcfilter"
// noneNetModelStr selects NetXConnectNoneModel.
noneNetModelStr = "none"
)
// SetModel sets the receiver to the model designated by modelName.
//
// The name "default" resolves to the current value of
// DefaultNetInterworkingModel. An unrecognised name leaves the receiver
// untouched and yields an error.
func (n *NetInterworkingModel) SetModel(modelName string) error {
	var selected NetInterworkingModel

	switch modelName {
	case defaultNetModelStr:
		selected = DefaultNetInterworkingModel
	case bridgedNetModelStr:
		selected = NetXConnectBridgedModel
	case macvtapNetModelStr:
		selected = NetXConnectMacVtapModel
	case enlightenedNetModelStr:
		selected = NetXConnectEnlightenedModel
	case tcFilterNetModelStr:
		selected = NetXConnectTCFilterModel
	case noneNetModelStr:
		selected = NetXConnectNoneModel
	default:
		return fmt.Errorf("Unknown type %s", modelName)
	}

	*n = selected
	return nil
}
// DefaultNetInterworkingModel is a package level default
// that determines how the VM should be connected to
// the container network interface. It is substituted whenever a
// network pair carries NetXConnectDefaultModel.
var DefaultNetInterworkingModel = NetXConnectTCFilterModel
// Constants related to networking.
const (
// defaultFilePerms is the permission argument passed to os.OpenFile
// when opening tap/vhost device nodes.
defaultFilePerms = 0600
// defaultQlen is the transmit queue length given to a macvtap link
// when the expected link does not carry a positive TxQLen.
defaultQlen = 1500
)
// DNSInfo describes the DNS setup related to a network interface.
// NOTE(review): none of these fields is populated in the visible part of
// this file; the per-field meanings below are inferred from the names.
type DNSInfo struct {
// Servers presumably lists the DNS server addresses.
Servers []string
// Domain presumably holds the local domain name.
Domain string
// Searches presumably lists the DNS search domains.
Searches []string
// Options presumably lists resolver options.
Options []string
}
// NetlinkIface fully describes a network interface.
type NetlinkIface struct {
// LinkAttrs holds the netlink attributes of the interface.
netlink.LinkAttrs
// Type is the netlink link type string, as returned by Link.Type().
Type string
}
// NetworkInfo gathers all information related to a network interface.
// It can be used to store the description of the underlying network.
type NetworkInfo struct {
// Iface describes the interface itself (attributes and type).
Iface NetlinkIface
// Addrs lists the addresses configured on the interface.
Addrs []netlink.Addr
// Routes lists the routes associated with the interface.
Routes []netlink.Route
// DNS holds the DNS settings for the interface.
DNS DNSInfo
}
// NetworkInterface defines a network interface.
type NetworkInterface struct {
// Name is the interface name.
Name string
// HardAddr is the MAC address in textual form (it is parsed with
// net.ParseMAC by the functions that program it on a link).
HardAddr string
// Addrs holds IP addresses associated with the interface; the network
// pair setup functions store here the addresses they clear from the
// link so that they can be restored on teardown.
Addrs []netlink.Addr
}
// TapInterface defines a tap interface.
type TapInterface struct {
// ID is a unique identifier (a UUID string when built by
// createNetworkInterfacePair).
ID string
// Name is used as the bridge name by the bridged model.
Name string
// TAPIface describes the tap device itself.
TAPIface NetworkInterface
// VMFds holds the opened tap/macvtap file descriptors.
VMFds []*os.File
// VhostFds holds the opened /dev/vhost-net file descriptors.
VhostFds []*os.File
}
// TuntapInterface defines a tuntap interface.
// NOTE(review): not referenced in the visible part of this file.
type TuntapInterface struct {
// Name is the interface name.
Name string
// TAPIface describes the underlying tap device.
TAPIface NetworkInterface
}
// NetworkInterfacePair defines a pair between VM and virtual network interfaces.
type NetworkInterfacePair struct {
// TapInterface is the VM-facing side of the pair.
TapInterface
// VirtIface is the container-side interface provided in the network
// namespace (looked up by name in getLinkForEndpoint).
VirtIface NetworkInterface
// NetInterworkingModel selects how both sides are connected.
NetInterworkingModel
}
// NetworkConfig is the network configuration related to a network.
// NOTE(review): the fields are not read in the visible part of this file;
// the descriptions below are inferred from the field names.
type NetworkConfig struct {
// NetNSPath is presumably the path of the network namespace to use.
NetNSPath string
// NetNsCreated presumably records that the namespace was created by the runtime.
NetNsCreated bool
// DisableNewNetNs presumably prevents creation of a new network namespace.
DisableNewNetNs bool
// NetmonConfig holds the netmon settings.
NetmonConfig NetmonConfig
// InterworkingModel is the interworking model requested for the network.
InterworkingModel NetInterworkingModel
}
// networkLogger returns the package logger tagged with the "network"
// subsystem field.
func networkLogger() *logrus.Entry {
	entry := virtLog.WithField("subsystem", "network")
	return entry
}
// NetworkNamespace contains all data related to its network namespace.
type NetworkNamespace struct {
// NetNsPath is the path of the network namespace; when empty,
// generateInterfacesAndRoutes returns nothing for it.
NetNsPath string
// NetNsCreated is a flag persisted alongside the path.
// NOTE(review): presumably set when the runtime created the namespace.
NetNsCreated bool
// Endpoints lists the endpoints found in the namespace.
Endpoints []Endpoint
// NetmonPID is not included by the custom JSON (un)marshalling routines.
NetmonPID int
}
// TypedJSONEndpoint is used as an intermediate representation for
// marshalling and unmarshalling Endpoint objects.
type TypedJSONEndpoint struct {
// Type tells which concrete endpoint type Data must be decoded into.
Type EndpointType
// Data is the raw JSON encoding of the endpoint.
Data json.RawMessage
}
// MarshalJSON is the custom NetworkNamespace JSON marshalling routine.
// This is needed to properly marshal the Endpoints array: each endpoint is
// encoded individually and wrapped in a TypedJSONEndpoint carrying its type,
// so that UnmarshalJSON can rebuild the right concrete type.
//
// NetmonPID is not part of the encoded form.
//
// An error is returned if any endpoint, or the resulting structure, cannot
// be encoded.
func (n NetworkNamespace) MarshalJSON() ([]byte, error) {
	// We need a shadow structure in order to prevent json from
	// entering a recursive loop when only calling json.Marshal().
	type shadow struct {
		NetNsPath    string
		NetNsCreated bool
		Endpoints    []TypedJSONEndpoint
	}

	// Left nil when there is no endpoint, so the field encodes as null.
	var typedEndpoints []TypedJSONEndpoint
	for _, endpoint := range n.Endpoints {
		data, err := json.Marshal(endpoint)
		if err != nil {
			// Do not persist an endpoint with empty data: it could
			// never be decoded again.
			return nil, fmt.Errorf("Could not marshal %s endpoint: %s", endpoint.Type(), err)
		}

		typedEndpoints = append(typedEndpoints, TypedJSONEndpoint{
			Type: endpoint.Type(),
			Data: data,
		})
	}

	return json.Marshal(&shadow{
		NetNsPath:    n.NetNsPath,
		NetNsCreated: n.NetNsCreated,
		Endpoints:    typedEndpoints,
	})
}
// generateEndpoints rebuilds concrete Endpoint values from their typed JSON
// representation.
//
// Entries whose type is not recognised are logged and skipped. An error is
// returned as soon as the data of a recognised entry fails to decode.
func generateEndpoints(typedEndpoints []TypedJSONEndpoint) ([]Endpoint, error) {
	var endpoints []Endpoint

	for _, e := range typedEndpoints {
		var endpointInf Endpoint

		switch e.Type {
		case PhysicalEndpointType:
			endpointInf = &PhysicalEndpoint{}
		case VethEndpointType:
			endpointInf = &VethEndpoint{}
		case VhostUserEndpointType:
			endpointInf = &VhostUserEndpoint{}
		case BridgedMacvlanEndpointType:
			endpointInf = &BridgedMacvlanEndpoint{}
		case MacvtapEndpointType:
			endpointInf = &MacvtapEndpoint{}
		case TapEndpointType:
			endpointInf = &TapEndpoint{}
		case IPVlanEndpointType:
			endpointInf = &IPVlanEndpoint{}
		case TuntapEndpointType:
			endpointInf = &TuntapEndpoint{}
		default:
			networkLogger().WithField("endpoint-type", e.Type).Error("Ignoring unknown endpoint type")
			// Really ignore it: decoding into the nil interface would
			// fail and abort the whole list.
			continue
		}

		if err := json.Unmarshal(e.Data, endpointInf); err != nil {
			return nil, err
		}

		endpoints = append(endpoints, endpointInf)
		networkLogger().WithFields(logrus.Fields{
			"endpoint":      endpointInf,
			"endpoint-type": e.Type,
		}).Info("endpoint unmarshalled")
	}

	return endpoints, nil
}
// UnmarshalJSON is the custom NetworkNamespace unmarshalling routine.
// This is needed for unmarshalling the Endpoints interfaces array: the
// endpoints are first decoded as TypedJSONEndpoint values, then turned into
// concrete endpoints by generateEndpoints.
//
// A missing or null Endpoints field yields an empty endpoint list. The
// receiver is only modified once the whole input has been decoded
// successfully. NetmonPID is never touched.
func (n *NetworkNamespace) UnmarshalJSON(b []byte) error {
	var s struct {
		NetNsPath    string
		NetNsCreated bool
		Endpoints    json.RawMessage
	}

	if err := json.Unmarshal(b, &s); err != nil {
		return err
	}

	var typedEndpoints []TypedJSONEndpoint
	// The raw message is empty when the field is absent from the input;
	// decoding empty data would fail with "unexpected end of JSON input".
	if len(s.Endpoints) > 0 {
		if err := json.Unmarshal(s.Endpoints, &typedEndpoints); err != nil {
			return err
		}
	}

	endpoints, err := generateEndpoints(typedEndpoints)
	if err != nil {
		return err
	}

	n.NetNsPath = s.NetNsPath
	n.NetNsCreated = s.NetNsCreated
	n.Endpoints = endpoints

	return nil
}
// createLink adds a new link called name through netHandle. The kind of
// link (bridge, tuntap or macvtap) is chosen from the type of expectedLink,
// which also supplies a few attributes:
//   - bridge: the multicast snooping setting;
//   - tuntap: nothing (queues selects multi-queue mode when positive);
//   - macvtap: index, parent index and TxQLen (defaultQlen if not positive).
//
// Once added, the link is looked up again by name and that fresh link is
// returned, together with the file descriptors of a tuntap device (nil for
// the other kinds).
func createLink(netHandle *netlink.Handle, name string, expectedLink netlink.Link, queues int) (netlink.Link, []*os.File, error) {
	var candidate netlink.Link

	switch kind := expectedLink.Type(); kind {
	case (&netlink.Bridge{}).Type():
		candidate = &netlink.Bridge{
			LinkAttrs:         netlink.LinkAttrs{Name: name},
			MulticastSnooping: expectedLink.(*netlink.Bridge).MulticastSnooping,
		}

	case (&netlink.Tuntap{}).Type():
		tapFlags := netlink.TUNTAP_VNET_HDR
		if queues > 0 {
			tapFlags |= netlink.TUNTAP_MULTI_QUEUE_DEFAULTS
		}
		candidate = &netlink.Tuntap{
			LinkAttrs: netlink.LinkAttrs{Name: name},
			Mode:      netlink.TUNTAP_MODE_TAP,
			Queues:    queues,
			Flags:     tapFlags,
		}

	case (&netlink.Macvtap{}).Type():
		expectedAttrs := expectedLink.Attrs()
		txQLen := expectedAttrs.TxQLen
		if txQLen <= 0 {
			txQLen = defaultQlen
		}
		candidate = &netlink.Macvtap{
			Macvlan: netlink.Macvlan{
				Mode: netlink.MACVLAN_MODE_BRIDGE,
				LinkAttrs: netlink.LinkAttrs{
					Index:       expectedAttrs.Index,
					Name:        name,
					TxQLen:      txQLen,
					ParentIndex: expectedAttrs.ParentIndex,
				},
			},
		}

	default:
		return nil, nil, fmt.Errorf("Unsupported link type %s", kind)
	}

	if err := netHandle.LinkAdd(candidate); err != nil {
		return nil, nil, fmt.Errorf("LinkAdd() failed for %s name %s: %s", expectedLink.Type(), name, err)
	}

	// Only tuntap links carry file descriptors after creation.
	var fds []*os.File
	if tuntap, isTuntap := candidate.(*netlink.Tuntap); isTuntap {
		fds = tuntap.Fds
	}

	created, err := getLinkByName(netHandle, name, expectedLink)
	return created, fds, err
}
// getLinkForEndpoint looks up, through netHandle, the link named after the
// endpoint's VirtIface and checks that it has the link type matching the
// endpoint kind (veth, macvlan, ipvlan or tuntap). Other endpoint kinds are
// rejected with an error.
func getLinkForEndpoint(endpoint Endpoint, netHandle *netlink.Handle) (netlink.Link, error) {
	var expected netlink.Link

	switch endpoint.(type) {
	case *VethEndpoint:
		expected = &netlink.Veth{}
	case *BridgedMacvlanEndpoint:
		expected = &netlink.Macvlan{}
	case *IPVlanEndpoint:
		expected = &netlink.IPVlan{}
	case *TuntapEndpoint:
		expected = &netlink.Tuntap{}
	default:
		return nil, fmt.Errorf("Unexpected endpointType %s", endpoint.Type())
	}

	ifaceName := endpoint.NetworkPair().VirtIface.Name
	return getLinkByName(netHandle, ifaceName, expected)
}
// getLinkByName fetches the link called name through netHandle and verifies
// that its concrete type is the one of expectedLink. Supported expected
// types are bridge, tuntap, veth, macvtap, macvlan and ipvlan.
//
// It fails when the lookup fails, when the expected type is not supported,
// or when the link found is of another type.
func getLinkByName(netHandle *netlink.Handle, name string, expectedLink netlink.Link) (netlink.Link, error) {
	found, err := netHandle.LinkByName(name)
	if err != nil {
		return nil, fmt.Errorf("LinkByName() failed for %s name %s: %s", expectedLink.Type(), name, err)
	}

	var matches bool
	switch expectedLink.Type() {
	case (&netlink.Bridge{}).Type():
		_, matches = found.(*netlink.Bridge)
	case (&netlink.Tuntap{}).Type():
		_, matches = found.(*netlink.Tuntap)
	case (&netlink.Veth{}).Type():
		_, matches = found.(*netlink.Veth)
	case (&netlink.Macvtap{}).Type():
		_, matches = found.(*netlink.Macvtap)
	case (&netlink.Macvlan{}).Type():
		_, matches = found.(*netlink.Macvlan)
	case (&netlink.IPVlan{}).Type():
		_, matches = found.(*netlink.IPVlan)
	default:
		return nil, fmt.Errorf("Unsupported link type %s", expectedLink.Type())
	}

	if !matches {
		return nil, fmt.Errorf("Incorrect link type %s, expecting %s", found.Type(), expectedLink.Type())
	}
	return found, nil
}
// xConnectVMNetwork connects the endpoint's network pair to the VM using
// the pair's interworking model (the package default replaces
// NetXConnectDefaultModel, and is stored back into the pair).
//
// The number of queues is the hypervisor's vCPU count when it supports
// multi-queue, zero otherwise.
func xConnectVMNetwork(endpoint Endpoint, h hypervisor) error {
	netPair := endpoint.NetworkPair()

	var queues int
	if caps := h.capabilities(); caps.IsMultiQueueSupported() {
		queues = int(h.hypervisorConfig().NumVCPUs)
	}

	disableVhostNet := h.hypervisorConfig().DisableVhostNet

	if netPair.NetInterworkingModel == NetXConnectDefaultModel {
		netPair.NetInterworkingModel = DefaultNetInterworkingModel
	}

	switch model := netPair.NetInterworkingModel; model {
	case NetXConnectBridgedModel:
		return bridgeNetworkPair(endpoint, queues, disableVhostNet)
	case NetXConnectMacVtapModel:
		return tapNetworkPair(endpoint, queues, disableVhostNet)
	case NetXConnectTCFilterModel:
		return setupTCFiltering(endpoint, queues, disableVhostNet)
	case NetXConnectEnlightenedModel:
		return fmt.Errorf("Unsupported networking model")
	}

	return fmt.Errorf("Invalid internetworking model")
}
// xDisconnectVMNetwork undoes xConnectVMNetwork for the endpoint's network
// pair, dispatching on the pair's interworking model (the package default
// replaces NetXConnectDefaultModel, and is stored back into the pair).
func xDisconnectVMNetwork(endpoint Endpoint) error {
	netPair := endpoint.NetworkPair()

	if netPair.NetInterworkingModel == NetXConnectDefaultModel {
		netPair.NetInterworkingModel = DefaultNetInterworkingModel
	}

	switch model := netPair.NetInterworkingModel; model {
	case NetXConnectBridgedModel:
		return unBridgeNetworkPair(endpoint)
	case NetXConnectMacVtapModel:
		return untapNetworkPair(endpoint)
	case NetXConnectTCFilterModel:
		return removeTCFiltering(endpoint)
	case NetXConnectEnlightenedModel:
		return fmt.Errorf("Unsupported networking model")
	}

	return fmt.Errorf("Invalid internetworking model")
}
// createMacvtapFds opens the /dev/tap<linkIndex> device node queues times
// and returns the resulting files.
func createMacvtapFds(linkIndex int, queues int) ([]*os.File, error) {
	return createFds(fmt.Sprintf("/dev/tap%d", linkIndex), queues)
}
// createVhostFds opens /dev/vhost-net numFds times and returns the
// resulting files.
func createVhostFds(numFds int) ([]*os.File, error) {
	const vhostNetDevice = "/dev/vhost-net"

	return createFds(vhostNetDevice, numFds)
}
// createFds opens device read-write numFds times. On the first failure the
// files opened so far are handed to utils.CleanupFds and the error is
// returned.
func createFds(device string, numFds int) ([]*os.File, error) {
	opened := make([]*os.File, numFds)

	for idx := range opened {
		file, err := os.OpenFile(device, os.O_RDWR, defaultFilePerms)
		if err != nil {
			utils.CleanupFds(opened, idx)
			return nil, err
		}
		opened[idx] = file
	}

	return opened, nil
}
// There is a limitation in the linux kernel that prevents a macvtap/macvlan link
// from getting the correct link index when created in a network namespace:
// https://github.com/clearcontainers/runtime/issues/708
//
// Until that bug is fixed we need to pick a random non-conflicting index and try to
// create a link. If that fails, we need to try with another one.
// Also, the kernel does not check if the link id conflicts with a link id on the host,
// hence we need to offset the link id to prevent any overlap with the host indexes.
//
// Here the kernel will ensure that there is no race condition.
const hostLinkOffset = 8192 // Host should not have more than 8k interfaces
const linkRange = 0xFFFF // Mask applied to the random value: allows up to 2^16 indexes
const linkRetries = 128 // The number of times we try to find a non-conflicting index
const macvtapWorkaround = true // Enables the random-index retry loop in createMacVtap
// createMacVtap creates the macvtap link called name from the attributes
// carried by link.
//
// With the macvtap workaround enabled, the link index is drawn at random
// (offset by hostLinkOffset) and written into link's attributes before
// each attempt; up to linkRetries attempts are made and the error of the
// last one is returned if none succeeds.
func createMacVtap(netHandle *netlink.Handle, name string, link netlink.Link, queues int) (taplink netlink.Link, err error) {
	if !macvtapWorkaround {
		taplink, _, err = createLink(netHandle, name, link, queues)
		return taplink, err
	}

	rng := rand.New(rand.NewSource(time.Now().UnixNano()))

	for attempt := 0; attempt < linkRetries; attempt++ {
		link.Attrs().Index = hostLinkOffset + (rng.Int() & linkRange)

		if taplink, _, err = createLink(netHandle, name, link, queues); err == nil {
			return taplink, nil
		}
	}

	return taplink, err
}
// clearIPs removes every address of addrs from link, stopping at the first
// failure.
func clearIPs(link netlink.Link, addrs []netlink.Addr) error {
	for i := range addrs {
		current := addrs[i]
		err := netlink.AddrDel(link, &current)
		if err != nil {
			return err
		}
	}

	return nil
}
// setIPs adds every address of addrs to link, stopping at the first
// failure.
func setIPs(link netlink.Link, addrs []netlink.Addr) error {
	for i := range addrs {
		current := addrs[i]
		err := netlink.AddrAdd(link, &current)
		if err != nil {
			return err
		}
	}

	return nil
}
// tapNetworkPair connects the endpoint's container interface to the VM
// through a macvtap link created on top of it.
//
// The macvtap takes over the MAC address and MTU of the container
// interface, which in turn receives the pair's VirtIface MAC address. The
// IPv4 addresses of the container interface are saved in
// netPair.VirtIface.Addrs and removed from the link. Finally the macvtap
// (and, unless disableVhostNet is set, vhost-net) file descriptors are
// opened, queues times each, and stored in the pair.
func tapNetworkPair(endpoint Endpoint, queues int, disableVhostNet bool) error {
	netHandle, err := netlink.NewHandle()
	if err != nil {
		return err
	}
	defer netHandle.Delete()

	netPair := endpoint.NetworkPair()

	link, err := getLinkForEndpoint(endpoint, netHandle)
	if err != nil {
		return err
	}

	attrs := link.Attrs()

	// Attach the macvtap interface to the underlying container
	// interface. Also picks relevant attributes from the parent
	tapLink, err := createMacVtap(netHandle, netPair.TAPIface.Name,
		&netlink.Macvtap{
			Macvlan: netlink.Macvlan{
				LinkAttrs: netlink.LinkAttrs{
					TxQLen:      attrs.TxQLen,
					ParentIndex: attrs.Index,
				},
			},
		}, queues)
	if err != nil {
		return fmt.Errorf("Could not create TAP interface: %s", err)
	}

	// Save the veth MAC address to the TAP so that it can later be used
	// to build the hypervisor command line. This MAC address has to be
	// the one inside the VM in order to avoid any firewall issues. The
	// bridge created by the network plugin on the host actually expects
	// to see traffic from this MAC address and not another one.
	tapHardAddr := attrs.HardwareAddr
	netPair.TAPIface.HardAddr = attrs.HardwareAddr.String()

	if err := netHandle.LinkSetMTU(tapLink, attrs.MTU); err != nil {
		return fmt.Errorf("Could not set TAP MTU %d: %s", attrs.MTU, err)
	}

	hardAddr, err := net.ParseMAC(netPair.VirtIface.HardAddr)
	if err != nil {
		return err
	}
	if err := netHandle.LinkSetHardwareAddr(link, hardAddr); err != nil {
		return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s",
			netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err)
	}

	// The message names the TAP interface and the address actually
	// being programmed on it.
	if err := netHandle.LinkSetHardwareAddr(tapLink, tapHardAddr); err != nil {
		return fmt.Errorf("Could not set MAC address %s for TAP interface %s: %s",
			tapHardAddr, netPair.TAPIface.Name, err)
	}

	if err := netHandle.LinkSetUp(tapLink); err != nil {
		return fmt.Errorf("Could not enable TAP %s: %s", netPair.TAPIface.Name, err)
	}

	// Clear the IP addresses from the veth interface to prevent ARP conflict
	netPair.VirtIface.Addrs, err = netlink.AddrList(link, netlink.FAMILY_V4)
	if err != nil {
		return fmt.Errorf("Unable to obtain veth IP addresses: %s", err)
	}

	if err := clearIPs(link, netPair.VirtIface.Addrs); err != nil {
		return fmt.Errorf("Unable to clear veth IP addresses: %s", err)
	}

	if err := netHandle.LinkSetUp(link); err != nil {
		return fmt.Errorf("Could not enable veth %s: %s", netPair.VirtIface.Name, err)
	}

	// Note: The underlying interfaces need to be up prior to fd creation.
	netPair.VMFds, err = createMacvtapFds(tapLink.Attrs().Index, queues)
	if err != nil {
		return fmt.Errorf("Could not setup macvtap fds %s: %s", netPair.TAPIface.Name, err)
	}

	if !disableVhostNet {
		vhostFds, err := createVhostFds(queues)
		if err != nil {
			return fmt.Errorf("Could not setup vhost fds %s : %s", netPair.VirtIface.Name, err)
		}
		netPair.VhostFds = vhostFds
	}

	return nil
}
// bridgeNetworkPair connects the endpoint's container interface to the VM
// by creating a TAP device and a bridge, then enslaving both the TAP and
// the container interface to that bridge.
//
// The TAP file descriptors (and, unless disableVhostNet is set, queues
// vhost-net file descriptors) are stored in the network pair. The TAP
// records the container interface MAC address and takes its MTU, while the
// container interface receives the pair's VirtIface MAC address. The IPv4
// addresses of the container interface are saved in
// netPair.VirtIface.Addrs and removed from the link.
//
// NOTE(review): file descriptors already stored in the pair are not closed
// here when a later step fails.
func bridgeNetworkPair(endpoint Endpoint, queues int, disableVhostNet bool) error {
netHandle, err := netlink.NewHandle()
if err != nil {
return err
}
defer netHandle.Delete()
netPair := endpoint.NetworkPair()
// Create the TAP device; its fds are what the VM will use.
tapLink, fds, err := createLink(netHandle, netPair.TAPIface.Name, &netlink.Tuntap{}, queues)
if err != nil {
return fmt.Errorf("Could not create TAP interface: %s", err)
}
netPair.VMFds = fds
if !disableVhostNet {
vhostFds, err := createVhostFds(queues)
if err != nil {
return fmt.Errorf("Could not setup vhost fds %s : %s", netPair.VirtIface.Name, err)
}
netPair.VhostFds = vhostFds
}
var attrs *netlink.LinkAttrs
var link netlink.Link
// Look up the container-side interface of the endpoint.
link, err = getLinkForEndpoint(endpoint, netHandle)
if err != nil {
return err
}
attrs = link.Attrs()
// Save the veth MAC address to the TAP so that it can later be used
// to build the hypervisor command line. This MAC address has to be
// the one inside the VM in order to avoid any firewall issues. The
// bridge created by the network plugin on the host actually expects
// to see traffic from this MAC address and not another one.
netPair.TAPIface.HardAddr = attrs.HardwareAddr.String()
if err := netHandle.LinkSetMTU(tapLink, attrs.MTU); err != nil {
return fmt.Errorf("Could not set TAP MTU %d: %s", attrs.MTU, err)
}
// The container interface gets the MAC address generated for the pair.
hardAddr, err := net.ParseMAC(netPair.VirtIface.HardAddr)
if err != nil {
return err
}
if err := netHandle.LinkSetHardwareAddr(link, hardAddr); err != nil {
return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s",
netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err)
}
// The bridge is created with multicast snooping disabled.
mcastSnoop := false
bridgeLink, _, err := createLink(netHandle, netPair.Name, &netlink.Bridge{MulticastSnooping: &mcastSnoop}, queues)
if err != nil {
return fmt.Errorf("Could not create bridge: %s", err)
}
if err := netHandle.LinkSetMaster(tapLink, bridgeLink.(*netlink.Bridge)); err != nil {
return fmt.Errorf("Could not attach TAP %s to the bridge %s: %s",
netPair.TAPIface.Name, netPair.Name, err)
}
if err := netHandle.LinkSetUp(tapLink); err != nil {
return fmt.Errorf("Could not enable TAP %s: %s", netPair.TAPIface.Name, err)
}
if err := netHandle.LinkSetMaster(link, bridgeLink.(*netlink.Bridge)); err != nil {
return fmt.Errorf("Could not attach veth %s to the bridge %s: %s",
netPair.VirtIface.Name, netPair.Name, err)
}
// Clear the IP addresses from the veth interface to prevent ARP conflict
netPair.VirtIface.Addrs, err = netlink.AddrList(link, netlink.FAMILY_V4)
if err != nil {
return fmt.Errorf("Unable to obtain veth IP addresses: %s", err)
}
if err := clearIPs(link, netPair.VirtIface.Addrs); err != nil {
return fmt.Errorf("Unable to clear veth IP addresses: %s", err)
}
if err := netHandle.LinkSetUp(link); err != nil {
return fmt.Errorf("Could not enable veth %s: %s", netPair.VirtIface.Name, err)
}
if err := netHandle.LinkSetUp(bridgeLink); err != nil {
return fmt.Errorf("Could not enable bridge %s: %s", netPair.Name, err)
}
return nil
}
// setupTCFiltering connects the endpoint's container interface to the VM
// by creating a TAP device and redirecting traffic between the two
// interfaces with tc filters, in both directions.
//
// The TAP file descriptors (and, unless disableVhostNet is set, queues
// vhost-net file descriptors) are stored in the network pair. The TAP
// records the container interface MAC address and takes its MTU. An
// ingress qdisc is added on each interface, then a redirect filter is
// installed on each one towards the other.
//
// NOTE(review): file descriptors already stored in the pair are not closed
// here when a later step fails.
func setupTCFiltering(endpoint Endpoint, queues int, disableVhostNet bool) error {
netHandle, err := netlink.NewHandle()
if err != nil {
return err
}
defer netHandle.Delete()
netPair := endpoint.NetworkPair()
// Create the TAP device; its fds are what the VM will use.
tapLink, fds, err := createLink(netHandle, netPair.TAPIface.Name, &netlink.Tuntap{}, queues)
if err != nil {
return fmt.Errorf("Could not create TAP interface: %s", err)
}
netPair.VMFds = fds
if !disableVhostNet {
vhostFds, err := createVhostFds(queues)
if err != nil {
return fmt.Errorf("Could not setup vhost fds %s : %s", netPair.VirtIface.Name, err)
}
netPair.VhostFds = vhostFds
}
var attrs *netlink.LinkAttrs
var link netlink.Link
// Look up the container-side interface of the endpoint.
link, err = getLinkForEndpoint(endpoint, netHandle)
if err != nil {
return err
}
attrs = link.Attrs()
// Save the veth MAC address to the TAP so that it can later be used
// to build the hypervisor command line. This MAC address has to be
// the one inside the VM in order to avoid any firewall issues. The
// bridge created by the network plugin on the host actually expects
// to see traffic from this MAC address and not another one.
netPair.TAPIface.HardAddr = attrs.HardwareAddr.String()
if err := netHandle.LinkSetMTU(tapLink, attrs.MTU); err != nil {
return fmt.Errorf("Could not set TAP MTU %d: %s", attrs.MTU, err)
}
if err := netHandle.LinkSetUp(tapLink); err != nil {
return fmt.Errorf("Could not enable TAP %s: %s", netPair.TAPIface.Name, err)
}
tapAttrs := tapLink.Attrs()
// Both interfaces need an ingress qdisc for the filters to attach to.
if err := addQdiscIngress(tapAttrs.Index); err != nil {
return err
}
if err := addQdiscIngress(attrs.Index); err != nil {
return err
}
// Container interface -> TAP.
if err := addRedirectTCFilter(attrs.Index, tapAttrs.Index); err != nil {
return err
}
// TAP -> container interface.
if err := addRedirectTCFilter(tapAttrs.Index, attrs.Index); err != nil {
return err
}
return nil
}
// addQdiscIngress creates a new qdisc on "ingress" for the network interface
// with the given link index. qdiscs normally don't work on ingress, so this
// is really a special qdisc that can be seen as an "alternate root" for
// inbound packets.
// Handle for ingress qdisc defaults to "ffff:"
//
// This is equivalent to calling `tc qdisc add dev eth0 ingress`
func addQdiscIngress(index int) error {
	attrs := netlink.QdiscAttrs{
		LinkIndex: index,
		Parent:    netlink.HANDLE_INGRESS,
	}

	if err := netlink.QdiscAdd(&netlink.Ingress{QdiscAttrs: attrs}); err != nil {
		return fmt.Errorf("Failed to add qdisc for network index %d : %s", index, err)
	}

	return nil
}
// addRedirectTCFilter installs a tc u32 filter on the ingress qdisc of the
// interface with index "sourceIndex", redirecting all of its traffic to the
// interface with index "destIndex".
//
// This is equivalent to calling:
// `tc filter add dev source parent ffff: protocol all u32 match u8 0 0 action mirred egress redirect dev dest`
func addRedirectTCFilter(sourceIndex, destIndex int) error {
	redirect := &netlink.MirredAction{
		ActionAttrs: netlink.ActionAttrs{
			Action: netlink.TC_ACT_STOLEN,
		},
		MirredAction: netlink.TCA_EGRESS_REDIR,
		Ifindex:      destIndex,
	}

	u32Filter := &netlink.U32{
		FilterAttrs: netlink.FilterAttrs{
			LinkIndex: sourceIndex,
			Parent:    netlink.MakeHandle(0xffff, 0),
			Protocol:  unix.ETH_P_ALL,
		},
		Actions: []netlink.Action{redirect},
	}

	err := netlink.FilterAdd(u32Filter)
	if err != nil {
		return fmt.Errorf("Failed to add filter for index %d : %s", sourceIndex, err)
	}

	return nil
}
// removeRedirectTCFilter deletes every tc u32 filter found on the ingress
// qdisc of "link". A nil link is a no-op.
func removeRedirectTCFilter(link netlink.Link) error {
	if link == nil {
		return nil
	}

	// Handle 0xffff is used for ingress
	ingressFilters, err := netlink.FilterList(link, netlink.MakeHandle(0xffff, 0))
	if err != nil {
		return err
	}

	for _, filter := range ingressFilters {
		if u32Filter, isU32 := filter.(*netlink.U32); isU32 {
			if err := netlink.FilterDel(u32Filter); err != nil {
				return err
			}
		}
	}

	return nil
}
// removeQdiscIngress deletes every ingress qdisc found on "link". A nil
// link is a no-op.
func removeQdiscIngress(link netlink.Link) error {
	if link == nil {
		return nil
	}

	allQdiscs, err := netlink.QdiscList(link)
	if err != nil {
		return err
	}

	for _, candidate := range allQdiscs {
		if ingressQdisc, isIngress := candidate.(*netlink.Ingress); isIngress {
			if err := netlink.QdiscDel(ingressQdisc); err != nil {
				return err
			}
		}
	}

	return nil
}
// untapNetworkPair undoes tapNetworkPair: it deletes the macvtap link,
// gives the container interface back the MAC address saved in
// netPair.TAPIface.HardAddr, brings that interface down and restores the
// IP addresses saved in netPair.VirtIface.Addrs.
func untapNetworkPair(endpoint Endpoint) error {
	netHandle, err := netlink.NewHandle()
	if err != nil {
		return err
	}
	defer netHandle.Delete()

	netPair := endpoint.NetworkPair()

	tapLink, err := getLinkByName(netHandle, netPair.TAPIface.Name, &netlink.Macvtap{})
	if err != nil {
		return fmt.Errorf("Could not get TAP interface %s: %s", netPair.TAPIface.Name, err)
	}

	if err := netHandle.LinkDel(tapLink); err != nil {
		return fmt.Errorf("Could not remove TAP %s: %s", netPair.TAPIface.Name, err)
	}

	link, err := getLinkForEndpoint(endpoint, netHandle)
	if err != nil {
		return err
	}

	// The original MAC address of the container interface was saved on
	// the TAP side of the pair when the connection was set up.
	hardAddr, err := net.ParseMAC(netPair.TAPIface.HardAddr)
	if err != nil {
		return err
	}
	if err := netHandle.LinkSetHardwareAddr(link, hardAddr); err != nil {
		// Report the address that was actually being set.
		return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s",
			netPair.TAPIface.HardAddr, netPair.VirtIface.Name, err)
	}

	if err := netHandle.LinkSetDown(link); err != nil {
		return fmt.Errorf("Could not disable veth %s: %s", netPair.VirtIface.Name, err)
	}

	// Restore the IPs that were cleared
	return setIPs(link, netPair.VirtIface.Addrs)
}
// unBridgeNetworkPair undoes bridgeNetworkPair: it brings the bridge and
// the TAP down, detaches the TAP, deletes both links, then gives the
// container interface back the MAC address saved in
// netPair.TAPIface.HardAddr, brings it down, detaches it from the bridge
// and restores the IP addresses saved in netPair.VirtIface.Addrs.
func unBridgeNetworkPair(endpoint Endpoint) error {
	netHandle, err := netlink.NewHandle()
	if err != nil {
		return err
	}
	defer netHandle.Delete()

	netPair := endpoint.NetworkPair()

	tapLink, err := getLinkByName(netHandle, netPair.TAPIface.Name, &netlink.Tuntap{})
	if err != nil {
		return fmt.Errorf("Could not get TAP interface: %s", err)
	}

	bridgeLink, err := getLinkByName(netHandle, netPair.Name, &netlink.Bridge{})
	if err != nil {
		return fmt.Errorf("Could not get bridge interface: %s", err)
	}

	if err := netHandle.LinkSetDown(bridgeLink); err != nil {
		return fmt.Errorf("Could not disable bridge %s: %s", netPair.Name, err)
	}

	if err := netHandle.LinkSetDown(tapLink); err != nil {
		return fmt.Errorf("Could not disable TAP %s: %s", netPair.TAPIface.Name, err)
	}

	if err := netHandle.LinkSetNoMaster(tapLink); err != nil {
		return fmt.Errorf("Could not detach TAP %s: %s", netPair.TAPIface.Name, err)
	}

	if err := netHandle.LinkDel(bridgeLink); err != nil {
		return fmt.Errorf("Could not remove bridge %s: %s", netPair.Name, err)
	}

	if err := netHandle.LinkDel(tapLink); err != nil {
		return fmt.Errorf("Could not remove TAP %s: %s", netPair.TAPIface.Name, err)
	}

	link, err := getLinkForEndpoint(endpoint, netHandle)
	if err != nil {
		return err
	}

	// The original MAC address of the container interface was saved on
	// the TAP side of the pair when the connection was set up.
	hardAddr, err := net.ParseMAC(netPair.TAPIface.HardAddr)
	if err != nil {
		return err
	}
	if err := netHandle.LinkSetHardwareAddr(link, hardAddr); err != nil {
		// Report the address that was actually being set.
		return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s",
			netPair.TAPIface.HardAddr, netPair.VirtIface.Name, err)
	}

	if err := netHandle.LinkSetDown(link); err != nil {
		return fmt.Errorf("Could not disable veth %s: %s", netPair.VirtIface.Name, err)
	}

	if err := netHandle.LinkSetNoMaster(link); err != nil {
		return fmt.Errorf("Could not detach veth %s: %s", netPair.VirtIface.Name, err)
	}

	// Restore the IPs that were cleared
	return setIPs(link, netPair.VirtIface.Addrs)
}
// removeTCFiltering undoes setupTCFiltering: the TAP device is brought down
// and deleted, then the u32 filters and the ingress qdisc are removed from
// the endpoint's container interface, which is finally brought down.
func removeTCFiltering(endpoint Endpoint) error {
	handle, err := netlink.NewHandle()
	if err != nil {
		return err
	}
	defer handle.Delete()

	pair := endpoint.NetworkPair()

	tap, err := getLinkByName(handle, pair.TAPIface.Name, &netlink.Tuntap{})
	if err != nil {
		return fmt.Errorf("Could not get TAP interface: %s", err)
	}

	err = handle.LinkSetDown(tap)
	if err != nil {
		return fmt.Errorf("Could not disable TAP %s: %s", pair.TAPIface.Name, err)
	}

	err = handle.LinkDel(tap)
	if err != nil {
		return fmt.Errorf("Could not remove TAP %s: %s", pair.TAPIface.Name, err)
	}

	containerLink, err := getLinkForEndpoint(endpoint, handle)
	if err != nil {
		return err
	}

	// Filters go first, then the qdisc they were attached to.
	cleanups := []func(netlink.Link) error{
		removeRedirectTCFilter,
		removeQdiscIngress,
	}
	for _, cleanup := range cleanups {
		if err := cleanup(containerLink); err != nil {
			return err
		}
	}

	err = handle.LinkSetDown(containerLink)
	if err != nil {
		return fmt.Errorf("Could not disable veth %s: %s", pair.VirtIface.Name, err)
	}

	return nil
}
// createNetNS creates a new network namespace through testutils.NewNS()
// and returns its path.
func createNetNS() (string, error) {
	created, err := testutils.NewNS()
	if err == nil {
		return created.Path(), nil
	}

	return "", err
}
// doNetNS runs cb inside the network namespace found at netNSPath, then
// switches back to the namespace the caller was in.
//
// doNetNS is free from any call to a go routine, and it calls
// into runtime.LockOSThread(), meaning it won't be executed in a
// different thread than the one expected by the caller.
//
// When netNSPath is empty, cb is simply called in the current namespace
// with a nil ns.NetNS. Otherwise cb receives the handle of the target
// namespace; that handle is closed when doNetNS returns, so cb must not
// keep it.
func doNetNS(netNSPath string, cb func(ns.NetNS) error) error {
	// if netNSPath is empty, the callback function will be run in the current network namespace.
	// So skip the whole function, just call cb(). cb() needs a NetNS as arg but ignored, give it a fake one.
	if netNSPath == "" {
		var netNs ns.NetNS
		return cb(netNs)
	}

	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	currentNS, err := ns.GetCurrentNS()
	if err != nil {
		return err
	}
	defer currentNS.Close()

	targetNS, err := ns.GetNS(netNSPath)
	if err != nil {
		return err
	}
	// Release the handle on the target namespace instead of leaving it
	// open. Deferred calls run in reverse order, so this happens after
	// the switch back to the original namespace below.
	defer targetNS.Close()

	if err := targetNS.Set(); err != nil {
		return err
	}
	defer currentNS.Set()

	return cb(targetNS)
}
// deleteNetNS removes the network namespace found at netNSPath: the path is
// first checked by opening (and closing) it as a namespace, then it is
// lazily unmounted and removed from the filesystem.
func deleteNetNS(netNSPath string) error {
	handle, err := ns.GetNS(netNSPath)
	if err != nil {
		return err
	}

	if err := handle.Close(); err != nil {
		return err
	}

	if err := unix.Unmount(netNSPath, unix.MNT_DETACH); err != nil {
		return fmt.Errorf("Failed to unmount namespace %s: %v", netNSPath, err)
	}

	if err := os.RemoveAll(netNSPath); err != nil {
		return fmt.Errorf("Failed to clean up namespace %s: %v", netNSPath, err)
	}

	return nil
}
// generateInterfacesAndRoutes converts the endpoints of networkNS into the
// interface and route descriptions of the vcTypes package.
//
// Nothing is returned when the namespace has no path. For each endpoint,
// IPv6 addresses (with a warning) and loopback addresses are left out;
// routes installed by the kernel are skipped, as are routes whose
// destination or gateway is IPv6 (with a warning). Of the raw interface
// flags only IFF_NOARP is kept. The returned error is always nil.
func generateInterfacesAndRoutes(networkNS NetworkNamespace) ([]*vcTypes.Interface, []*vcTypes.Route, error) {
	if networkNS.NetNsPath == "" {
		return nil, nil, nil
	}

	var (
		routes []*vcTypes.Route
		ifaces []*vcTypes.Interface
	)

	for _, endpoint := range networkNS.Endpoints {
		var ipAddresses []*vcTypes.IPAddress

		for _, addr := range endpoint.Properties().Addrs {
			switch {
			case addr.IP.To4() == nil:
				// Skip IPv6 because not supported
				networkLogger().WithFields(logrus.Fields{
					"unsupported-address-type": "ipv6",
					"address":                  addr,
				}).Warn("unsupported address")
				continue
			case addr.IP.IsLoopback():
				// Skip localhost interface
				continue
			}

			prefixLen, _ := addr.Mask.Size()
			ipAddresses = append(ipAddresses, &vcTypes.IPAddress{
				Family:  netlink.FAMILY_V4,
				Address: addr.IP.String(),
				Mask:    fmt.Sprintf("%d", prefixLen),
			})
		}

		noarp := endpoint.Properties().Iface.RawFlags & unix.IFF_NOARP
		ifaces = append(ifaces, &vcTypes.Interface{
			IPAddresses: ipAddresses,
			Device:      endpoint.Name(),
			Name:        endpoint.Name(),
			Mtu:         uint64(endpoint.Properties().Iface.MTU),
			RawFlags:    noarp,
			HwAddr:      endpoint.HardwareAddr(),
			PciAddr:     endpoint.PciAddr(),
		})

		for _, route := range endpoint.Properties().Routes {
			if route.Protocol == unix.RTPROT_KERNEL {
				continue
			}

			converted := new(vcTypes.Route)

			if route.Dst != nil {
				converted.Dest = route.Dst.String()
				if route.Dst.IP.To4() == nil {
					// Skip IPv6 because not supported
					networkLogger().WithFields(logrus.Fields{
						"unsupported-route-type": "ipv6",
						"destination":            converted.Dest,
					}).Warn("unsupported route")
					continue
				}
			}

			if route.Gw != nil {
				gateway := route.Gw.String()
				if route.Gw.To4() == nil {
					// Skip IPv6 because it is not supported
					networkLogger().WithFields(logrus.Fields{
						"unsupported-route-type": "ipv6",
						"gateway":                gateway,
					}).Warn("unsupported route")
					continue
				}
				converted.Gateway = gateway
			}

			if route.Src != nil {
				converted.Source = route.Src.String()
			}

			converted.Device = endpoint.Name()
			converted.Scope = uint32(route.Scope)

			routes = append(routes, converted)
		}
	}

	return ifaces, routes, nil
}
// createNetworkInterfacePair assembles a NetworkInterfacePair for the
// interface at position idx.
//
// The tap side is named "br<idx>_kata" with a TAP interface "tap<idx>_kata"
// and receives a freshly generated UUID as ID. The virtual interface gets a
// random private MAC address and is named ifName, or "eth<idx>" when ifName
// is empty. An error is returned only if the MAC address cannot be generated.
func createNetworkInterfacePair(idx int, ifName string, interworkingModel NetInterworkingModel) (NetworkInterfacePair, error) {
	id := uuid.Generate().String()

	mac, err := generateRandomPrivateMacAddr()
	if err != nil {
		return NetworkInterfacePair{}, fmt.Errorf("Could not generate random mac address: %s", err)
	}

	// An explicit interface name wins over the index-derived default.
	virtName := ifName
	if virtName == "" {
		virtName = fmt.Sprintf("eth%d", idx)
	}

	return NetworkInterfacePair{
		TapInterface: TapInterface{
			ID:   id,
			Name: fmt.Sprintf("br%d_kata", idx),
			TAPIface: NetworkInterface{
				Name: fmt.Sprintf("tap%d_kata", idx),
			},
		},
		VirtIface: NetworkInterface{
			Name:     virtName,
			HardAddr: mac,
		},
		NetInterworkingModel: interworkingModel,
	}, nil
}
// generateRandomPrivateMacAddr returns a random MAC address in its textual
// form (six colon-separated hex octets).
//
// The bytes come from crypto/rand. The first octet is then adjusted so that
// the address is a local one: bit 1 (the "local" bit) is set and bit 0 is
// cleared, which yields addresses of the form
// x2-xx-xx-xx-xx-xx, x6-xx-xx-xx-xx-xx, xA-xx-xx-xx-xx-xx or xE-xx-xx-xx-xx-xx.
// An error is returned if the random source cannot be read.
func generateRandomPrivateMacAddr() (string, error) {
	var raw [6]byte

	if _, err := cryptoRand.Read(raw[:]); err != nil {
		return "", err
	}

	// Clear the lowest bit, then set the local bit of the first octet.
	raw[0] = (raw[0] &^ 0x01) | 0x02

	return net.HardwareAddr(raw[:]).String(), nil
}
// networkInfoFromLink gathers the addresses and routes (all families) that
// handle reports for link and bundles them, together with the link's
// attributes and type, into a NetworkInfo.
//
// A failure of either the address or the route query is returned as is,
// alongside a zero NetworkInfo.
func networkInfoFromLink(handle *netlink.Handle, link netlink.Link) (NetworkInfo, error) {
	linkAddrs, err := handle.AddrList(link, netlink.FAMILY_ALL)
	if err != nil {
		return NetworkInfo{}, err
	}

	linkRoutes, err := handle.RouteList(link, netlink.FAMILY_ALL)
	if err != nil {
		return NetworkInfo{}, err
	}

	info := NetworkInfo{
		Addrs:  linkAddrs,
		Routes: linkRoutes,
	}
	// The attributes are copied by value out of the link.
	info.Iface = NetlinkIface{
		LinkAttrs: *link.Attrs(),
		Type:      link.Type(),
	}

	return info, nil
}
// createEndpointsFromScan lists the links visible in the network namespace at
// networkNSPath and creates one Endpoint per usable link.
//
// Links without any address and loopback links are ignored. Each endpoint is
// created from inside the namespace (via doNetNS) using
// config.InterworkingModel, then tagged with the link's NetworkInfo. The
// index handed to createEndpoint only advances for links that produced an
// endpoint. The result is sorted by endpoint name. On any error an empty
// slice is returned together with the error.
func createEndpointsFromScan(networkNSPath string, config *NetworkConfig) ([]Endpoint, error) {
	nsHandle, err := netns.GetFromPath(networkNSPath)
	if err != nil {
		return []Endpoint{}, err
	}
	defer nsHandle.Close()

	nlHandle, err := netlink.NewHandleAt(nsHandle)
	if err != nil {
		return []Endpoint{}, err
	}
	defer nlHandle.Delete()

	links, err := nlHandle.LinkList()
	if err != nil {
		return []Endpoint{}, err
	}

	var found []Endpoint
	nextIdx := 0

	for _, link := range links {
		info, err := networkInfoFromLink(nlHandle, link)
		if err != nil {
			return []Endpoint{}, err
		}

		// Ignore unconfigured network interfaces. These are
		// either base tunnel devices that are not namespaced
		// like gre0, gretap0, sit0, ipip0, tunl0 or incorrectly
		// setup interfaces. Loopback interfaces are skipped too.
		if len(info.Addrs) == 0 || (info.Iface.Flags&net.FlagLoopback) != 0 {
			continue
		}

		var ep Endpoint
		err = doNetNS(networkNSPath, func(_ ns.NetNS) error {
			var createErr error
			ep, createErr = createEndpoint(info, nextIdx, config.InterworkingModel, link)
			return createErr
		})
		if err != nil {
			return []Endpoint{}, err
		}

		ep.SetProperties(info)
		found = append(found, ep)
		nextIdx++
	}

	sort.Slice(found, func(a, b int) bool {
		return found[a].Name() < found[b].Name()
	})

	networkLogger().WithField("endpoints", found).Info("Endpoints found after scan")

	return found, nil
}
// createEndpoint creates the Endpoint that corresponds to the interface
// described by netInfo.
//
// The kind of endpoint is selected as follows:
//   - an interface reported as physical by isPhysicalIface yields a physical
//     endpoint (no tap interface/bridge is created for it);
//   - an interface for which vhostUserSocketPath returns a socket path yields
//     a vhost-user endpoint;
//   - otherwise the interface type decides: "macvlan", "macvtap", "tap",
//     "tuntap", "veth" and "ipvlan" are handled, anything else is rejected.
//
// idx and model are forwarded to the endpoint constructors that need them.
// link is only consulted for "tuntap" interfaces, where it must be a
// *netlink.Tuntap whose Mode selects the handling; a missing or mismatching
// link is reported as an error instead of yielding a nil endpoint.
//
// On failure the returned Endpoint must not be used.
func createEndpoint(netInfo NetworkInfo, idx int, model NetInterworkingModel, link netlink.Link) (Endpoint, error) {
	var endpoint Endpoint

	// TODO: This is the incoming interface
	// based on the incoming interface we should create
	// an appropriate EndPoint based on interface type

	ifaceName := netInfo.Iface.Name

	// Check if interface is a physical interface. Do not create
	// tap interface/bridge if it is.
	isPhysical, err := isPhysicalIface(ifaceName)
	if err != nil {
		return nil, err
	}

	if isPhysical {
		networkLogger().WithField("interface", ifaceName).Info("Physical network interface found")
		endpoint, err = createPhysicalEndpoint(netInfo)
		return endpoint, err
	}

	// Check if this is a dummy interface which has a vhost-user socket associated with it
	socketPath, err := vhostUserSocketPath(netInfo)
	if err != nil {
		return nil, err
	}

	switch {
	case socketPath != "":
		networkLogger().WithField("interface", ifaceName).Info("VhostUser network interface found")
		endpoint, err = createVhostUserEndpoint(netInfo, socketPath)

	case netInfo.Iface.Type == "macvlan":
		networkLogger().Infof("macvlan interface found")
		endpoint, err = createBridgedMacvlanNetworkEndpoint(idx, ifaceName, model)

	case netInfo.Iface.Type == "macvtap":
		networkLogger().Infof("macvtap interface found")
		endpoint, err = createMacvtapNetworkEndpoint(netInfo)

	case netInfo.Iface.Type == "tap":
		networkLogger().Info("tap interface found")
		endpoint, err = createTapNetworkEndpoint(idx, ifaceName)

	case netInfo.Iface.Type == "tuntap":
		// Without a usable tuntap link the mode cannot be determined.
		// Fail explicitly rather than returning a nil endpoint with a
		// nil error, which callers would dereference.
		tuntap, ok := link.(*netlink.Tuntap)
		if !ok || tuntap == nil {
			return nil, fmt.Errorf("tuntap interface %q: link is missing or not a tuntap link (got %T)", ifaceName, link)
		}

		switch tuntap.Mode {
		case 0:
			// mount /sys/class/net to get links
			return nil, fmt.Errorf("Network device mode not determined correctly. Mount sysfs in caller")
		case 1:
			return nil, fmt.Errorf("tun networking device not yet supported")
		case 2:
			networkLogger().Info("tuntap tap interface found")
			endpoint, err = createTuntapNetworkEndpoint(idx, ifaceName, netInfo.Iface.HardwareAddr, model)
		default:
			return nil, fmt.Errorf("tuntap network %v mode unsupported", tuntap.Mode)
		}

	case netInfo.Iface.Type == "veth":
		endpoint, err = createVethNetworkEndpoint(idx, ifaceName, model)

	case netInfo.Iface.Type == "ipvlan":
		endpoint, err = createIPVlanNetworkEndpoint(idx, ifaceName)

	default:
		return nil, fmt.Errorf("Unsupported network interface: %s", netInfo.Iface.Type)
	}

	return endpoint, err
}
// Network is the virtcontainers network structure. It has no fields; the
// network operations are implemented as methods on it.
type Network struct{}
// trace starts an opentracing span called name from ctx and tags it with
// subsystem "network" and type "default". It returns the span together with
// the context returned by opentracing.StartSpanFromContext.
func (n *Network) trace(ctx context.Context, name string) (opentracing.Span, context.Context) {
	tracingSpan, spanCtx := opentracing.StartSpanFromContext(ctx, name)

	tracingSpan.SetTag("subsystem", "network")
	tracingSpan.SetTag("type", "default")

	return tracingSpan, spanCtx
}
// Run executes cb through doNetNS for the network namespace located at
// networkNSPath and returns whatever doNetNS returns. The call is traced
// with a "run" span started from a background context.
func (n *Network) Run(networkNSPath string, cb func() error) error {
	span, _ := n.trace(context.Background(), "run")
	defer span.Finish()

	// The namespace handle offered by doNetNS is not needed by cb.
	inNamespace := func(_ ns.NetNS) error {
		return cb()
	}

	return doNetNS(networkNSPath, inNamespace)
}
// Add discovers the endpoints of the network namespace named by
// config.NetNSPath and attaches each of them to hypervisor.
//
// Endpoints are attached from within the namespace (via doNetNS), using
// HotAttach when hotplug is true and Attach otherwise. The first attach
// failure aborts the operation; in that case an empty endpoint slice is
// returned with the error. On success the discovered endpoints are returned.
func (n *Network) Add(ctx context.Context, config *NetworkConfig, hypervisor hypervisor, hotplug bool) ([]Endpoint, error) {
	span, _ := n.trace(ctx, "add")
	defer span.Finish()

	endpoints, err := createEndpointsFromScan(config.NetNSPath, config)
	if err != nil {
		return endpoints, err
	}

	attachAll := func(_ ns.NetNS) error {
		for _, ep := range endpoints {
			networkLogger().WithField("endpoint-type", ep.Type()).WithField("hotplug", hotplug).Info("Attaching endpoint")

			var attachErr error
			if hotplug {
				attachErr = ep.HotAttach(hypervisor)
			} else {
				attachErr = ep.Attach(hypervisor)
			}
			if attachErr != nil {
				return attachErr
			}
		}
		return nil
	}

	if err := doNetNS(config.NetNSPath, attachAll); err != nil {
		return []Endpoint{}, err
	}

	networkLogger().Debug("Network added")

	return endpoints, nil
}
// PostAdd closes the VhostFds of every endpoint in ns that has a network
// pair. It does nothing when hotplug is true or when ns has no endpoints.
// Errors from closing the descriptors are ignored and the returned error is
// always nil.
// NOTE(review): presumably called once the descriptors are no longer needed
// by this process — confirm against the caller.
func (n *Network) PostAdd(ctx context.Context, ns *NetworkNamespace, hotplug bool) error {
	if hotplug || ns.Endpoints == nil {
		return nil
	}

	for _, ep := range ns.Endpoints {
		pair := ep.NetworkPair()
		if pair == nil {
			continue
		}

		// Ranging over a nil slice is a no-op, so no extra nil check is needed.
		for _, fd := range pair.VhostFds {
			fd.Close()
		}
	}

	return nil
}
// Remove detaches every endpoint of ns and, when the namespace was created by
// us (ns.NetNsCreated), deletes the network namespace at ns.NetNsPath.
//
// Detaching stops at the first endpoint whose Detach fails and that error is
// returned; the namespace is then left untouched. An error from deleting the
// namespace is returned as well. The "deleted" message is only logged once
// the deletion has actually succeeded.
func (n *Network) Remove(ctx context.Context, ns *NetworkNamespace, hypervisor hypervisor) error {
	span, _ := n.trace(ctx, "remove")
	defer span.Finish()

	for _, endpoint := range ns.Endpoints {
		// Detach for an endpoint should enter the network namespace
		// if required.
		networkLogger().WithField("endpoint-type", endpoint.Type()).Info("Detaching endpoint")
		if err := endpoint.Detach(ns.NetNsCreated, ns.NetNsPath); err != nil {
			return err
		}
	}

	networkLogger().Debug("Network removed")

	if !ns.NetNsCreated {
		return nil
	}

	// Report the deletion only after it has happened, so that a failure
	// is not preceded by a misleading success message.
	if err := deleteNetNS(ns.NetNsPath); err != nil {
		return err
	}
	networkLogger().Infof("Network namespace %q deleted", ns.NetNsPath)

	return nil
}