diff --git a/src/runtime/virtcontainers/acrn.go b/src/runtime/virtcontainers/acrn.go index d97cf2a0c8..244b78d66b 100644 --- a/src/runtime/virtcontainers/acrn.go +++ b/src/runtime/virtcontainers/acrn.go @@ -356,7 +356,7 @@ func (a *Acrn) setConfig(config *HypervisorConfig) error { } // CreateVM is the VM creation -func (a *Acrn) CreateVM(ctx context.Context, id string, network *Network, hypervisorConfig *HypervisorConfig) error { +func (a *Acrn) CreateVM(ctx context.Context, id string, network Network, hypervisorConfig *HypervisorConfig) error { // Save the tracing context a.ctx = ctx diff --git a/src/runtime/virtcontainers/clh.go b/src/runtime/virtcontainers/clh.go index 5b48ae56b3..89249f3de0 100644 --- a/src/runtime/virtcontainers/clh.go +++ b/src/runtime/virtcontainers/clh.go @@ -200,7 +200,7 @@ func (clh *cloudHypervisor) setConfig(config *HypervisorConfig) error { // For cloudHypervisor this call only sets the internal structure up. // The VM will be created and started through StartVM(). -func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network *Network, hypervisorConfig *HypervisorConfig) error { +func (clh *cloudHypervisor) CreateVM(ctx context.Context, id string, network Network, hypervisorConfig *HypervisorConfig) error { clh.ctx = ctx span, newCtx := katatrace.Trace(clh.ctx, clh.Logger(), "CreateVM", clhTracingTags, map[string]string{"sandbox_id": clh.id}) diff --git a/src/runtime/virtcontainers/fc.go b/src/runtime/virtcontainers/fc.go index 338a96463a..34a44692d7 100644 --- a/src/runtime/virtcontainers/fc.go +++ b/src/runtime/virtcontainers/fc.go @@ -199,7 +199,7 @@ func (fc *firecracker) setConfig(config *HypervisorConfig) error { // CreateVM For firecracker this call only sets the internal structure up. // The sandbox will be created and started through startSandbox(). -func (fc *firecracker) CreateVM(ctx context.Context, id string, network *Network, hypervisorConfig *HypervisorConfig) error { +func (fc *firecracker) CreateVM(ctx context.Context, id string, network Network, hypervisorConfig *HypervisorConfig) error { fc.ctx = ctx span, _ := katatrace.Trace(ctx, fc.Logger(), "CreateVM", fcTracingTags, map[string]string{"sandbox_id": fc.id}) diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go index d16f82d8b3..1753420164 100644 --- a/src/runtime/virtcontainers/hypervisor.go +++ b/src/runtime/virtcontainers/hypervisor.go @@ -910,7 +910,7 @@ func generateVMSocket(id string, vmStogarePath string) (interface{}, error) { // hypervisor is the virtcontainers hypervisor interface. // The default hypervisor implementation is Qemu. type Hypervisor interface { - CreateVM(ctx context.Context, id string, network *Network, hypervisorConfig *HypervisorConfig) error + CreateVM(ctx context.Context, id string, network Network, hypervisorConfig *HypervisorConfig) error StartVM(ctx context.Context, timeout int) error // If wait is set, don't actively stop the sandbox: diff --git a/src/runtime/virtcontainers/mock_hypervisor.go b/src/runtime/virtcontainers/mock_hypervisor.go index 635cbbe1d3..83e776d289 100644 --- a/src/runtime/virtcontainers/mock_hypervisor.go +++ b/src/runtime/virtcontainers/mock_hypervisor.go @@ -38,7 +38,7 @@ func (m *mockHypervisor) setConfig(config *HypervisorConfig) error { return nil } -func (m *mockHypervisor) CreateVM(ctx context.Context, id string, network *Network, hypervisorConfig *HypervisorConfig) error { +func (m *mockHypervisor) CreateVM(ctx context.Context, id string, network Network, hypervisorConfig *HypervisorConfig) error { if err := m.setConfig(hypervisorConfig); err != nil { return err } diff --git a/src/runtime/virtcontainers/network.go b/src/runtime/virtcontainers/network.go index a7ba9c6601..cce583ff5a 100644 --- a/src/runtime/virtcontainers/network.go +++ b/src/runtime/virtcontainers/network.go @@ -1,4 +1,5 @@ // Copyright (c) 2016 Intel Corporation +// Copyright (c) 2022 Apple Inc. // // SPDX-License-Identifier: Apache-2.0 // @@ -9,24 +10,16 @@ import ( "context" cryptoRand "crypto/rand" "fmt" - "math/rand" "net" "os" - "os/exec" - "runtime" - "sort" - "time" - "github.com/containernetworking/plugins/pkg/ns" "github.com/sirupsen/logrus" "github.com/vishvananda/netlink" - "github.com/vishvananda/netns" - otelTrace "go.opentelemetry.io/otel/trace" + "go.opentelemetry.io/otel/trace" "golang.org/x/sys/unix" "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" "github.com/kata-containers/kata-containers/src/runtime/pkg/uuid" - persistapi "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/persist/api" pbTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/agent/protocols" "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" ) @@ -38,6 +31,77 @@ var networkTracingTags = map[string]string{ "subsystem": "network", } +func networkLogger() *logrus.Entry { + return virtLog.WithField("subsystem", "network") +} + +var networkTrace = getNetworkTrace("") + +func getNetworkTrace(networkType EndpointType) func(ctx context.Context, name string, endpoint interface{}) (trace.Span, context.Context) { + return func(ctx context.Context, name string, endpoint interface{}) (trace.Span, context.Context) { + span, ctx := katatrace.Trace(ctx, networkLogger(), name, networkTracingTags) + if networkType != "" { + katatrace.AddTags(span, "type", string(networkType)) + } + if endpoint != nil { + katatrace.AddTags(span, "endpoint", endpoint) + } + return span, ctx + } +} + +func closeSpan(span trace.Span, err error) { + if err != nil { + katatrace.AddTags(span, "error", err.Error()) + } + span.End() +} + +// NetworkInterface defines a network interface. +type NetworkInterface struct { + Name string + HardAddr string + Addrs []netlink.Addr +} + +// TapInterface defines a tap interface +type TapInterface struct { + ID string + Name string + TAPIface NetworkInterface + VMFds []*os.File + VhostFds []*os.File +} + +// TuntapInterface defines a tap interface +type TuntapInterface struct { + Name string + TAPIface NetworkInterface +} + +// NetworkInterfacePair defines a pair between VM and virtual network interfaces. +type NetworkInterfacePair struct { + TapInterface + VirtIface NetworkInterface + NetInterworkingModel +} + +// NetlinkIface describes fully a network interface. +type NetlinkIface struct { + netlink.LinkAttrs + Type string +} + +// NetworkInfo gathers all information related to a network interface. +// It can be used to store the description of the underlying network. +type NetworkInfo struct { + Iface NetlinkIface + DNS DNSInfo + Addrs []netlink.Addr + Routes []netlink.Route + Neighbors []netlink.Neigh +} + // NetInterworkingModel defines the network model connecting // the network interface to the virtual machine. type NetInterworkingModel int @@ -116,12 +180,6 @@ func (n *NetInterworkingModel) SetModel(modelName string) error { // the container network interface var DefaultNetInterworkingModel = NetXConnectTCFilterModel -// Introduces constants related to networking -const ( - defaultFilePerms = 0600 - defaultQlen = 1500 -) - // DNSInfo describes the DNS setup related to a network interface. type DNSInfo struct { Servers []string @@ -130,51 +188,6 @@ type DNSInfo struct { Options []string } -// NetlinkIface describes fully a network interface. -type NetlinkIface struct { - netlink.LinkAttrs - Type string -} - -// NetworkInfo gathers all information related to a network interface. -// It can be used to store the description of the underlying network. -type NetworkInfo struct { - Iface NetlinkIface - DNS DNSInfo - Addrs []netlink.Addr - Routes []netlink.Route - Neighbors []netlink.Neigh -} - -// NetworkInterface defines a network interface. -type NetworkInterface struct { - Name string - HardAddr string - Addrs []netlink.Addr -} - -// TapInterface defines a tap interface -type TapInterface struct { - ID string - Name string - TAPIface NetworkInterface - VMFds []*os.File - VhostFds []*os.File -} - -// TuntapInterface defines a tap interface -type TuntapInterface struct { - Name string - TAPIface NetworkInterface -} - -// NetworkInterfacePair defines a pair between VM and virtual network interfaces. -type NetworkInterfacePair struct { - TapInterface - VirtIface NetworkInterface - NetInterworkingModel -} - // NetworkConfig is the network configuration related to a network. type NetworkConfig struct { NetworkID string @@ -183,1049 +196,45 @@ type NetworkConfig struct { DisableNewNetwork bool } -func networkLogger() *logrus.Entry { - return virtLog.WithField("subsystem", "network") +type Network interface { + // Add adds all needed networking endpoints to a sandbox's network + Add(context.Context, *Sandbox, bool) error + + // AddEndpoint adds one single endpoint to a sandbox's network. + AddEndpoint(context.Context, *Sandbox, NetworkInfo, netlink.Link, bool) (Endpoint, error) + + // Remove removes all the networking endpoints from a sandbox's network. + // If the network has been created by virtcontainers, Remove also deletes + // the network. + Remove(context.Context) error + + // RemoveEndpoint removes one single endpoint from the sandbox's network. + RemoveEndpoint(context.Context, *Sandbox, int, bool) error + + // PostAdd is a post networking endpoint addition hook. + PostAdd(context.Context, bool) error + + // Run runs a callback in a sandbox's network. + Run(context.Context, func() error) error + + // NetworkID returns a network unique identifier, + // like e,g. a networking namespace on Linux hosts. + NetworkID() string + + // NetworkCreated returns true if the network has been created + // by virtcontainers. + NetworkCreated() bool + + // Endpoints returns the list of networking endpoints attached to + // the host network. + Endpoints() []Endpoint + + // SetEndpoints sets a sandbox's network endpoints. + SetEndpoints([]Endpoint) } -// Network represents a sandbox networking setup. -type Network struct { - netNSPath string - interworkingModel NetInterworkingModel - netNSCreated bool - - eps []Endpoint - netmonPID int -} - -func NewNetwork(configs ...*NetworkConfig) (*Network, error) { - if len(configs) > 1 { - return nil, fmt.Errorf("Too many network configurations") - } - - // Empty constructor - if len(configs) == 0 { - return &Network{}, nil - } - - config := configs[0] - if config == nil { - return nil, fmt.Errorf("Missing network configuration") - } - - return &Network{ - config.NetworkID, - config.InterworkingModel, - config.NetworkCreated, - []Endpoint{}, - 0, - }, nil -} - -func LoadNetwork(netInfo persistapi.NetworkInfo) *Network { - network := &Network{ - netNSPath: netInfo.NetworkID, - netNSCreated: netInfo.NetworkCreated, - } - - for _, e := range netInfo.Endpoints { - var ep Endpoint - switch EndpointType(e.Type) { - case PhysicalEndpointType: - ep = &PhysicalEndpoint{} - case VethEndpointType: - ep = &VethEndpoint{} - case VhostUserEndpointType: - ep = &VhostUserEndpoint{} - case MacvlanEndpointType: - ep = &MacvlanEndpoint{} - case MacvtapEndpointType: - ep = &MacvtapEndpoint{} - case TapEndpointType: - ep = &TapEndpoint{} - case IPVlanEndpointType: - ep = &IPVlanEndpoint{} - default: - networkLogger().WithField("endpoint-type", e.Type).Error("unknown endpoint type") - continue - } - ep.load(e) - network.eps = append(network.eps, ep) - } - - return network -} - -var networkTrace = getNetworkTrace("") - -func (n *Network) trace(ctx context.Context, name string) (otelTrace.Span, context.Context) { - return networkTrace(ctx, name, nil) -} - -func getNetworkTrace(networkType EndpointType) func(ctx context.Context, name string, endpoint interface{}) (otelTrace.Span, context.Context) { - return func(ctx context.Context, name string, endpoint interface{}) (otelTrace.Span, context.Context) { - span, ctx := katatrace.Trace(ctx, networkLogger(), name, networkTracingTags) - if networkType != "" { - katatrace.AddTags(span, "type", string(networkType)) - } - if endpoint != nil { - katatrace.AddTags(span, "endpoint", endpoint) - } - return span, ctx - } -} - -func closeSpan(span otelTrace.Span, err error) { - if err != nil { - katatrace.AddTags(span, "error", err.Error()) - } - span.End() -} - -func (n *Network) attachEndpoint(ctx context.Context, s *Sandbox, netInfo NetworkInfo, link netlink.Link, hotplug bool) (Endpoint, error) { - var endpoint Endpoint - // TODO: This is the incoming interface - // based on the incoming interface we should create - // an appropriate EndPoint based on interface type - // This should be a switch - - // Check if interface is a physical interface. Do not create - // tap interface/bridge if it is. - isPhysical, err := isPhysicalIface(netInfo.Iface.Name) - if err != nil { - return nil, err - } - - if isPhysical { - networkLogger().WithField("interface", netInfo.Iface.Name).Info("Physical network interface found") - endpoint, err = createPhysicalEndpoint(netInfo) - } else { - var socketPath string - idx := len(n.eps) - - // Check if this is a dummy interface which has a vhost-user socket associated with it - socketPath, err = vhostUserSocketPath(netInfo) - if err != nil { - return nil, err - } - - if socketPath != "" { - networkLogger().WithField("interface", netInfo.Iface.Name).Info("VhostUser network interface found") - endpoint, err = createVhostUserEndpoint(netInfo, socketPath) - } else if netInfo.Iface.Type == "macvlan" { - networkLogger().Infof("macvlan interface found") - endpoint, err = createMacvlanNetworkEndpoint(idx, netInfo.Iface.Name, n.interworkingModel) - } else if netInfo.Iface.Type == "macvtap" { - networkLogger().Infof("macvtap interface found") - endpoint, err = createMacvtapNetworkEndpoint(netInfo) - } else if netInfo.Iface.Type == "tap" { - networkLogger().Info("tap interface found") - endpoint, err = createTapNetworkEndpoint(idx, netInfo.Iface.Name) - } else if netInfo.Iface.Type == "tuntap" { - if link != nil { - switch link.(*netlink.Tuntap).Mode { - case 0: - // mount /sys/class/net to get links - return nil, fmt.Errorf("Network device mode not determined correctly. Mount sysfs in caller") - case 1: - return nil, fmt.Errorf("tun networking device not yet supported") - case 2: - networkLogger().Info("tuntap tap interface found") - endpoint, err = createTuntapNetworkEndpoint(idx, netInfo.Iface.Name, netInfo.Iface.HardwareAddr, n.interworkingModel) - default: - return nil, fmt.Errorf("tuntap network %v mode unsupported", link.(*netlink.Tuntap).Mode) - } - } - } else if netInfo.Iface.Type == "veth" { - networkLogger().Info("veth interface found") - endpoint, err = createVethNetworkEndpoint(idx, netInfo.Iface.Name, n.interworkingModel) - } else if netInfo.Iface.Type == "ipvlan" { - networkLogger().Info("ipvlan interface found") - endpoint, err = createIPVlanNetworkEndpoint(idx, netInfo.Iface.Name) - } else { - return nil, fmt.Errorf("Unsupported network interface: %s", netInfo.Iface.Type) - } - } - - endpoint.SetProperties(netInfo) - - if err := doNetNS(n.netNSPath, func(_ ns.NetNS) error { - networkLogger().WithField("endpoint-type", endpoint.Type()).WithField("hotplug", hotplug).Info("Attaching endpoint") - if hotplug { - if err := endpoint.HotAttach(ctx, s.hypervisor); err != nil { - return err - } - } else { - if err := endpoint.Attach(ctx, s); err != nil { - return err - } - } - - if !s.hypervisor.IsRateLimiterBuiltin() { - rxRateLimiterMaxRate := s.hypervisor.HypervisorConfig().RxRateLimiterMaxRate - if rxRateLimiterMaxRate > 0 { - networkLogger().Info("Add Rx Rate Limiter") - if err := addRxRateLimiter(endpoint, rxRateLimiterMaxRate); err != nil { - return err - } - } - txRateLimiterMaxRate := s.hypervisor.HypervisorConfig().TxRateLimiterMaxRate - if txRateLimiterMaxRate > 0 { - networkLogger().Info("Add Tx Rate Limiter") - if err := addTxRateLimiter(endpoint, txRateLimiterMaxRate); err != nil { - return err - } - } - } - - return nil - }); err != nil { - return nil, err - } - - n.eps = append(n.eps, endpoint) - - return endpoint, nil -} - -func (n *Network) detachEndpoint(ctx context.Context, s *Sandbox, idx int, hotplug bool) error { - if idx > len(n.eps)-1 { - return fmt.Errorf("Enpoint index overflow") - } - - endpoint := n.eps[idx] - - if endpoint.GetRxRateLimiter() { - networkLogger().WithField("endpoint-type", endpoint.Type()).Info("Deleting rx rate limiter") - // Deleting rx rate limiter should enter the network namespace. - if err := removeRxRateLimiter(endpoint, n.netNSPath); err != nil { - return err - } - } - - if endpoint.GetTxRateLimiter() { - networkLogger().WithField("endpoint-type", endpoint.Type()).Info("Deleting tx rate limiter") - // Deleting tx rate limiter should enter the network namespace. - if err := removeTxRateLimiter(endpoint, n.netNSPath); err != nil { - return err - } - } - - // Detach for an endpoint should enter the network namespace - // if required. - networkLogger().WithField("endpoint-type", endpoint.Type()).Info("Detaching endpoint") - if hotplug && s != nil { - if err := endpoint.HotDetach(ctx, s.hypervisor, n.netNSCreated, n.netNSPath); err != nil { - return err - } - } else { - if err := endpoint.Detach(ctx, n.netNSCreated, n.netNSPath); err != nil { - return err - } - } - - n.eps = append(n.eps[:idx], n.eps[idx+1:]...) - - return nil -} - -// Scan the networking namespace through netlink and then: -// 1. Create the endpoints for the relevant interfaces found there. -// 2. Attach them to the VM. -func (n *Network) attachEndpoints(ctx context.Context, s *Sandbox, hotplug bool) error { - netnsHandle, err := netns.GetFromPath(n.netNSPath) - if err != nil { - return err - } - defer netnsHandle.Close() - - netlinkHandle, err := netlink.NewHandleAt(netnsHandle) - if err != nil { - return err - } - defer netlinkHandle.Close() - - linkList, err := netlinkHandle.LinkList() - if err != nil { - return err - } - - for _, link := range linkList { - netInfo, err := networkInfoFromLink(netlinkHandle, link) - if err != nil { - return err - } - - // Ignore unconfigured network interfaces. These are - // either base tunnel devices that are not namespaced - // like gre0, gretap0, sit0, ipip0, tunl0 or incorrectly - // setup interfaces. - if len(netInfo.Addrs) == 0 { - continue - } - - // Skip any loopback interfaces: - if (netInfo.Iface.Flags & net.FlagLoopback) != 0 { - continue - } - - _, err = n.attachEndpoint(ctx, s, netInfo, link, hotplug) - if err != nil { - return err - } - } - - sort.Slice(n.eps, func(i, j int) bool { - return n.eps[i].Name() < n.eps[j].Name() - }) - - networkLogger().WithField("endpoints", n.eps).Info("endpoints found after scan") - - return nil -} - -// Run runs a callback in the specified network namespace. -func (n *Network) Run(ctx context.Context, cb func() error) error { - span, _ := n.trace(ctx, "Run") - defer span.End() - - return doNetNS(n.netNSPath, func(_ ns.NetNS) error { - return cb() - }) -} - -// Add adds all needed interfaces inside the network namespace. -func (n *Network) Add(ctx context.Context, s *Sandbox, hotplug bool) error { - span, ctx := n.trace(ctx, "Add") - katatrace.AddTags(span, "type", n.interworkingModel.GetModel()) - defer span.End() - - if err := n.attachEndpoints(ctx, s, hotplug); err != nil { - return err - } - - katatrace.AddTags(span, "endpoints", n.eps, "hotplug", hotplug) - networkLogger().Debug("Network added") - - return nil -} - -func (n *Network) PostAdd(ctx context.Context, hotplug bool) error { - if hotplug { - return nil - } - - if n.eps == nil { - return nil - } - - endpoints := n.eps - - for _, endpoint := range endpoints { - netPair := endpoint.NetworkPair() - if netPair == nil { - continue - } - if netPair.VhostFds != nil { - for _, VhostFd := range netPair.VhostFds { - VhostFd.Close() - } - } - } - - return nil -} - -// Remove network endpoints in the network namespace. It also deletes the network -// namespace in case the namespace has been created by us. -func (n *Network) Remove(ctx context.Context) error { - span, ctx := n.trace(ctx, "Remove") - defer span.End() - - for i, _ := range n.eps { - if err := n.detachEndpoint(ctx, nil, i, false); err != nil { - return err - } - } - - networkLogger().Debug("Network removed") - - if n.netNSCreated { - networkLogger().Infof("Network namespace %q deleted", n.netNSPath) - return deleteNetNS(n.netNSPath) - } - - return nil -} - -// Network getters -func (n *Network) NetworkID() string { - return n.netNSPath -} - -func (n *Network) NetworkCreated() bool { - return n.netNSCreated -} - -func (n *Network) Endpoints() []Endpoint { - return n.eps -} - -func createLink(netHandle *netlink.Handle, name string, expectedLink netlink.Link, queues int) (netlink.Link, []*os.File, error) { - var newLink netlink.Link - var fds []*os.File - - switch expectedLink.Type() { - case (&netlink.Tuntap{}).Type(): - flags := netlink.TUNTAP_VNET_HDR - if queues > 0 { - flags |= netlink.TUNTAP_MULTI_QUEUE_DEFAULTS - } - newLink = &netlink.Tuntap{ - LinkAttrs: netlink.LinkAttrs{Name: name}, - Mode: netlink.TUNTAP_MODE_TAP, - Queues: queues, - Flags: flags, - } - case (&netlink.Macvtap{}).Type(): - qlen := expectedLink.Attrs().TxQLen - if qlen <= 0 { - qlen = defaultQlen - } - newLink = &netlink.Macvtap{ - Macvlan: netlink.Macvlan{ - Mode: netlink.MACVLAN_MODE_BRIDGE, - LinkAttrs: netlink.LinkAttrs{ - Index: expectedLink.Attrs().Index, - Name: name, - TxQLen: qlen, - ParentIndex: expectedLink.Attrs().ParentIndex, - }, - }, - } - default: - return nil, fds, fmt.Errorf("Unsupported link type %s", expectedLink.Type()) - } - - if err := netHandle.LinkAdd(newLink); err != nil { - return nil, fds, fmt.Errorf("LinkAdd() failed for %s name %s: %s", expectedLink.Type(), name, err) - } - - tuntapLink, ok := newLink.(*netlink.Tuntap) - if ok { - fds = tuntapLink.Fds - } - - newLink, err := getLinkByName(netHandle, name, expectedLink) - return newLink, fds, err -} - -func getLinkForEndpoint(endpoint Endpoint, netHandle *netlink.Handle) (netlink.Link, error) { - var link netlink.Link - - switch ep := endpoint.(type) { - case *VethEndpoint: - link = &netlink.Veth{} - case *MacvlanEndpoint: - link = &netlink.Macvlan{} - case *IPVlanEndpoint: - link = &netlink.IPVlan{} - case *TuntapEndpoint: - link = &netlink.Tuntap{} - default: - return nil, fmt.Errorf("Unexpected endpointType %s", ep.Type()) - } - - return getLinkByName(netHandle, endpoint.NetworkPair().VirtIface.Name, link) -} - -func getLinkByName(netHandle *netlink.Handle, name string, expectedLink netlink.Link) (netlink.Link, error) { - link, err := netHandle.LinkByName(name) - if err != nil { - return nil, fmt.Errorf("LinkByName() failed for %s name %s: %s", expectedLink.Type(), name, err) - } - - switch expectedLink.Type() { - case (&netlink.Tuntap{}).Type(): - if l, ok := link.(*netlink.Tuntap); ok { - return l, nil - } - case (&netlink.Veth{}).Type(): - if l, ok := link.(*netlink.Veth); ok { - return l, nil - } - case (&netlink.Macvtap{}).Type(): - if l, ok := link.(*netlink.Macvtap); ok { - return l, nil - } - case (&netlink.Macvlan{}).Type(): - if l, ok := link.(*netlink.Macvlan); ok { - return l, nil - } - case (&netlink.IPVlan{}).Type(): - if l, ok := link.(*netlink.IPVlan); ok { - return l, nil - } - default: - return nil, fmt.Errorf("Unsupported link type %s", expectedLink.Type()) - } - - return nil, fmt.Errorf("Incorrect link type %s, expecting %s", link.Type(), expectedLink.Type()) -} - -// The endpoint type should dictate how the connection needs to happen. -func xConnectVMNetwork(ctx context.Context, endpoint Endpoint, h Hypervisor) error { - var err error - - span, ctx := networkTrace(ctx, "xConnectVMNetwork", endpoint) - defer closeSpan(span, err) - - netPair := endpoint.NetworkPair() - - queues := 0 - caps := h.Capabilities(ctx) - if caps.IsMultiQueueSupported() { - queues = int(h.HypervisorConfig().NumVCPUs) - } - - disableVhostNet := h.HypervisorConfig().DisableVhostNet - - if netPair.NetInterworkingModel == NetXConnectDefaultModel { - netPair.NetInterworkingModel = DefaultNetInterworkingModel - } - - switch netPair.NetInterworkingModel { - case NetXConnectMacVtapModel: - networkLogger().Info("connect macvtap to VM network") - err = tapNetworkPair(ctx, endpoint, queues, disableVhostNet) - case NetXConnectTCFilterModel: - networkLogger().Info("connect TCFilter to VM network") - err = setupTCFiltering(ctx, endpoint, queues, disableVhostNet) - default: - err = fmt.Errorf("Invalid internetworking model") - } - return err -} - -// The endpoint type should dictate how the disconnection needs to happen. -func xDisconnectVMNetwork(ctx context.Context, endpoint Endpoint) error { - var err error - - span, ctx := networkTrace(ctx, "xDisconnectVMNetwork", endpoint) - defer closeSpan(span, err) - - netPair := endpoint.NetworkPair() - - if netPair.NetInterworkingModel == NetXConnectDefaultModel { - netPair.NetInterworkingModel = DefaultNetInterworkingModel - } - - switch netPair.NetInterworkingModel { - case NetXConnectMacVtapModel: - err = untapNetworkPair(ctx, endpoint) - case NetXConnectTCFilterModel: - err = removeTCFiltering(ctx, endpoint) - default: - err = fmt.Errorf("Invalid internetworking model") - } - return err -} - -func createMacvtapFds(linkIndex int, queues int) ([]*os.File, error) { - tapDev := fmt.Sprintf("/dev/tap%d", linkIndex) - return createFds(tapDev, queues) -} - -func createVhostFds(numFds int) ([]*os.File, error) { - vhostDev := "/dev/vhost-net" - return createFds(vhostDev, numFds) -} - -func createFds(device string, numFds int) ([]*os.File, error) { - fds := make([]*os.File, numFds) - - for i := 0; i < numFds; i++ { - f, err := os.OpenFile(device, os.O_RDWR, defaultFilePerms) - if err != nil { - utils.CleanupFds(fds, i) - return nil, err - } - fds[i] = f - } - return fds, nil -} - -// There is a limitation in the linux kernel that prevents a macvtap/macvlan link -// from getting the correct link index when created in a network namespace -// https://github.com/clearcontainers/runtime/issues/708 -// -// Till that bug is fixed we need to pick a random non conflicting index and try to -// create a link. If that fails, we need to try with another. -// All the kernel does not Check if the link id conflicts with a link id on the host -// hence we need to offset the link id to prevent any overlaps with the host index -// -// Here the kernel will ensure that there is no race condition - -const hostLinkOffset = 8192 // Host should not have more than 8k interfaces -const linkRange = 0xFFFF // This will allow upto 2^16 containers -const linkRetries = 128 // The numbers of time we try to find a non conflicting index -const macvtapWorkaround = true - -func createMacVtap(netHandle *netlink.Handle, name string, link netlink.Link, queues int) (taplink netlink.Link, err error) { - - if !macvtapWorkaround { - taplink, _, err = createLink(netHandle, name, link, queues) - return - } - - r := rand.New(rand.NewSource(time.Now().UnixNano())) - - for i := 0; i < linkRetries; i++ { - index := hostLinkOffset + (r.Int() & linkRange) - link.Attrs().Index = index - taplink, _, err = createLink(netHandle, name, link, queues) - if err == nil { - break - } - } - - return -} - -func clearIPs(link netlink.Link, addrs []netlink.Addr) error { - for _, addr := range addrs { - if err := netlink.AddrDel(link, &addr); err != nil { - return err - } - } - return nil -} - -func setIPs(link netlink.Link, addrs []netlink.Addr) error { - for _, addr := range addrs { - if err := netlink.AddrAdd(link, &addr); err != nil { - return err - } - } - return nil -} - -func tapNetworkPair(ctx context.Context, endpoint Endpoint, queues int, disableVhostNet bool) error { - span, _ := networkTrace(ctx, "tapNetworkPair", endpoint) - defer span.End() - - netHandle, err := netlink.NewHandle() - if err != nil { - return err - } - defer netHandle.Close() - - netPair := endpoint.NetworkPair() - - link, err := getLinkForEndpoint(endpoint, netHandle) - if err != nil { - return err - } - - attrs := link.Attrs() - - // Attach the macvtap interface to the underlying container - // interface. Also picks relevant attributes from the parent - tapLink, err := createMacVtap(netHandle, netPair.TAPIface.Name, - &netlink.Macvtap{ - Macvlan: netlink.Macvlan{ - LinkAttrs: netlink.LinkAttrs{ - TxQLen: attrs.TxQLen, - ParentIndex: attrs.Index, - }, - }, - }, queues) - - if err != nil { - return fmt.Errorf("Could not create TAP interface: %s", err) - } - - // Save the veth MAC address to the TAP so that it can later be used - // to build the hypervisor command line. This MAC address has to be - // the one inside the VM in order to avoid any firewall issues. The - // bridge created by the network plugin on the host actually expects - // to see traffic from this MAC address and not another one. - tapHardAddr := attrs.HardwareAddr - netPair.TAPIface.HardAddr = attrs.HardwareAddr.String() - - if err := netHandle.LinkSetMTU(tapLink, attrs.MTU); err != nil { - return fmt.Errorf("Could not set TAP MTU %d: %s", attrs.MTU, err) - } - - hardAddr, err := net.ParseMAC(netPair.VirtIface.HardAddr) - if err != nil { - return err - } - if err := netHandle.LinkSetHardwareAddr(link, hardAddr); err != nil { - return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s", - netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err) - } - - if err := netHandle.LinkSetHardwareAddr(tapLink, tapHardAddr); err != nil { - return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s", - netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err) - } - - if err := netHandle.LinkSetUp(tapLink); err != nil { - return fmt.Errorf("Could not enable TAP %s: %s", netPair.TAPIface.Name, err) - } - - // Clear the IP addresses from the veth interface to prevent ARP conflict - netPair.VirtIface.Addrs, err = netlink.AddrList(link, netlink.FAMILY_ALL) - if err != nil { - return fmt.Errorf("Unable to obtain veth IP addresses: %s", err) - } - - if err := clearIPs(link, netPair.VirtIface.Addrs); err != nil { - return fmt.Errorf("Unable to clear veth IP addresses: %s", err) - } - - if err := netHandle.LinkSetUp(link); err != nil { - return fmt.Errorf("Could not enable veth %s: %s", netPair.VirtIface.Name, err) - } - - // Note: The underlying interfaces need to be up prior to fd creation. - - netPair.VMFds, err = createMacvtapFds(tapLink.Attrs().Index, queues) - if err != nil { - return fmt.Errorf("Could not setup macvtap fds %s: %s", netPair.TAPIface, err) - } - - if !disableVhostNet { - vhostFds, err := createVhostFds(queues) - if err != nil { - return fmt.Errorf("Could not setup vhost fds %s : %s", netPair.VirtIface.Name, err) - } - netPair.VhostFds = vhostFds - } - - return nil -} - -func setupTCFiltering(ctx context.Context, endpoint Endpoint, queues int, disableVhostNet bool) error { - span, _ := networkTrace(ctx, "setupTCFiltering", endpoint) - defer span.End() - - netHandle, err := netlink.NewHandle() - if err != nil { - return err - } - defer netHandle.Close() - - netPair := endpoint.NetworkPair() - - tapLink, fds, err := createLink(netHandle, netPair.TAPIface.Name, &netlink.Tuntap{}, queues) - if err != nil { - return fmt.Errorf("Could not create TAP interface: %s", err) - } - netPair.VMFds = fds - - if !disableVhostNet { - vhostFds, err := createVhostFds(queues) - if err != nil { - return fmt.Errorf("Could not setup vhost fds %s : %s", netPair.VirtIface.Name, err) - } - netPair.VhostFds = vhostFds - } - - var attrs *netlink.LinkAttrs - var link netlink.Link - - link, err = getLinkForEndpoint(endpoint, netHandle) - if err != nil { - return err - } - - attrs = link.Attrs() - - // Save the veth MAC address to the TAP so that it can later be used - // to build the Hypervisor command line. This MAC address has to be - // the one inside the VM in order to avoid any firewall issues. The - // bridge created by the network plugin on the host actually expects - // to see traffic from this MAC address and not another one. - netPair.TAPIface.HardAddr = attrs.HardwareAddr.String() - - if err := netHandle.LinkSetMTU(tapLink, attrs.MTU); err != nil { - return fmt.Errorf("Could not set TAP MTU %d: %s", attrs.MTU, err) - } - - if err := netHandle.LinkSetUp(tapLink); err != nil { - return fmt.Errorf("Could not enable TAP %s: %s", netPair.TAPIface.Name, err) - } - - tapAttrs := tapLink.Attrs() - - if err := addQdiscIngress(tapAttrs.Index); err != nil { - return err - } - - if err := addQdiscIngress(attrs.Index); err != nil { - return err - } - - if err := addRedirectTCFilter(attrs.Index, tapAttrs.Index); err != nil { - return err - } - - if err := addRedirectTCFilter(tapAttrs.Index, attrs.Index); err != nil { - return err - } - - return nil -} - -// addQdiscIngress creates a new qdisc for network interface with the specified network index -// on "ingress". qdiscs normally don't work on ingress so this is really a special qdisc -// that you can consider an "alternate root" for inbound packets. -// Handle for ingress qdisc defaults to "ffff:" -// -// This is equivalent to calling `tc qdisc add dev eth0 ingress` -func addQdiscIngress(index int) error { - qdisc := &netlink.Ingress{ - QdiscAttrs: netlink.QdiscAttrs{ - LinkIndex: index, - Parent: netlink.HANDLE_INGRESS, - }, - } - - err := netlink.QdiscAdd(qdisc) - if err != nil { - return fmt.Errorf("Failed to add qdisc for network index %d : %s", index, err) - } - - return nil -} - -// addRedirectTCFilter adds a tc filter for device with index "sourceIndex". -// All traffic for interface with index "sourceIndex" is redirected to interface with -// index "destIndex" -// -// This is equivalent to calling: -// `tc filter add dev source parent ffff: protocol all u32 match u8 0 0 action mirred egress redirect dev dest` -func addRedirectTCFilter(sourceIndex, destIndex int) error { - filter := &netlink.U32{ - FilterAttrs: netlink.FilterAttrs{ - LinkIndex: sourceIndex, - Parent: netlink.MakeHandle(0xffff, 0), - Protocol: unix.ETH_P_ALL, - }, - Actions: []netlink.Action{ - &netlink.MirredAction{ - ActionAttrs: netlink.ActionAttrs{ - Action: netlink.TC_ACT_STOLEN, - }, - MirredAction: netlink.TCA_EGRESS_REDIR, - Ifindex: destIndex, - }, - }, - } - - if err := netlink.FilterAdd(filter); err != nil { - return fmt.Errorf("Failed to add filter for index %d : %s", sourceIndex, err) - } - - return nil -} - -// removeRedirectTCFilter removes all tc u32 filters created on ingress qdisc for "link". -func removeRedirectTCFilter(link netlink.Link) error { - if link == nil { - return nil - } - - // Handle 0xffff is used for ingress - filters, err := netlink.FilterList(link, netlink.MakeHandle(0xffff, 0)) - if err != nil { - return err - } - - for _, f := range filters { - u32, ok := f.(*netlink.U32) - - if !ok { - continue - } - - if err := netlink.FilterDel(u32); err != nil { - return err - } - } - return nil -} - -// removeQdiscIngress removes the ingress qdisc previously created on "link". -func removeQdiscIngress(link netlink.Link) error { - if link == nil { - return nil - } - - qdiscs, err := netlink.QdiscList(link) - if err != nil { - return err - } - - for _, qdisc := range qdiscs { - ingress, ok := qdisc.(*netlink.Ingress) - if !ok { - continue - } - - if err := netlink.QdiscDel(ingress); err != nil { - return err - } - } - return nil -} - -func untapNetworkPair(ctx context.Context, endpoint Endpoint) error { - span, _ := networkTrace(ctx, "untapNetworkPair", endpoint) - defer span.End() - - netHandle, err := netlink.NewHandle() - if err != nil { - return err - } - defer netHandle.Close() - - netPair := endpoint.NetworkPair() - - tapLink, err := getLinkByName(netHandle, netPair.TAPIface.Name, &netlink.Macvtap{}) - if err != nil { - return fmt.Errorf("Could not get TAP interface %s: %s", netPair.TAPIface.Name, err) - } - - if err := netHandle.LinkDel(tapLink); err != nil { - return fmt.Errorf("Could not remove TAP %s: %s", netPair.TAPIface.Name, err) - } - - link, err := getLinkForEndpoint(endpoint, netHandle) - if err != nil { - return err - } - - hardAddr, err := net.ParseMAC(netPair.TAPIface.HardAddr) - if err != nil { - return err - } - if err := netHandle.LinkSetHardwareAddr(link, hardAddr); err != nil { - return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s", - netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err) - } - - if err := netHandle.LinkSetDown(link); err != nil { - return fmt.Errorf("Could not disable veth %s: %s", netPair.VirtIface.Name, err) - } - - // Restore the IPs that were cleared - err = setIPs(link, netPair.VirtIface.Addrs) - return err -} - -func removeTCFiltering(ctx context.Context, endpoint Endpoint) error { - span, _ := networkTrace(ctx, "removeTCFiltering", endpoint) - defer span.End() - - netHandle, err := netlink.NewHandle() - if err != nil { - return err - } - defer netHandle.Close() - - netPair := endpoint.NetworkPair() - - tapLink, err := getLinkByName(netHandle, netPair.TAPIface.Name, &netlink.Tuntap{}) - if err != nil { - return fmt.Errorf("Could not get TAP interface: %s", err) - } - - if err := netHandle.LinkSetDown(tapLink); err != nil { - return fmt.Errorf("Could not disable TAP %s: %s", netPair.TAPIface.Name, err) - } - - if err := netHandle.LinkDel(tapLink); err != nil { - return fmt.Errorf("Could not remove TAP %s: %s", netPair.TAPIface.Name, err) - } - - link, err := getLinkForEndpoint(endpoint, netHandle) - if err != nil { - return err - } - - if err := removeRedirectTCFilter(link); err != nil { - return err - } - - if err := removeQdiscIngress(link); err != nil { - return err - } - - if err := netHandle.LinkSetDown(link); err != nil { - return fmt.Errorf("Could not disable veth %s: %s", netPair.VirtIface.Name, err) - } - - return nil -} - -// doNetNS is free from any call to a go routine, and it calls -// into runtime.LockOSThread(), meaning it won't be executed in a -// different thread than the one expected by the caller. -func doNetNS(netNSPath string, cb func(ns.NetNS) error) error { - // if netNSPath is empty, the callback function will be run in the current network namespace. - // So skip the whole function, just call cb(). cb() needs a NetNS as arg but ignored, give it a fake one. - if netNSPath == "" { - var netNs ns.NetNS - return cb(netNs) - } - - runtime.LockOSThread() - defer runtime.UnlockOSThread() - - currentNS, err := ns.GetCurrentNS() - if err != nil { - return err - } - defer currentNS.Close() - - targetNS, err := ns.GetNS(netNSPath) - if err != nil { - return err - } - - if err := targetNS.Set(); err != nil { - return err - } - defer currentNS.Set() - - return cb(targetNS) -} - -func deleteNetNS(netNSPath string) error { - n, err := ns.GetNS(netNSPath) - if err != nil { - return err - } - - err = n.Close() - if err != nil { - return err - } - - if err = unix.Unmount(netNSPath, unix.MNT_DETACH); err != nil { - return fmt.Errorf("Failed to unmount namespace %s: %v", netNSPath, err) - } - if err := os.RemoveAll(netNSPath); err != nil { - return fmt.Errorf("Failed to clean up namespace %s: %v", netNSPath, err) - } - - return nil -} - -func generateVCNetworkStructures(ctx context.Context, network *Network) ([]*pbTypes.Interface, []*pbTypes.Route, []*pbTypes.ARPNeighbor, error) { - if network.netNSPath == "" { +func generateVCNetworkStructures(ctx context.Context, network Network) ([]*pbTypes.Interface, []*pbTypes.Route, []*pbTypes.ARPNeighbor, error) { + if network.NetworkID() == "" { return nil, nil, nil, nil } span, _ := networkTrace(ctx, "generateVCNetworkStructures", nil) @@ -1235,7 +244,7 @@ func generateVCNetworkStructures(ctx context.Context, network *Network) ([]*pbTy var ifaces []*pbTypes.Interface var neighs []*pbTypes.ARPNeighbor - for _, endpoint := range network.eps { + for _, endpoint := range network.Endpoints() { var ipAddresses []*pbTypes.IPAddress for _, addr := range endpoint.Properties().Addrs { // Skip localhost interface @@ -1370,414 +379,3 @@ func generateRandomPrivateMacAddr() (string, error) { hardAddr := net.HardwareAddr(buf) return hardAddr.String(), nil } - -func networkInfoFromLink(handle *netlink.Handle, link netlink.Link) (NetworkInfo, error) { - addrs, err := handle.AddrList(link, netlink.FAMILY_ALL) - if err != nil { - return NetworkInfo{}, err - } - - routes, err := handle.RouteList(link, netlink.FAMILY_ALL) - if err != nil { - return NetworkInfo{}, err - } - - neighbors, err := handle.NeighList(link.Attrs().Index, netlink.FAMILY_ALL) - if err != nil { - return NetworkInfo{}, err - } - - return NetworkInfo{ - Iface: NetlinkIface{ - LinkAttrs: *(link.Attrs()), - Type: link.Type(), - }, - Addrs: addrs, - Routes: routes, - Neighbors: neighbors, - }, nil -} - -func createEndpoint(netInfo NetworkInfo, idx int, model NetInterworkingModel, link netlink.Link) (Endpoint, error) { - var endpoint Endpoint - // TODO: This is the incoming interface - // based on the incoming interface we should create - // an appropriate EndPoint based on interface type - // This should be a switch - - // Check if interface is a physical interface. Do not create - // tap interface/bridge if it is. - isPhysical, err := isPhysicalIface(netInfo.Iface.Name) - if err != nil { - return nil, err - } - - if isPhysical { - networkLogger().WithField("interface", netInfo.Iface.Name).Info("Physical network interface found") - endpoint, err = createPhysicalEndpoint(netInfo) - } else { - var socketPath string - - // Check if this is a dummy interface which has a vhost-user socket associated with it - socketPath, err = vhostUserSocketPath(netInfo) - if err != nil { - return nil, err - } - - if socketPath != "" { - networkLogger().WithField("interface", netInfo.Iface.Name).Info("VhostUser network interface found") - endpoint, err = createVhostUserEndpoint(netInfo, socketPath) - } else if netInfo.Iface.Type == "macvlan" { - networkLogger().Infof("macvlan interface found") - endpoint, err = createMacvlanNetworkEndpoint(idx, netInfo.Iface.Name, model) - } else if netInfo.Iface.Type == "macvtap" { - networkLogger().Infof("macvtap interface found") - endpoint, err = createMacvtapNetworkEndpoint(netInfo) - } else if netInfo.Iface.Type == "tap" { - networkLogger().Info("tap interface found") - endpoint, err = createTapNetworkEndpoint(idx, netInfo.Iface.Name) - } else if netInfo.Iface.Type == "tuntap" { - if link != nil { - switch link.(*netlink.Tuntap).Mode { - case 0: - // mount /sys/class/net to get links - return nil, fmt.Errorf("Network device mode not determined correctly. Mount sysfs in caller") - case 1: - return nil, fmt.Errorf("tun networking device not yet supported") - case 2: - networkLogger().Info("tuntap tap interface found") - endpoint, err = createTuntapNetworkEndpoint(idx, netInfo.Iface.Name, netInfo.Iface.HardwareAddr, model) - default: - return nil, fmt.Errorf("tuntap network %v mode unsupported", link.(*netlink.Tuntap).Mode) - } - } - } else if netInfo.Iface.Type == "veth" { - networkLogger().Info("veth interface found") - endpoint, err = createVethNetworkEndpoint(idx, netInfo.Iface.Name, model) - } else if netInfo.Iface.Type == "ipvlan" { - networkLogger().Info("ipvlan interface found") - endpoint, err = createIPVlanNetworkEndpoint(idx, netInfo.Iface.Name) - } else { - return nil, fmt.Errorf("Unsupported network interface: %s", netInfo.Iface.Type) - } - } - - return endpoint, err -} - -// func addRxRateLmiter implements tc-based rx rate limiter to control network I/O inbound traffic -// on VM level for hypervisors which don't implement rate limiter in itself, like qemu, etc. -func addRxRateLimiter(endpoint Endpoint, maxRate uint64) error { - var linkName string - switch ep := endpoint.(type) { - case *VethEndpoint, *IPVlanEndpoint, *TuntapEndpoint, *MacvlanEndpoint: - netPair := endpoint.NetworkPair() - linkName = netPair.TapInterface.TAPIface.Name - case *MacvtapEndpoint, *TapEndpoint: - linkName = endpoint.Name() - default: - return fmt.Errorf("Unsupported endpointType %s for adding rx rate limiter", ep.Type()) - } - - if err := endpoint.SetRxRateLimiter(); err != nil { - return nil - } - - link, err := netlink.LinkByName(linkName) - if err != nil { - return err - } - linkIndex := link.Attrs().Index - - return addHTBQdisc(linkIndex, maxRate) -} - -// func addHTBQdisc uses HTB(Hierarchical Token Bucket) qdisc shaping schemes to control interface traffic. -// HTB (Hierarchical Token Bucket) shapes traffic based on the Token Bucket Filter algorithm. -// A fundamental part of the HTB qdisc is the borrowing mechanism. Children classes borrow tokens -// from their parents once they have exceeded rate. A child class will continue to attempt to borrow until -// it reaches ceil. See more details in https://tldp.org/HOWTO/Traffic-Control-HOWTO/classful-qdiscs.html. -// -// * +-----+ +---------+ +-----------+ +-----------+ -// * | | | qdisc | | class 1:1 | | class 1:2 | -// * | NIC | | htb | | rate | | rate | -// * | | --> | def 1:2 | --> | ceil | -+-> | ceil | -// * +-----+ +---------+ +-----------+ | +-----------+ -// * | -// * | +-----------+ -// * | | class 1:n | -// * | | rate | -// * +-> | ceil | -// * | +-----------+ -// Seeing from pic, after the routing decision, all packets will be sent to the interface root htb qdisc. -// This root qdisc has only one direct child class (with id 1:1) which shapes the overall maximum rate -// that will be sent through interface. Then, this class has at least one default child (1:2) meant to control all -// non-privileged traffic. -// e.g. -// if we try to set VM bandwidth with maximum 10Mbit/s, we should give -// classid 1:2 rate 10Mbit/s, ceil 10Mbit/s and classid 1:1 rate 10Mbit/s, ceil 10Mbit/s. -// To-do: -// Later, if we want to do limitation on some dedicated traffic(special process running in VM), we could create -// a separate class (1:n) with guarantee throughput. -func addHTBQdisc(linkIndex int, maxRate uint64) error { - // we create a new htb root qdisc for network interface with the specified network index - qdiscAttrs := netlink.QdiscAttrs{ - LinkIndex: linkIndex, - Handle: netlink.MakeHandle(1, 0), - Parent: netlink.HANDLE_ROOT, - } - qdisc := netlink.NewHtb(qdiscAttrs) - // all non-privileged traffic go to classid 1:2. - qdisc.Defcls = 2 - - err := netlink.QdiscAdd(qdisc) - if err != nil { - return fmt.Errorf("Failed to add htb qdisc: %v", err) - } - - // root htb qdisc has only one direct child class (with id 1:1) to control overall rate. - classAttrs := netlink.ClassAttrs{ - LinkIndex: linkIndex, - Parent: netlink.MakeHandle(1, 0), - Handle: netlink.MakeHandle(1, 1), - } - htbClassAttrs := netlink.HtbClassAttrs{ - Rate: maxRate, - Ceil: maxRate, - } - class := netlink.NewHtbClass(classAttrs, htbClassAttrs) - if err := netlink.ClassAdd(class); err != nil { - return fmt.Errorf("Failed to add htb classid 1:1 : %v", err) - } - - // above class has at least one default child class(1:2) for all non-privileged traffic. - classAttrs = netlink.ClassAttrs{ - LinkIndex: linkIndex, - Parent: netlink.MakeHandle(1, 1), - Handle: netlink.MakeHandle(1, 2), - } - htbClassAttrs = netlink.HtbClassAttrs{ - Rate: maxRate, - Ceil: maxRate, - } - class = netlink.NewHtbClass(classAttrs, htbClassAttrs) - if err := netlink.ClassAdd(class); err != nil { - return fmt.Errorf("Failed to add htb class 1:2 : %v", err) - } - - return nil -} - -// The Intermediate Functional Block (ifb) pseudo network interface is an alternative -// to tc filters for handling ingress traffic, -// By redirecting interface ingress traffic to ifb and treat it as egress traffic there, -// we could do network shaping to interface inbound traffic. -func addIFBDevice() (int, error) { - // Check whether host supports ifb - if ok, err := utils.SupportsIfb(); !ok { - return -1, err - } - - netHandle, err := netlink.NewHandle() - if err != nil { - return -1, err - } - defer netHandle.Close() - - // There exists error when using netlink library to create ifb interface - cmd := exec.Command("ip", "link", "add", "dev", "ifb0", "type", "ifb") - if output, err := cmd.CombinedOutput(); err != nil { - return -1, fmt.Errorf("Could not create link ifb0: %v, error %v", output, err) - } - - ifbLink, err := netlink.LinkByName("ifb0") - if err != nil { - return -1, err - } - - if err := netHandle.LinkSetUp(ifbLink); err != nil { - return -1, fmt.Errorf("Could not enable link ifb0 %v", err) - } - - return ifbLink.Attrs().Index, nil -} - -// This is equivalent to calling: -// tc filter add dev source parent ffff: protocol all u32 match u8 0 0 action mirred egress redirect dev ifb -func addIFBRedirecting(sourceIndex int, ifbIndex int) error { - if err := addQdiscIngress(sourceIndex); err != nil { - return err - } - - if err := addRedirectTCFilter(sourceIndex, ifbIndex); err != nil { - return err - } - - return nil -} - -// addTxRateLimiter implements tx rate limiter to control network I/O outbound traffic -// on VM level for hypervisors which don't implement rate limiter in itself, like qemu, etc. -// We adopt different actions, based on different inter-networking models. -// For tcfilters as inter-networking model, we simply apply htb qdisc discipline to the virtual netpair. -// For other inter-networking models, such as macvtap, we resort to ifb, by redirecting endpoint ingress traffic -// to ifb egress, and then apply htb to ifb egress. -func addTxRateLimiter(endpoint Endpoint, maxRate uint64) error { - var netPair *NetworkInterfacePair - var linkName string - switch ep := endpoint.(type) { - case *VethEndpoint, *IPVlanEndpoint, *TuntapEndpoint, *MacvlanEndpoint: - netPair = endpoint.NetworkPair() - switch netPair.NetInterworkingModel { - // For those endpoints we've already used tcfilter as their inter-networking model, - // another ifb redirect will be redundant and confused. - case NetXConnectTCFilterModel: - linkName = netPair.VirtIface.Name - link, err := netlink.LinkByName(linkName) - if err != nil { - return err - } - return addHTBQdisc(link.Attrs().Index, maxRate) - case NetXConnectMacVtapModel, NetXConnectNoneModel: - linkName = netPair.TapInterface.TAPIface.Name - default: - return fmt.Errorf("Unsupported inter-networking model %v for adding tx rate limiter", netPair.NetInterworkingModel) - } - - case *MacvtapEndpoint, *TapEndpoint: - linkName = endpoint.Name() - default: - return fmt.Errorf("Unsupported endpointType %s for adding tx rate limiter", ep.Type()) - } - - if err := endpoint.SetTxRateLimiter(); err != nil { - return err - } - - ifbIndex, err := addIFBDevice() - if err != nil { - return err - } - - link, err := netlink.LinkByName(linkName) - if err != nil { - return err - } - - if err := addIFBRedirecting(link.Attrs().Index, ifbIndex); err != nil { - return err - } - - return addHTBQdisc(ifbIndex, maxRate) -} - -func removeHTBQdisc(linkName string) error { - link, err := netlink.LinkByName(linkName) - if err != nil { - return fmt.Errorf("Get link %s by name failed: %v", linkName, err) - } - - qdiscs, err := netlink.QdiscList(link) - if err != nil { - return err - } - - for _, qdisc := range qdiscs { - htb, ok := qdisc.(*netlink.Htb) - if !ok { - continue - } - - if err := netlink.QdiscDel(htb); err != nil { - return fmt.Errorf("Failed to delete htb qdisc on link %s: %v", linkName, err) - } - } - - return nil -} - -func removeRxRateLimiter(endpoint Endpoint, networkNSPath string) error { - var linkName string - switch ep := endpoint.(type) { - case *VethEndpoint, *IPVlanEndpoint, *TuntapEndpoint, *MacvlanEndpoint: - netPair := endpoint.NetworkPair() - linkName = netPair.TapInterface.TAPIface.Name - case *MacvtapEndpoint, *TapEndpoint: - linkName = endpoint.Name() - default: - return fmt.Errorf("Unsupported endpointType %s for removing rx rate limiter", ep.Type()) - } - - if err := doNetNS(networkNSPath, func(_ ns.NetNS) error { - return removeHTBQdisc(linkName) - }); err != nil { - return err - } - - return nil -} - -func removeTxRateLimiter(endpoint Endpoint, networkNSPath string) error { - var linkName string - switch ep := endpoint.(type) { - case *VethEndpoint, *IPVlanEndpoint, *TuntapEndpoint, *MacvlanEndpoint: - netPair := endpoint.NetworkPair() - switch netPair.NetInterworkingModel { - case NetXConnectTCFilterModel: - linkName = netPair.VirtIface.Name - if err := doNetNS(networkNSPath, func(_ ns.NetNS) error { - return removeHTBQdisc(linkName) - }); err != nil { - return err - } - return nil - case NetXConnectMacVtapModel, NetXConnectNoneModel: - linkName = netPair.TapInterface.TAPIface.Name - } - case *MacvtapEndpoint, *TapEndpoint: - linkName = endpoint.Name() - default: - return fmt.Errorf("Unsupported endpointType %s for adding tx rate limiter", ep.Type()) - } - - if err := doNetNS(networkNSPath, func(_ ns.NetNS) error { - link, err := netlink.LinkByName(linkName) - if err != nil { - return fmt.Errorf("Get link %s by name failed: %v", linkName, err) - } - - if err := removeRedirectTCFilter(link); err != nil { - return err - } - - if err := removeQdiscIngress(link); err != nil { - return err - } - - netHandle, err := netlink.NewHandle() - if err != nil { - return err - } - defer netHandle.Close() - - // remove ifb interface - ifbLink, err := netlink.LinkByName("ifb0") - if err != nil { - return fmt.Errorf("Get link %s by name failed: %v", linkName, err) - } - - if err := netHandle.LinkSetDown(ifbLink); err != nil { - return fmt.Errorf("Could not disable ifb interface: %v", err) - } - - if err := netHandle.LinkDel(ifbLink); err != nil { - return fmt.Errorf("Could not remove ifb interface: %v", err) - } - - return nil - }); err != nil { - return err - } - - return nil -} diff --git a/src/runtime/virtcontainers/network_linux.go b/src/runtime/virtcontainers/network_linux.go new file mode 100644 index 0000000000..4b6c7cb264 --- /dev/null +++ b/src/runtime/virtcontainers/network_linux.go @@ -0,0 +1,1403 @@ +// Copyright (c) 2016 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +package virtcontainers + +import ( + "context" + "fmt" + "math/rand" + "net" + "os" + "os/exec" + "runtime" + "sort" + "time" + + "github.com/containernetworking/plugins/pkg/ns" + "github.com/vishvananda/netlink" + "github.com/vishvananda/netns" + otelTrace "go.opentelemetry.io/otel/trace" + "golang.org/x/sys/unix" + + "github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace" + persistapi "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/persist/api" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils" +) + +// Introduces constants related to networking +const ( + defaultFilePerms = 0600 + defaultQlen = 1500 +) + +// LinuxNetwork represents a sandbox networking setup. +type LinuxNetwork struct { + netNSPath string + eps []Endpoint + interworkingModel NetInterworkingModel + netNSCreated bool +} + +// NewNetwork creates a new Linux Network from a NetworkConfig. +// The constructor is overloaded as it can be called with 0 or 1 +// argument. The former is used to create empty networks, mostly +// for unit testing. Passing more than one NetworkConfig pointer +// will make the constructor fail. +func NewNetwork(configs ...*NetworkConfig) (Network, error) { + if len(configs) > 1 { + return nil, fmt.Errorf("Too many network configurations") + } + + // Empty constructor + if len(configs) == 0 { + return &LinuxNetwork{}, nil + } + + config := configs[0] + if config == nil { + return nil, fmt.Errorf("Missing network configuration") + } + + return &LinuxNetwork{ + config.NetworkID, + []Endpoint{}, + config.InterworkingModel, + config.NetworkCreated, + }, nil +} + +func LoadNetwork(netInfo persistapi.NetworkInfo) Network { + network := LinuxNetwork{ + netNSPath: netInfo.NetworkID, + netNSCreated: netInfo.NetworkCreated, + } + + for _, e := range netInfo.Endpoints { + var ep Endpoint + switch EndpointType(e.Type) { + case PhysicalEndpointType: + ep = &PhysicalEndpoint{} + case VethEndpointType: + ep = &VethEndpoint{} + case VhostUserEndpointType: + ep = &VhostUserEndpoint{} + case MacvlanEndpointType: + ep = &MacvlanEndpoint{} + case MacvtapEndpointType: + ep = &MacvtapEndpoint{} + case TapEndpointType: + ep = &TapEndpoint{} + case IPVlanEndpointType: + ep = &IPVlanEndpoint{} + default: + networkLogger().WithField("endpoint-type", e.Type).Error("unknown endpoint type") + continue + } + ep.load(e) + network.eps = append(network.eps, ep) + } + + return &network +} + +func (n *LinuxNetwork) trace(ctx context.Context, name string) (otelTrace.Span, context.Context) { + return networkTrace(ctx, name, nil) +} + +func (n *LinuxNetwork) AddEndpoint(ctx context.Context, s *Sandbox, netInfo NetworkInfo, link netlink.Link, hotplug bool) (Endpoint, error) { + var endpoint Endpoint + // TODO: This is the incoming interface + // based on the incoming interface we should create + // an appropriate EndPoint based on interface type + // This should be a switch + + // Check if interface is a physical interface. Do not create + // tap interface/bridge if it is. + isPhysical, err := isPhysicalIface(netInfo.Iface.Name) + if err != nil { + return nil, err + } + + if isPhysical { + networkLogger().WithField("interface", netInfo.Iface.Name).Info("Physical network interface found") + endpoint, err = createPhysicalEndpoint(netInfo) + } else { + var socketPath string + idx := len(n.eps) + + // Check if this is a dummy interface which has a vhost-user socket associated with it + socketPath, err = vhostUserSocketPath(netInfo) + if err != nil { + return nil, err + } + + if socketPath != "" { + networkLogger().WithField("interface", netInfo.Iface.Name).Info("VhostUser network interface found") + endpoint, err = createVhostUserEndpoint(netInfo, socketPath) + } else if netInfo.Iface.Type == "macvlan" { + networkLogger().Infof("macvlan interface found") + endpoint, err = createMacvlanNetworkEndpoint(idx, netInfo.Iface.Name, n.interworkingModel) + } else if netInfo.Iface.Type == "macvtap" { + networkLogger().Infof("macvtap interface found") + endpoint, err = createMacvtapNetworkEndpoint(netInfo) + } else if netInfo.Iface.Type == "tap" { + networkLogger().Info("tap interface found") + endpoint, err = createTapNetworkEndpoint(idx, netInfo.Iface.Name) + } else if netInfo.Iface.Type == "tuntap" { + if link != nil { + switch link.(*netlink.Tuntap).Mode { + case 0: + // mount /sys/class/net to get links + return nil, fmt.Errorf("Network device mode not determined correctly. Mount sysfs in caller") + case 1: + return nil, fmt.Errorf("tun networking device not yet supported") + case 2: + networkLogger().Info("tuntap tap interface found") + endpoint, err = createTuntapNetworkEndpoint(idx, netInfo.Iface.Name, netInfo.Iface.HardwareAddr, n.interworkingModel) + default: + return nil, fmt.Errorf("tuntap network %v mode unsupported", link.(*netlink.Tuntap).Mode) + } + } + } else if netInfo.Iface.Type == "veth" { + networkLogger().Info("veth interface found") + endpoint, err = createVethNetworkEndpoint(idx, netInfo.Iface.Name, n.interworkingModel) + } else if netInfo.Iface.Type == "ipvlan" { + networkLogger().Info("ipvlan interface found") + endpoint, err = createIPVlanNetworkEndpoint(idx, netInfo.Iface.Name) + } else { + return nil, fmt.Errorf("Unsupported network interface: %s", netInfo.Iface.Type) + } + } + + if err != nil { + return nil, err + } + + endpoint.SetProperties(netInfo) + + if err := doNetNS(n.netNSPath, func(_ ns.NetNS) error { + networkLogger().WithField("endpoint-type", endpoint.Type()).WithField("hotplug", hotplug).Info("Attaching endpoint") + if hotplug { + if err := endpoint.HotAttach(ctx, s.hypervisor); err != nil { + return err + } + } else { + if err := endpoint.Attach(ctx, s); err != nil { + return err + } + } + + if !s.hypervisor.IsRateLimiterBuiltin() { + rxRateLimiterMaxRate := s.hypervisor.HypervisorConfig().RxRateLimiterMaxRate + if rxRateLimiterMaxRate > 0 { + networkLogger().Info("Add Rx Rate Limiter") + if err := addRxRateLimiter(endpoint, rxRateLimiterMaxRate); err != nil { + return err + } + } + txRateLimiterMaxRate := s.hypervisor.HypervisorConfig().TxRateLimiterMaxRate + if txRateLimiterMaxRate > 0 { + networkLogger().Info("Add Tx Rate Limiter") + if err := addTxRateLimiter(endpoint, txRateLimiterMaxRate); err != nil { + return err + } + } + } + + return nil + }); err != nil { + return nil, err + } + + n.eps = append(n.eps, endpoint) + + return endpoint, nil +} + +func (n *LinuxNetwork) RemoveEndpoint(ctx context.Context, s *Sandbox, idx int, hotplug bool) error { + if idx > len(n.eps)-1 { + return fmt.Errorf("Endpoint index overflow") + } + + endpoint := n.eps[idx] + + if endpoint.GetRxRateLimiter() { + networkLogger().WithField("endpoint-type", endpoint.Type()).Info("Deleting rx rate limiter") + // Deleting rx rate limiter should enter the network namespace. + if err := removeRxRateLimiter(endpoint, n.netNSPath); err != nil { + return err + } + } + + if endpoint.GetTxRateLimiter() { + networkLogger().WithField("endpoint-type", endpoint.Type()).Info("Deleting tx rate limiter") + // Deleting tx rate limiter should enter the network namespace. + if err := removeTxRateLimiter(endpoint, n.netNSPath); err != nil { + return err + } + } + + // Detach for an endpoint should enter the network namespace + // if required. + networkLogger().WithField("endpoint-type", endpoint.Type()).Info("Detaching endpoint") + if hotplug && s != nil { + if err := endpoint.HotDetach(ctx, s.hypervisor, n.netNSCreated, n.netNSPath); err != nil { + return err + } + } else { + if err := endpoint.Detach(ctx, n.netNSCreated, n.netNSPath); err != nil { + return err + } + } + + n.eps = append(n.eps[:idx], n.eps[idx+1:]...) + + return nil +} + +// Scan the networking namespace through netlink and then: +// 1. Create the endpoints for the relevant interfaces found there. +// 2. Attach them to the VM. +func (n *LinuxNetwork) attachEndpoints(ctx context.Context, s *Sandbox, hotplug bool) error { + netnsHandle, err := netns.GetFromPath(n.netNSPath) + if err != nil { + return err + } + defer netnsHandle.Close() + + netlinkHandle, err := netlink.NewHandleAt(netnsHandle) + if err != nil { + return err + } + defer netlinkHandle.Close() + + linkList, err := netlinkHandle.LinkList() + if err != nil { + return err + } + + for _, link := range linkList { + netInfo, err := networkInfoFromLink(netlinkHandle, link) + if err != nil { + return err + } + + // Ignore unconfigured network interfaces. These are + // either base tunnel devices that are not namespaced + // like gre0, gretap0, sit0, ipip0, tunl0 or incorrectly + // setup interfaces. + if len(netInfo.Addrs) == 0 { + continue + } + + // Skip any loopback interfaces: + if (netInfo.Iface.Flags & net.FlagLoopback) != 0 { + continue + } + + _, err = n.AddEndpoint(ctx, s, netInfo, link, hotplug) + if err != nil { + return err + } + } + + sort.Slice(n.eps, func(i, j int) bool { + return n.eps[i].Name() < n.eps[j].Name() + }) + + networkLogger().WithField("endpoints", n.eps).Info("endpoints found after scan") + + return nil +} + +// Run runs a callback in the specified network namespace. +func (n *LinuxNetwork) Run(ctx context.Context, cb func() error) error { + span, _ := n.trace(ctx, "Run") + defer span.End() + + return doNetNS(n.netNSPath, func(_ ns.NetNS) error { + return cb() + }) +} + +// Add adds all needed interfaces inside the network namespace. +func (n *LinuxNetwork) Add(ctx context.Context, s *Sandbox, hotplug bool) error { + span, ctx := n.trace(ctx, "Add") + katatrace.AddTags(span, "type", n.interworkingModel.GetModel()) + defer span.End() + + if err := n.attachEndpoints(ctx, s, hotplug); err != nil { + return err + } + + katatrace.AddTags(span, "endpoints", n.eps, "hotplug", hotplug) + networkLogger().Debug("Network added") + + return nil +} + +func (n *LinuxNetwork) PostAdd(ctx context.Context, hotplug bool) error { + if hotplug { + return nil + } + + if n.eps == nil { + return nil + } + + endpoints := n.eps + + for _, endpoint := range endpoints { + netPair := endpoint.NetworkPair() + if netPair == nil { + continue + } + if netPair.VhostFds != nil { + for _, VhostFd := range netPair.VhostFds { + VhostFd.Close() + } + } + } + + return nil +} + +// Remove network endpoints in the network namespace. It also deletes the network +// namespace in case the namespace has been created by us. +func (n *LinuxNetwork) Remove(ctx context.Context) error { + span, ctx := n.trace(ctx, "Remove") + defer span.End() + + for i := range n.eps { + if err := n.RemoveEndpoint(ctx, nil, i, false); err != nil { + return err + } + } + + networkLogger().Debug("Network removed") + + if n.netNSCreated { + networkLogger().Infof("Network namespace %q deleted", n.netNSPath) + return deleteNetNS(n.netNSPath) + } + + return nil +} + +// Network getters +func (n *LinuxNetwork) NetworkID() string { + return n.netNSPath +} + +func (n *LinuxNetwork) NetworkCreated() bool { + return n.netNSCreated +} + +func (n *LinuxNetwork) Endpoints() []Endpoint { + return n.eps +} + +func (n *LinuxNetwork) SetEndpoints(endpoints []Endpoint) { + n.eps = endpoints +} + +func createLink(netHandle *netlink.Handle, name string, expectedLink netlink.Link, queues int) (netlink.Link, []*os.File, error) { + var newLink netlink.Link + var fds []*os.File + + switch expectedLink.Type() { + case (&netlink.Tuntap{}).Type(): + flags := netlink.TUNTAP_VNET_HDR + if queues > 0 { + flags |= netlink.TUNTAP_MULTI_QUEUE_DEFAULTS + } + newLink = &netlink.Tuntap{ + LinkAttrs: netlink.LinkAttrs{Name: name}, + Mode: netlink.TUNTAP_MODE_TAP, + Queues: queues, + Flags: flags, + } + case (&netlink.Macvtap{}).Type(): + qlen := expectedLink.Attrs().TxQLen + if qlen <= 0 { + qlen = defaultQlen + } + newLink = &netlink.Macvtap{ + Macvlan: netlink.Macvlan{ + Mode: netlink.MACVLAN_MODE_BRIDGE, + LinkAttrs: netlink.LinkAttrs{ + Index: expectedLink.Attrs().Index, + Name: name, + TxQLen: qlen, + ParentIndex: expectedLink.Attrs().ParentIndex, + }, + }, + } + default: + return nil, fds, fmt.Errorf("Unsupported link type %s", expectedLink.Type()) + } + + if err := netHandle.LinkAdd(newLink); err != nil { + return nil, fds, fmt.Errorf("LinkAdd() failed for %s name %s: %s", expectedLink.Type(), name, err) + } + + tuntapLink, ok := newLink.(*netlink.Tuntap) + if ok { + fds = tuntapLink.Fds + } + + newLink, err := getLinkByName(netHandle, name, expectedLink) + return newLink, fds, err +} + +func getLinkForEndpoint(endpoint Endpoint, netHandle *netlink.Handle) (netlink.Link, error) { + var link netlink.Link + + switch ep := endpoint.(type) { + case *VethEndpoint: + link = &netlink.Veth{} + case *MacvlanEndpoint: + link = &netlink.Macvlan{} + case *IPVlanEndpoint: + link = &netlink.IPVlan{} + case *TuntapEndpoint: + link = &netlink.Tuntap{} + default: + return nil, fmt.Errorf("Unexpected endpointType %s", ep.Type()) + } + + return getLinkByName(netHandle, endpoint.NetworkPair().VirtIface.Name, link) +} + +func getLinkByName(netHandle *netlink.Handle, name string, expectedLink netlink.Link) (netlink.Link, error) { + link, err := netHandle.LinkByName(name) + if err != nil { + return nil, fmt.Errorf("LinkByName() failed for %s name %s: %s", expectedLink.Type(), name, err) + } + + switch expectedLink.Type() { + case (&netlink.Tuntap{}).Type(): + if l, ok := link.(*netlink.Tuntap); ok { + return l, nil + } + case (&netlink.Veth{}).Type(): + if l, ok := link.(*netlink.Veth); ok { + return l, nil + } + case (&netlink.Macvtap{}).Type(): + if l, ok := link.(*netlink.Macvtap); ok { + return l, nil + } + case (&netlink.Macvlan{}).Type(): + if l, ok := link.(*netlink.Macvlan); ok { + return l, nil + } + case (&netlink.IPVlan{}).Type(): + if l, ok := link.(*netlink.IPVlan); ok { + return l, nil + } + default: + return nil, fmt.Errorf("Unsupported link type %s", expectedLink.Type()) + } + + return nil, fmt.Errorf("Incorrect link type %s, expecting %s", link.Type(), expectedLink.Type()) +} + +// The endpoint type should dictate how the connection needs to happen. +func xConnectVMNetwork(ctx context.Context, endpoint Endpoint, h Hypervisor) error { + var err error + + span, ctx := networkTrace(ctx, "xConnectVMNetwork", endpoint) + defer closeSpan(span, err) + + netPair := endpoint.NetworkPair() + + queues := 0 + caps := h.Capabilities(ctx) + if caps.IsMultiQueueSupported() { + queues = int(h.HypervisorConfig().NumVCPUs) + } + + disableVhostNet := h.HypervisorConfig().DisableVhostNet + + if netPair.NetInterworkingModel == NetXConnectDefaultModel { + netPair.NetInterworkingModel = DefaultNetInterworkingModel + } + + switch netPair.NetInterworkingModel { + case NetXConnectMacVtapModel: + networkLogger().Info("connect macvtap to VM network") + err = tapNetworkPair(ctx, endpoint, queues, disableVhostNet) + case NetXConnectTCFilterModel: + networkLogger().Info("connect TCFilter to VM network") + err = setupTCFiltering(ctx, endpoint, queues, disableVhostNet) + default: + err = fmt.Errorf("Invalid internetworking model") + } + return err +} + +// The endpoint type should dictate how the disconnection needs to happen. +func xDisconnectVMNetwork(ctx context.Context, endpoint Endpoint) error { + var err error + + span, ctx := networkTrace(ctx, "xDisconnectVMNetwork", endpoint) + defer closeSpan(span, err) + + netPair := endpoint.NetworkPair() + + if netPair.NetInterworkingModel == NetXConnectDefaultModel { + netPair.NetInterworkingModel = DefaultNetInterworkingModel + } + + switch netPair.NetInterworkingModel { + case NetXConnectMacVtapModel: + err = untapNetworkPair(ctx, endpoint) + case NetXConnectTCFilterModel: + err = removeTCFiltering(ctx, endpoint) + default: + err = fmt.Errorf("Invalid internetworking model") + } + return err +} + +func createMacvtapFds(linkIndex int, queues int) ([]*os.File, error) { + tapDev := fmt.Sprintf("/dev/tap%d", linkIndex) + return createFds(tapDev, queues) +} + +func createVhostFds(numFds int) ([]*os.File, error) { + vhostDev := "/dev/vhost-net" + return createFds(vhostDev, numFds) +} + +func createFds(device string, numFds int) ([]*os.File, error) { + fds := make([]*os.File, numFds) + + for i := 0; i < numFds; i++ { + f, err := os.OpenFile(device, os.O_RDWR, defaultFilePerms) + if err != nil { + utils.CleanupFds(fds, i) + return nil, err + } + fds[i] = f + } + return fds, nil +} + +// There is a limitation in the linux kernel that prevents a macvtap/macvlan link +// from getting the correct link index when created in a network namespace +// https://github.com/clearcontainers/runtime/issues/708 +// +// Till that bug is fixed we need to pick a random non conflicting index and try to +// create a link. If that fails, we need to try with another. +// All the kernel does not Check if the link id conflicts with a link id on the host +// hence we need to offset the link id to prevent any overlaps with the host index +// +// Here the kernel will ensure that there is no race condition + +const hostLinkOffset = 8192 // Host should not have more than 8k interfaces +const linkRange = 0xFFFF // This will allow upto 2^16 containers +const linkRetries = 128 // The numbers of time we try to find a non conflicting index +const macvtapWorkaround = true + +func createMacVtap(netHandle *netlink.Handle, name string, link netlink.Link, queues int) (taplink netlink.Link, err error) { + + if !macvtapWorkaround { + taplink, _, err = createLink(netHandle, name, link, queues) + return + } + + r := rand.New(rand.NewSource(time.Now().UnixNano())) + + for i := 0; i < linkRetries; i++ { + index := hostLinkOffset + (r.Int() & linkRange) + link.Attrs().Index = index + taplink, _, err = createLink(netHandle, name, link, queues) + if err == nil { + break + } + } + + return +} + +func clearIPs(link netlink.Link, addrs []netlink.Addr) error { + for _, addr := range addrs { + if err := netlink.AddrDel(link, &addr); err != nil { + return err + } + } + return nil +} + +func setIPs(link netlink.Link, addrs []netlink.Addr) error { + for _, addr := range addrs { + if err := netlink.AddrAdd(link, &addr); err != nil { + return err + } + } + return nil +} + +func tapNetworkPair(ctx context.Context, endpoint Endpoint, queues int, disableVhostNet bool) error { + span, _ := networkTrace(ctx, "tapNetworkPair", endpoint) + defer span.End() + + netHandle, err := netlink.NewHandle() + if err != nil { + return err + } + defer netHandle.Close() + + netPair := endpoint.NetworkPair() + + link, err := getLinkForEndpoint(endpoint, netHandle) + if err != nil { + return err + } + + attrs := link.Attrs() + + // Attach the macvtap interface to the underlying container + // interface. Also picks relevant attributes from the parent + tapLink, err := createMacVtap(netHandle, netPair.TAPIface.Name, + &netlink.Macvtap{ + Macvlan: netlink.Macvlan{ + LinkAttrs: netlink.LinkAttrs{ + TxQLen: attrs.TxQLen, + ParentIndex: attrs.Index, + }, + }, + }, queues) + + if err != nil { + return fmt.Errorf("Could not create TAP interface: %s", err) + } + + // Save the veth MAC address to the TAP so that it can later be used + // to build the hypervisor command line. This MAC address has to be + // the one inside the VM in order to avoid any firewall issues. The + // bridge created by the network plugin on the host actually expects + // to see traffic from this MAC address and not another one. + tapHardAddr := attrs.HardwareAddr + netPair.TAPIface.HardAddr = attrs.HardwareAddr.String() + + if err := netHandle.LinkSetMTU(tapLink, attrs.MTU); err != nil { + return fmt.Errorf("Could not set TAP MTU %d: %s", attrs.MTU, err) + } + + hardAddr, err := net.ParseMAC(netPair.VirtIface.HardAddr) + if err != nil { + return err + } + if err := netHandle.LinkSetHardwareAddr(link, hardAddr); err != nil { + return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s", + netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err) + } + + if err := netHandle.LinkSetHardwareAddr(tapLink, tapHardAddr); err != nil { + return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s", + netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err) + } + + if err := netHandle.LinkSetUp(tapLink); err != nil { + return fmt.Errorf("Could not enable TAP %s: %s", netPair.TAPIface.Name, err) + } + + // Clear the IP addresses from the veth interface to prevent ARP conflict + netPair.VirtIface.Addrs, err = netlink.AddrList(link, netlink.FAMILY_ALL) + if err != nil { + return fmt.Errorf("Unable to obtain veth IP addresses: %s", err) + } + + if err := clearIPs(link, netPair.VirtIface.Addrs); err != nil { + return fmt.Errorf("Unable to clear veth IP addresses: %s", err) + } + + if err := netHandle.LinkSetUp(link); err != nil { + return fmt.Errorf("Could not enable veth %s: %s", netPair.VirtIface.Name, err) + } + + // Note: The underlying interfaces need to be up prior to fd creation. + + netPair.VMFds, err = createMacvtapFds(tapLink.Attrs().Index, queues) + if err != nil { + return fmt.Errorf("Could not setup macvtap fds %s: %s", netPair.TAPIface, err) + } + + if !disableVhostNet { + vhostFds, err := createVhostFds(queues) + if err != nil { + return fmt.Errorf("Could not setup vhost fds %s : %s", netPair.VirtIface.Name, err) + } + netPair.VhostFds = vhostFds + } + + return nil +} + +func setupTCFiltering(ctx context.Context, endpoint Endpoint, queues int, disableVhostNet bool) error { + span, _ := networkTrace(ctx, "setupTCFiltering", endpoint) + defer span.End() + + netHandle, err := netlink.NewHandle() + if err != nil { + return err + } + defer netHandle.Close() + + netPair := endpoint.NetworkPair() + + tapLink, fds, err := createLink(netHandle, netPair.TAPIface.Name, &netlink.Tuntap{}, queues) + if err != nil { + return fmt.Errorf("Could not create TAP interface: %s", err) + } + netPair.VMFds = fds + + if !disableVhostNet { + vhostFds, err := createVhostFds(queues) + if err != nil { + return fmt.Errorf("Could not setup vhost fds %s : %s", netPair.VirtIface.Name, err) + } + netPair.VhostFds = vhostFds + } + + var attrs *netlink.LinkAttrs + var link netlink.Link + + link, err = getLinkForEndpoint(endpoint, netHandle) + if err != nil { + return err + } + + attrs = link.Attrs() + + // Save the veth MAC address to the TAP so that it can later be used + // to build the Hypervisor command line. This MAC address has to be + // the one inside the VM in order to avoid any firewall issues. The + // bridge created by the network plugin on the host actually expects + // to see traffic from this MAC address and not another one. + netPair.TAPIface.HardAddr = attrs.HardwareAddr.String() + + if err := netHandle.LinkSetMTU(tapLink, attrs.MTU); err != nil { + return fmt.Errorf("Could not set TAP MTU %d: %s", attrs.MTU, err) + } + + if err := netHandle.LinkSetUp(tapLink); err != nil { + return fmt.Errorf("Could not enable TAP %s: %s", netPair.TAPIface.Name, err) + } + + tapAttrs := tapLink.Attrs() + + if err := addQdiscIngress(tapAttrs.Index); err != nil { + return err + } + + if err := addQdiscIngress(attrs.Index); err != nil { + return err + } + + if err := addRedirectTCFilter(attrs.Index, tapAttrs.Index); err != nil { + return err + } + + if err := addRedirectTCFilter(tapAttrs.Index, attrs.Index); err != nil { + return err + } + + return nil +} + +// addQdiscIngress creates a new qdisc for network interface with the specified network index +// on "ingress". qdiscs normally don't work on ingress so this is really a special qdisc +// that you can consider an "alternate root" for inbound packets. +// Handle for ingress qdisc defaults to "ffff:" +// +// This is equivalent to calling `tc qdisc add dev eth0 ingress` +func addQdiscIngress(index int) error { + qdisc := &netlink.Ingress{ + QdiscAttrs: netlink.QdiscAttrs{ + LinkIndex: index, + Parent: netlink.HANDLE_INGRESS, + }, + } + + err := netlink.QdiscAdd(qdisc) + if err != nil { + return fmt.Errorf("Failed to add qdisc for network index %d : %s", index, err) + } + + return nil +} + +// addRedirectTCFilter adds a tc filter for device with index "sourceIndex". +// All traffic for interface with index "sourceIndex" is redirected to interface with +// index "destIndex" +// +// This is equivalent to calling: +// `tc filter add dev source parent ffff: protocol all u32 match u8 0 0 action mirred egress redirect dev dest` +func addRedirectTCFilter(sourceIndex, destIndex int) error { + filter := &netlink.U32{ + FilterAttrs: netlink.FilterAttrs{ + LinkIndex: sourceIndex, + Parent: netlink.MakeHandle(0xffff, 0), + Protocol: unix.ETH_P_ALL, + }, + Actions: []netlink.Action{ + &netlink.MirredAction{ + ActionAttrs: netlink.ActionAttrs{ + Action: netlink.TC_ACT_STOLEN, + }, + MirredAction: netlink.TCA_EGRESS_REDIR, + Ifindex: destIndex, + }, + }, + } + + if err := netlink.FilterAdd(filter); err != nil { + return fmt.Errorf("Failed to add filter for index %d : %s", sourceIndex, err) + } + + return nil +} + +// removeRedirectTCFilter removes all tc u32 filters created on ingress qdisc for "link". +func removeRedirectTCFilter(link netlink.Link) error { + if link == nil { + return nil + } + + // Handle 0xffff is used for ingress + filters, err := netlink.FilterList(link, netlink.MakeHandle(0xffff, 0)) + if err != nil { + return err + } + + for _, f := range filters { + u32, ok := f.(*netlink.U32) + + if !ok { + continue + } + + if err := netlink.FilterDel(u32); err != nil { + return err + } + } + return nil +} + +// removeQdiscIngress removes the ingress qdisc previously created on "link". +func removeQdiscIngress(link netlink.Link) error { + if link == nil { + return nil + } + + qdiscs, err := netlink.QdiscList(link) + if err != nil { + return err + } + + for _, qdisc := range qdiscs { + ingress, ok := qdisc.(*netlink.Ingress) + if !ok { + continue + } + + if err := netlink.QdiscDel(ingress); err != nil { + return err + } + } + return nil +} + +func untapNetworkPair(ctx context.Context, endpoint Endpoint) error { + span, _ := networkTrace(ctx, "untapNetworkPair", endpoint) + defer span.End() + + netHandle, err := netlink.NewHandle() + if err != nil { + return err + } + defer netHandle.Close() + + netPair := endpoint.NetworkPair() + + tapLink, err := getLinkByName(netHandle, netPair.TAPIface.Name, &netlink.Macvtap{}) + if err != nil { + return fmt.Errorf("Could not get TAP interface %s: %s", netPair.TAPIface.Name, err) + } + + if err := netHandle.LinkDel(tapLink); err != nil { + return fmt.Errorf("Could not remove TAP %s: %s", netPair.TAPIface.Name, err) + } + + link, err := getLinkForEndpoint(endpoint, netHandle) + if err != nil { + return err + } + + hardAddr, err := net.ParseMAC(netPair.TAPIface.HardAddr) + if err != nil { + return err + } + if err := netHandle.LinkSetHardwareAddr(link, hardAddr); err != nil { + return fmt.Errorf("Could not set MAC address %s for veth interface %s: %s", + netPair.VirtIface.HardAddr, netPair.VirtIface.Name, err) + } + + if err := netHandle.LinkSetDown(link); err != nil { + return fmt.Errorf("Could not disable veth %s: %s", netPair.VirtIface.Name, err) + } + + // Restore the IPs that were cleared + err = setIPs(link, netPair.VirtIface.Addrs) + return err +} + +func removeTCFiltering(ctx context.Context, endpoint Endpoint) error { + span, _ := networkTrace(ctx, "removeTCFiltering", endpoint) + defer span.End() + + netHandle, err := netlink.NewHandle() + if err != nil { + return err + } + defer netHandle.Close() + + netPair := endpoint.NetworkPair() + + tapLink, err := getLinkByName(netHandle, netPair.TAPIface.Name, &netlink.Tuntap{}) + if err != nil { + return fmt.Errorf("Could not get TAP interface: %s", err) + } + + if err := netHandle.LinkSetDown(tapLink); err != nil { + return fmt.Errorf("Could not disable TAP %s: %s", netPair.TAPIface.Name, err) + } + + if err := netHandle.LinkDel(tapLink); err != nil { + return fmt.Errorf("Could not remove TAP %s: %s", netPair.TAPIface.Name, err) + } + + link, err := getLinkForEndpoint(endpoint, netHandle) + if err != nil { + return err + } + + if err := removeRedirectTCFilter(link); err != nil { + return err + } + + if err := removeQdiscIngress(link); err != nil { + return err + } + + if err := netHandle.LinkSetDown(link); err != nil { + return fmt.Errorf("Could not disable veth %s: %s", netPair.VirtIface.Name, err) + } + + return nil +} + +// doNetNS is free from any call to a go routine, and it calls +// into runtime.LockOSThread(), meaning it won't be executed in a +// different thread than the one expected by the caller. +func doNetNS(netNSPath string, cb func(ns.NetNS) error) error { + // if netNSPath is empty, the callback function will be run in the current network namespace. + // So skip the whole function, just call cb(). cb() needs a NetNS as arg but ignored, give it a fake one. + if netNSPath == "" { + var netNs ns.NetNS + return cb(netNs) + } + + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + currentNS, err := ns.GetCurrentNS() + if err != nil { + return err + } + defer currentNS.Close() + + targetNS, err := ns.GetNS(netNSPath) + if err != nil { + return err + } + + if err := targetNS.Set(); err != nil { + return err + } + defer currentNS.Set() + + return cb(targetNS) +} + +func deleteNetNS(netNSPath string) error { + n, err := ns.GetNS(netNSPath) + if err != nil { + return err + } + + err = n.Close() + if err != nil { + return err + } + + if err = unix.Unmount(netNSPath, unix.MNT_DETACH); err != nil { + return fmt.Errorf("Failed to unmount namespace %s: %v", netNSPath, err) + } + if err := os.RemoveAll(netNSPath); err != nil { + return fmt.Errorf("Failed to clean up namespace %s: %v", netNSPath, err) + } + + return nil +} + +func networkInfoFromLink(handle *netlink.Handle, link netlink.Link) (NetworkInfo, error) { + addrs, err := handle.AddrList(link, netlink.FAMILY_ALL) + if err != nil { + return NetworkInfo{}, err + } + + routes, err := handle.RouteList(link, netlink.FAMILY_ALL) + if err != nil { + return NetworkInfo{}, err + } + + neighbors, err := handle.NeighList(link.Attrs().Index, netlink.FAMILY_ALL) + if err != nil { + return NetworkInfo{}, err + } + + return NetworkInfo{ + Iface: NetlinkIface{ + LinkAttrs: *(link.Attrs()), + Type: link.Type(), + }, + Addrs: addrs, + Routes: routes, + Neighbors: neighbors, + }, nil +} + +// func addRxRateLmiter implements tc-based rx rate limiter to control network I/O inbound traffic +// on VM level for hypervisors which don't implement rate limiter in itself, like qemu, etc. +func addRxRateLimiter(endpoint Endpoint, maxRate uint64) error { + var linkName string + switch ep := endpoint.(type) { + case *VethEndpoint, *IPVlanEndpoint, *TuntapEndpoint, *MacvlanEndpoint: + netPair := endpoint.NetworkPair() + linkName = netPair.TapInterface.TAPIface.Name + case *MacvtapEndpoint, *TapEndpoint: + linkName = endpoint.Name() + default: + return fmt.Errorf("Unsupported endpointType %s for adding rx rate limiter", ep.Type()) + } + + if err := endpoint.SetRxRateLimiter(); err != nil { + return nil + } + + link, err := netlink.LinkByName(linkName) + if err != nil { + return err + } + linkIndex := link.Attrs().Index + + return addHTBQdisc(linkIndex, maxRate) +} + +// func addHTBQdisc uses HTB(Hierarchical Token Bucket) qdisc shaping schemes to control interface traffic. +// HTB (Hierarchical Token Bucket) shapes traffic based on the Token Bucket Filter algorithm. +// A fundamental part of the HTB qdisc is the borrowing mechanism. Children classes borrow tokens +// from their parents once they have exceeded rate. A child class will continue to attempt to borrow until +// it reaches ceil. See more details in https://tldp.org/HOWTO/Traffic-Control-HOWTO/classful-qdiscs.html. +// +// * +-----+ +---------+ +-----------+ +-----------+ +// * | | | qdisc | | class 1:1 | | class 1:2 | +// * | NIC | | htb | | rate | | rate | +// * | | --> | def 1:2 | --> | ceil | -+-> | ceil | +// * +-----+ +---------+ +-----------+ | +-----------+ +// * | +// * | +-----------+ +// * | | class 1:n | +// * | | rate | +// * +-> | ceil | +// * | +-----------+ +// Seeing from pic, after the routing decision, all packets will be sent to the interface root htb qdisc. +// This root qdisc has only one direct child class (with id 1:1) which shapes the overall maximum rate +// that will be sent through interface. Then, this class has at least one default child (1:2) meant to control all +// non-privileged traffic. +// e.g. +// if we try to set VM bandwidth with maximum 10Mbit/s, we should give +// classid 1:2 rate 10Mbit/s, ceil 10Mbit/s and classid 1:1 rate 10Mbit/s, ceil 10Mbit/s. +// To-do: +// Later, if we want to do limitation on some dedicated traffic(special process running in VM), we could create +// a separate class (1:n) with guarantee throughput. +func addHTBQdisc(linkIndex int, maxRate uint64) error { + // we create a new htb root qdisc for network interface with the specified network index + qdiscAttrs := netlink.QdiscAttrs{ + LinkIndex: linkIndex, + Handle: netlink.MakeHandle(1, 0), + Parent: netlink.HANDLE_ROOT, + } + qdisc := netlink.NewHtb(qdiscAttrs) + // all non-privileged traffic go to classid 1:2. + qdisc.Defcls = 2 + + err := netlink.QdiscAdd(qdisc) + if err != nil { + return fmt.Errorf("Failed to add htb qdisc: %v", err) + } + + // root htb qdisc has only one direct child class (with id 1:1) to control overall rate. + classAttrs := netlink.ClassAttrs{ + LinkIndex: linkIndex, + Parent: netlink.MakeHandle(1, 0), + Handle: netlink.MakeHandle(1, 1), + } + htbClassAttrs := netlink.HtbClassAttrs{ + Rate: maxRate, + Ceil: maxRate, + } + class := netlink.NewHtbClass(classAttrs, htbClassAttrs) + if err := netlink.ClassAdd(class); err != nil { + return fmt.Errorf("Failed to add htb classid 1:1 : %v", err) + } + + // above class has at least one default child class(1:2) for all non-privileged traffic. + classAttrs = netlink.ClassAttrs{ + LinkIndex: linkIndex, + Parent: netlink.MakeHandle(1, 1), + Handle: netlink.MakeHandle(1, 2), + } + htbClassAttrs = netlink.HtbClassAttrs{ + Rate: maxRate, + Ceil: maxRate, + } + class = netlink.NewHtbClass(classAttrs, htbClassAttrs) + if err := netlink.ClassAdd(class); err != nil { + return fmt.Errorf("Failed to add htb class 1:2 : %v", err) + } + + return nil +} + +// The Intermediate Functional Block (ifb) pseudo network interface is an alternative +// to tc filters for handling ingress traffic, +// By redirecting interface ingress traffic to ifb and treat it as egress traffic there, +// we could do network shaping to interface inbound traffic. +func addIFBDevice() (int, error) { + // Check whether host supports ifb + if ok, err := utils.SupportsIfb(); !ok { + return -1, err + } + + netHandle, err := netlink.NewHandle() + if err != nil { + return -1, err + } + defer netHandle.Close() + + // There exists error when using netlink library to create ifb interface + cmd := exec.Command("ip", "link", "add", "dev", "ifb0", "type", "ifb") + if output, err := cmd.CombinedOutput(); err != nil { + return -1, fmt.Errorf("Could not create link ifb0: %v, error %v", output, err) + } + + ifbLink, err := netlink.LinkByName("ifb0") + if err != nil { + return -1, err + } + + if err := netHandle.LinkSetUp(ifbLink); err != nil { + return -1, fmt.Errorf("Could not enable link ifb0 %v", err) + } + + return ifbLink.Attrs().Index, nil +} + +// This is equivalent to calling: +// tc filter add dev source parent ffff: protocol all u32 match u8 0 0 action mirred egress redirect dev ifb +func addIFBRedirecting(sourceIndex int, ifbIndex int) error { + if err := addQdiscIngress(sourceIndex); err != nil { + return err + } + + if err := addRedirectTCFilter(sourceIndex, ifbIndex); err != nil { + return err + } + + return nil +} + +// addTxRateLimiter implements tx rate limiter to control network I/O outbound traffic +// on VM level for hypervisors which don't implement rate limiter in itself, like qemu, etc. +// We adopt different actions, based on different inter-networking models. +// For tcfilters as inter-networking model, we simply apply htb qdisc discipline to the virtual netpair. +// For other inter-networking models, such as macvtap, we resort to ifb, by redirecting endpoint ingress traffic +// to ifb egress, and then apply htb to ifb egress. +func addTxRateLimiter(endpoint Endpoint, maxRate uint64) error { + var netPair *NetworkInterfacePair + var linkName string + switch ep := endpoint.(type) { + case *VethEndpoint, *IPVlanEndpoint, *TuntapEndpoint, *MacvlanEndpoint: + netPair = endpoint.NetworkPair() + switch netPair.NetInterworkingModel { + // For those endpoints we've already used tcfilter as their inter-networking model, + // another ifb redirect will be redundant and confused. + case NetXConnectTCFilterModel: + linkName = netPair.VirtIface.Name + link, err := netlink.LinkByName(linkName) + if err != nil { + return err + } + return addHTBQdisc(link.Attrs().Index, maxRate) + case NetXConnectMacVtapModel, NetXConnectNoneModel: + linkName = netPair.TapInterface.TAPIface.Name + default: + return fmt.Errorf("Unsupported inter-networking model %v for adding tx rate limiter", netPair.NetInterworkingModel) + } + + case *MacvtapEndpoint, *TapEndpoint: + linkName = endpoint.Name() + default: + return fmt.Errorf("Unsupported endpointType %s for adding tx rate limiter", ep.Type()) + } + + if err := endpoint.SetTxRateLimiter(); err != nil { + return err + } + + ifbIndex, err := addIFBDevice() + if err != nil { + return err + } + + link, err := netlink.LinkByName(linkName) + if err != nil { + return err + } + + if err := addIFBRedirecting(link.Attrs().Index, ifbIndex); err != nil { + return err + } + + return addHTBQdisc(ifbIndex, maxRate) +} + +func removeHTBQdisc(linkName string) error { + link, err := netlink.LinkByName(linkName) + if err != nil { + return fmt.Errorf("Get link %s by name failed: %v", linkName, err) + } + + qdiscs, err := netlink.QdiscList(link) + if err != nil { + return err + } + + for _, qdisc := range qdiscs { + htb, ok := qdisc.(*netlink.Htb) + if !ok { + continue + } + + if err := netlink.QdiscDel(htb); err != nil { + return fmt.Errorf("Failed to delete htb qdisc on link %s: %v", linkName, err) + } + } + + return nil +} + +func removeRxRateLimiter(endpoint Endpoint, networkNSPath string) error { + var linkName string + switch ep := endpoint.(type) { + case *VethEndpoint, *IPVlanEndpoint, *TuntapEndpoint, *MacvlanEndpoint: + netPair := endpoint.NetworkPair() + linkName = netPair.TapInterface.TAPIface.Name + case *MacvtapEndpoint, *TapEndpoint: + linkName = endpoint.Name() + default: + return fmt.Errorf("Unsupported endpointType %s for removing rx rate limiter", ep.Type()) + } + + if err := doNetNS(networkNSPath, func(_ ns.NetNS) error { + return removeHTBQdisc(linkName) + }); err != nil { + return err + } + + return nil +} + +func removeTxRateLimiter(endpoint Endpoint, networkNSPath string) error { + var linkName string + switch ep := endpoint.(type) { + case *VethEndpoint, *IPVlanEndpoint, *TuntapEndpoint, *MacvlanEndpoint: + netPair := endpoint.NetworkPair() + switch netPair.NetInterworkingModel { + case NetXConnectTCFilterModel: + linkName = netPair.VirtIface.Name + if err := doNetNS(networkNSPath, func(_ ns.NetNS) error { + return removeHTBQdisc(linkName) + }); err != nil { + return err + } + return nil + case NetXConnectMacVtapModel, NetXConnectNoneModel: + linkName = netPair.TapInterface.TAPIface.Name + } + case *MacvtapEndpoint, *TapEndpoint: + linkName = endpoint.Name() + default: + return fmt.Errorf("Unsupported endpointType %s for adding tx rate limiter", ep.Type()) + } + + if err := doNetNS(networkNSPath, func(_ ns.NetNS) error { + link, err := netlink.LinkByName(linkName) + if err != nil { + return fmt.Errorf("Get link %s by name failed: %v", linkName, err) + } + + if err := removeRedirectTCFilter(link); err != nil { + return err + } + + if err := removeQdiscIngress(link); err != nil { + return err + } + + netHandle, err := netlink.NewHandle() + if err != nil { + return err + } + defer netHandle.Close() + + // remove ifb interface + ifbLink, err := netlink.LinkByName("ifb0") + if err != nil { + return fmt.Errorf("Get link %s by name failed: %v", linkName, err) + } + + if err := netHandle.LinkSetDown(ifbLink); err != nil { + return fmt.Errorf("Could not disable ifb interface: %v", err) + } + + if err := netHandle.LinkDel(ifbLink); err != nil { + return fmt.Errorf("Could not remove ifb interface: %v", err) + } + + return nil + }); err != nil { + return err + } + + return nil +} diff --git a/src/runtime/virtcontainers/network_test.go b/src/runtime/virtcontainers/network_test.go index ac0476f334..a0dfc0a70c 100644 --- a/src/runtime/virtcontainers/network_test.go +++ b/src/runtime/virtcontainers/network_test.go @@ -75,7 +75,7 @@ func TestGenerateInterfacesAndRoutes(t *testing.T) { nns, err := NewNetwork(&NetworkConfig{NetworkID: "foobar", NetworkCreated: true}) assert.Nil(t, err) - nns.eps = endpoints + nns.SetEndpoints(endpoints) resInterfaces, resRoutes, resNeighs, err := generateVCNetworkStructures(context.Background(), nns) diff --git a/src/runtime/virtcontainers/qemu.go b/src/runtime/virtcontainers/qemu.go index 9f5fdb2959..7b11287cd1 100644 --- a/src/runtime/virtcontainers/qemu.go +++ b/src/runtime/virtcontainers/qemu.go @@ -468,7 +468,7 @@ func (q *qemu) setConfig(config *HypervisorConfig) error { } // CreateVM is the Hypervisor VM creation implementation for govmmQemu. -func (q *qemu) CreateVM(ctx context.Context, id string, network *Network, hypervisorConfig *HypervisorConfig) error { +func (q *qemu) CreateVM(ctx context.Context, id string, network Network, hypervisorConfig *HypervisorConfig) error { // Save the tracing context q.ctx = ctx diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go index 20d815cea7..f9341e70e0 100644 --- a/src/runtime/virtcontainers/sandbox.go +++ b/src/runtime/virtcontainers/sandbox.go @@ -207,7 +207,7 @@ type Sandbox struct { id string - network *Network + network Network state types.SandboxState @@ -873,14 +873,14 @@ func (s *Sandbox) AddInterface(ctx context.Context, inf *pbTypes.Interface) (*pb return nil, err } - endpoint, err := s.network.attachEndpoint(ctx, s, netInfo, nil, true) + endpoint, err := s.network.AddEndpoint(ctx, s, netInfo, nil, true) if err != nil { return nil, err } defer func() { if err != nil { - if errDetach := s.network.detachEndpoint(ctx, s, len(s.network.Endpoints())-1, true); err != nil { + if errDetach := s.network.RemoveEndpoint(ctx, s, len(s.network.Endpoints())-1, true); err != nil { s.Logger().WithField("endpoint-type", endpoint.Type()).WithError(errDetach).Error("rollback hot attaching endpoint failed") } } @@ -906,7 +906,7 @@ func (s *Sandbox) RemoveInterface(ctx context.Context, inf *pbTypes.Interface) ( for i, endpoint := range s.network.Endpoints() { if endpoint.HardwareAddr() == inf.HwAddr { s.Logger().WithField("endpoint-type", endpoint.Type()).Info("Hot detaching endpoint") - if err := s.network.detachEndpoint(ctx, s, i, true); err != nil { + if err := s.network.RemoveEndpoint(ctx, s, i, true); err != nil { return inf, err }