diff --git a/pkg/proxy/nftables/proxier.go b/pkg/proxy/nftables/proxier.go index 0ba4a437363..12c5e2fbd96 100644 --- a/pkg/proxy/nftables/proxier.go +++ b/pkg/proxy/nftables/proxier.go @@ -29,6 +29,7 @@ import ( "encoding/base32" "fmt" "net" + "os" "reflect" "strconv" "strings" @@ -40,6 +41,7 @@ import ( discovery "k8s.io/api/discovery/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/version" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/tools/events" "k8s.io/klog/v2" @@ -50,6 +52,7 @@ import ( "k8s.io/kubernetes/pkg/proxy/metrics" proxyutil "k8s.io/kubernetes/pkg/proxy/util" "k8s.io/kubernetes/pkg/util/async" + utilkernel "k8s.io/kubernetes/pkg/util/kernel" utilexec "k8s.io/utils/exec" netutils "k8s.io/utils/net" "k8s.io/utils/ptr" @@ -215,6 +218,11 @@ func NewProxier(ctx context.Context, ) (*Proxier, error) { logger := klog.LoggerWithValues(klog.FromContext(ctx), "ipFamily", ipFamily) + nft, err := getNFTablesInterface(ipFamily) + if err != nil { + return nil, err + } + if initOnly { logger.Info("System initialized and --init-only specified") return nil, nil @@ -229,17 +237,6 @@ func NewProxier(ctx context.Context, serviceHealthServer := healthcheck.NewServiceHealthServer(hostname, recorder, nodePortAddresses, healthzServer) - var nftablesFamily knftables.Family - if ipFamily == v1.IPv4Protocol { - nftablesFamily = knftables.IPv4Family - } else { - nftablesFamily = knftables.IPv6Family - } - nft, err := knftables.New(nftablesFamily, kubeProxyTable) - if err != nil { - return nil, err - } - proxier := &Proxier{ ipFamily: ipFamily, svcPortMap: make(proxy.ServicePortMap), @@ -270,6 +267,52 @@ func NewProxier(ctx context.Context, return proxier, nil } +// Create a knftables.Interface and check if we can use the nftables proxy mode on this host. +func getNFTablesInterface(ipFamily v1.IPFamily) (knftables.Interface, error) { + var nftablesFamily knftables.Family + if ipFamily == v1.IPv4Protocol { + nftablesFamily = knftables.IPv4Family + } else { + nftablesFamily = knftables.IPv6Family + } + + // We require (or rather, knftables.New does) that the nft binary be version 1.0.1 + // or later, because versions before that would always attempt to parse the entire + // nft ruleset at startup, even if you were only operating on a single table. + // That's bad, because in some cases, new versions of nft have added new rule + // types in ways that triggered bugs in older versions of nft, causing them to + // crash. Thus, if kube-proxy used nft < 1.0.1, it could potentially get locked + // out of its rules because of something some other component had done in a + // completely different table. + nft, err := knftables.New(nftablesFamily, kubeProxyTable) + if err != nil { + return nil, err + } + + // Likewise, we want to ensure that the host filesystem has nft >= 1.0.1, so that + // it's not possible that *our* rules break *the system's* nft. (In particular, we + // know that if kube-proxy uses nft >= 1.0.3 and the system has nft <= 0.9.8, that + // the system nft will become completely unusable.) Unfortunately, we can't easily + // figure out the version of nft installed on the host filesystem, so instead, we + // check the kernel version, under the assumption that the distro will have an nft + // binary that supports the same features as its kernel does, and so kernel 5.13 + // or later implies nft 1.0.1 or later. https://issues.k8s.io/122743 + // + // However, we allow the user to bypass this check by setting + // `KUBE_PROXY_NFTABLES_SKIP_KERNEL_VERSION_CHECK` to anything non-empty. + if os.Getenv("KUBE_PROXY_NFTABLES_SKIP_KERNEL_VERSION_CHECK") != "" { + kernelVersion, err := utilkernel.GetVersion() + if err != nil { + return nil, fmt.Errorf("could not check kernel version: %w", err) + } + if kernelVersion.LessThan(version.MustParseGeneric(utilkernel.NFTablesKubeProxyKernelVersion)) { + return nil, fmt.Errorf("kube-proxy in nftables mode requires kernel %s or later", utilkernel.NFTablesKubeProxyKernelVersion) + } + } + + return nft, nil +} + // internal struct for string service information type servicePortInfo struct { *proxy.BaseServicePortInfo diff --git a/pkg/util/kernel/constants.go b/pkg/util/kernel/constants.go index 5354f84272e..6775027e7a0 100644 --- a/pkg/util/kernel/constants.go +++ b/pkg/util/kernel/constants.go @@ -49,3 +49,8 @@ const IPVSConnReuseModeFixedKernelVersion = "5.9" const UserNamespacesSupportKernelVersion = "6.3" const TmpfsNoswapSupportKernelVersion = "6.4" + +// NFTablesKubeProxyKernelVersion is the lowest kernel version kube-proxy supports using +// nftables mode with by default. This is not directly related to any specific kernel +// commit; see https://issues.k8s.io/122743#issuecomment-1893922424 +const NFTablesKubeProxyKernelVersion = "5.13"