From 933bcc123b4f0cd6ef6f268da8d60eeed7b1b604 Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Mon, 4 Sep 2023 15:58:13 +0000 Subject: [PATCH] only drop invalid cstate packets if non liberal Conntrack invalid packets may cause unexpected and subtle bugs on esblished connections, because of that we install by default an iptables rules that drops the packets with this conntrack state. However, there are network scenarios, specially those that use multihoming nodes, that may have legit traffic that is detected by conntrack as invalid, hence these iptables rules are causing problems dropping this traffic. An alternative to solve the spurious problems caused by the invalid connectrack packets is to set the sysctl nf_conntrack_tcp_be_liberal option, but this is a system wide setting and we don't want kube-proxy to be opinionated about the whole node networking configuration. Kube-proxy will only install the DROP rules for invalid conntrack states if the nf_conntrack_tcp_be_liberal is not set. Change-Id: I5eb326931ed915f5ae74d210f0a375842b6a790e --- pkg/proxy/iptables/proxier.go | 33 +++++++++++++++++------ pkg/proxy/iptables/proxier_test.go | 43 ++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 8 deletions(-) diff --git a/pkg/proxy/iptables/proxier.go b/pkg/proxy/iptables/proxier.go index df7a282111f..e8442ff9632 100644 --- a/pkg/proxy/iptables/proxier.go +++ b/pkg/proxy/iptables/proxier.go @@ -92,6 +92,7 @@ const ( const sysctlRouteLocalnet = "net/ipv4/conf/all/route_localnet" const sysctlBridgeCallIPTables = "net/bridge/bridge-nf-call-iptables" +const sysctlNFConntrackTCPBeLiberal = "net/netfilter/nf_conntrack_tcp_be_liberal" // internal struct for string service information type servicePortInfo struct { @@ -196,6 +197,10 @@ type Proxier struct { // localhostNodePorts indicates whether we allow NodePort services to be accessed // via localhost. localhostNodePorts bool + + // conntrackTCPLiberal indicates whether the system sets the kernel nf_conntrack_tcp_be_liberal + conntrackTCPLiberal bool + // nodePortAddresses selects the interfaces where nodePort works. nodePortAddresses *proxyutil.NodePortAddresses // networkInterfacer defines an interface for several net library functions. @@ -241,6 +246,14 @@ func NewProxier(ipFamily v1.IPFamily, } } + // Be conservative in what you do, be liberal in what you accept from others. + // If it's non-zero, we mark only out of window RST segments as INVALID. + // Ref: https://docs.kernel.org/networking/nf_conntrack-sysctl.html + conntrackTCPLiberal := false + if val, err := sysctl.GetSysctl(sysctlNFConntrackTCPBeLiberal); err == nil && val != 0 { + conntrackTCPLiberal = true + klog.InfoS("nf_conntrack_tcp_be_liberal set, not installing DROP rules for INVALID packets") + } // Proxy needs br_netfilter and bridge-nf-call-iptables=1 when containers // are connected to a Linux bridge (but not SDN bridges). Until most // plugins handle this, log when config is missing @@ -282,6 +295,7 @@ func NewProxier(ipFamily v1.IPFamily, localhostNodePorts: localhostNodePorts, nodePortAddresses: nodePortAddresses, networkInterfacer: proxyutil.RealNetwork{}, + conntrackTCPLiberal: conntrackTCPLiberal, } burstSyncs := 2 @@ -1443,14 +1457,17 @@ func (proxier *Proxier) syncProxyRules() { } // Drop the packets in INVALID state, which would potentially cause - // unexpected connection reset. - // https://github.com/kubernetes/kubernetes/issues/74839 - proxier.filterRules.Write( - "-A", string(kubeForwardChain), - "-m", "conntrack", - "--ctstate", "INVALID", - "-j", "DROP", - ) + // unexpected connection reset if nf_conntrack_tcp_be_liberal is not set. + // Ref: https://github.com/kubernetes/kubernetes/issues/74839 + // Ref: https://github.com/kubernetes/kubernetes/issues/117924 + if !proxier.conntrackTCPLiberal { + proxier.filterRules.Write( + "-A", string(kubeForwardChain), + "-m", "conntrack", + "--ctstate", "INVALID", + "-j", "DROP", + ) + } // If the masqueradeMark has been added then we want to forward that same // traffic, this allows NodePort traffic to be forwarded even if the default diff --git a/pkg/proxy/iptables/proxier_test.go b/pkg/proxy/iptables/proxier_test.go index 591a87aee8f..ca00461124b 100644 --- a/pkg/proxy/iptables/proxier_test.go +++ b/pkg/proxy/iptables/proxier_test.go @@ -2565,6 +2565,49 @@ func TestHealthCheckNodePort(t *testing.T) { }) } +func TestDropInvalidRule(t *testing.T) { + for _, testcase := range []bool{false, true} { + t.Run(fmt.Sprintf("tcpLiberal %t", testcase), func(t *testing.T) { + ipt := iptablestest.NewFake() + fp := NewFakeProxier(ipt) + fp.conntrackTCPLiberal = testcase + fp.syncProxyRules() + + expected := dedent.Dedent(` + *filter + :KUBE-NODEPORTS - [0:0] + :KUBE-SERVICES - [0:0] + :KUBE-EXTERNAL-SERVICES - [0:0] + :KUBE-FIREWALL - [0:0] + :KUBE-FORWARD - [0:0] + :KUBE-PROXY-FIREWALL - [0:0] + -A KUBE-FIREWALL -m comment --comment "block incoming localnet connections" -d 127.0.0.0/8 ! -s 127.0.0.0/8 -m conntrack ! --ctstate RELATED,ESTABLISHED,DNAT -j DROP`) + if !testcase { + expected += "\n-A KUBE-FORWARD -m conntrack --ctstate INVALID -j DROP" + } + + expected += dedent.Dedent(` + -A KUBE-FORWARD -m comment --comment "kubernetes forwarding rules" -m mark --mark 0x4000/0x4000 -j ACCEPT + -A KUBE-FORWARD -m comment --comment "kubernetes forwarding conntrack rule" -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT + COMMIT + *nat + :KUBE-NODEPORTS - [0:0] + :KUBE-SERVICES - [0:0] + :KUBE-MARK-MASQ - [0:0] + :KUBE-POSTROUTING - [0:0] + -A KUBE-SERVICES -m comment --comment "kubernetes service nodeports; NOTE: this must be the last rule in this chain" -m addrtype --dst-type LOCAL -j KUBE-NODEPORTS + -A KUBE-MARK-MASQ -j MARK --or-mark 0x4000 + -A KUBE-POSTROUTING -m mark ! --mark 0x4000/0x4000 -j RETURN + -A KUBE-POSTROUTING -j MARK --xor-mark 0x4000 + -A KUBE-POSTROUTING -m comment --comment "kubernetes service traffic requiring SNAT" -j MASQUERADE + COMMIT + `) + + assertIPTablesRulesEqual(t, getLine(), true, expected, fp.iptablesData.String()) + }) + } +} + func TestMasqueradeRule(t *testing.T) { for _, testcase := range []bool{false, true} { ipt := iptablestest.NewFake().SetHasRandomFully(testcase)