From 4ffa12b9d9713656e790064bc0f3151002b6aef4 Mon Sep 17 00:00:00 2001 From: Daman Arora Date: Wed, 10 Jan 2024 22:53:09 +0530 Subject: [PATCH 1/2] pkg/proxy/nftables: drop ct-state-invalid rule Signed-off-by: Daman Arora --- pkg/proxy/nftables/helpers_test.go | 16 --------------- pkg/proxy/nftables/proxier.go | 33 +----------------------------- pkg/proxy/nftables/proxier_test.go | 23 --------------------- 3 files changed, 1 insertion(+), 71 deletions(-) diff --git a/pkg/proxy/nftables/helpers_test.go b/pkg/proxy/nftables/helpers_test.go index 3a681e13cc9..b146c607b15 100644 --- a/pkg/proxy/nftables/helpers_test.go +++ b/pkg/proxy/nftables/helpers_test.go @@ -144,14 +144,6 @@ func diffNFTablesChain(nft *knftables.Fake, chain, expected string) string { return cmp.Diff(expected, result) } -// assertNFTablesChainEqual asserts that the indicated chain in nft's table contains -// exactly the rules in expected (in that order). -func assertNFTablesChainEqual(t *testing.T, line string, nft *knftables.Fake, chain, expected string) { - if diff := diffNFTablesChain(nft, chain, expected); diff != "" { - t.Errorf("rules do not match%s:\ndiff:\n%s", line, diff) - } -} - // nftablesTracer holds data used while virtually tracing a packet through a set of // iptables rules type nftablesTracer struct { @@ -310,10 +302,6 @@ var ignoredRegexp = regexp.MustCompile(strings.Join( // The trace tests only check new connections, so for our purposes, this // check always succeeds (and thus can be ignored). `^ct state new`, - - // Likewise, this rule never matches and thus never drops anything, and so - // can be ignored. - `^ct state invalid drop$`, }, "|", )) @@ -640,8 +628,6 @@ func runPacketFlowTests(t *testing.T, line string, nft *knftables.Fake, nodeIPs var testInput = dedent.Dedent(` add table ip testing { comment "rules for kube-proxy" ; } - add chain ip testing forward - add rule ip testing forward ct state invalid drop add chain ip testing mark-for-masquerade add rule ip testing mark-for-masquerade mark set mark or 0x4000 add chain ip testing masquerading @@ -697,7 +683,6 @@ var testExpected = dedent.Dedent(` add chain ip testing external-42NFTM6N-ns2/svc2/tcp/p80 add chain ip testing firewall-allow-check add chain ip testing firewall-check - add chain ip testing forward add chain ip testing mark-for-masquerade add chain ip testing masquerading add chain ip testing service-42NFTM6N-ns2/svc2/tcp/p80 @@ -712,7 +697,6 @@ var testExpected = dedent.Dedent(` add rule ip testing firewall-allow-check ip daddr . meta l4proto . th dport . ip saddr @firewall-allow return add rule ip testing firewall-allow-check drop add rule ip testing firewall-check ip daddr . meta l4proto . th dport @firewall jump firewall-allow-check - add rule ip testing forward ct state invalid drop add rule ip testing mark-for-masquerade mark set mark or 0x4000 add rule ip testing masquerading mark and 0x4000 == 0 return add rule ip testing masquerading mark set mark xor 0x4000 diff --git a/pkg/proxy/nftables/proxier.go b/pkg/proxy/nftables/proxier.go index 0ae1281b788..5741e0f05aa 100644 --- a/pkg/proxy/nftables/proxier.go +++ b/pkg/proxy/nftables/proxier.go @@ -84,13 +84,8 @@ const ( // masquerading kubeMarkMasqChain = "mark-for-masquerade" kubeMasqueradingChain = "masquerading" - - // chain for special filtering rules - kubeForwardChain = "forward" ) -const sysctlNFConntrackTCPBeLiberal = "net/netfilter/nf_conntrack_tcp_be_liberal" - // internal struct for string service information type servicePortInfo struct { *proxy.BaseServicePortInfo @@ -176,9 +171,6 @@ type Proxier struct { serviceHealthServer healthcheck.ServiceHealthServer healthzServer *healthcheck.ProxierHealthServer - // conntrackTCPLiberal indicates whether the system sets the kernel nf_conntrack_tcp_be_liberal - conntrackTCPLiberal bool - // nodePortAddresses selects the interfaces where nodePort works. nodePortAddresses *proxyutil.NodePortAddresses // networkInterfacer defines an interface for several net library functions. @@ -211,15 +203,6 @@ func NewProxier(ipFamily v1.IPFamily, ) (*Proxier, error) { nodePortAddresses := proxyutil.NewNodePortAddresses(ipFamily, nodePortAddressStrings) - // Be conservative in what you do, be liberal in what you accept from others. - // If it's non-zero, we mark only out of window RST segments as INVALID. - // Ref: https://docs.kernel.org/networking/nf_conntrack-sysctl.html - conntrackTCPLiberal := false - if val, err := sysctl.GetSysctl(sysctlNFConntrackTCPBeLiberal); err == nil && val != 0 { - conntrackTCPLiberal = true - klog.InfoS("nf_conntrack_tcp_be_liberal set, not installing DROP rules for INVALID packets") - } - if initOnly { klog.InfoS("System initialized and --init-only specified") return nil, nil @@ -262,7 +245,6 @@ func NewProxier(ipFamily v1.IPFamily, healthzServer: healthzServer, nodePortAddresses: nodePortAddresses, networkInterfacer: proxyutil.RealNetwork{}, - conntrackTCPLiberal: conntrackTCPLiberal, staleChains: make(map[string]time.Time), } @@ -353,8 +335,6 @@ var nftablesJumpChains = []nftablesJumpChain{ {kubeEndpointsCheckChain, "filter-forward", "ct state new"}, {kubeEndpointsCheckChain, "filter-output", "ct state new"}, - {kubeForwardChain, "filter-forward", ""}, - {kubeFirewallCheckChain, "filter-prerouting", "ct state new"}, {kubeFirewallCheckChain, "filter-output", "ct state new"}, @@ -419,7 +399,7 @@ func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) { } // Ensure all of our other "top-level" chains exist - for _, chain := range []string{kubeServicesChain, kubeForwardChain, kubeMasqueradingChain, kubeMarkMasqChain} { + for _, chain := range []string{kubeServicesChain, kubeMasqueradingChain, kubeMarkMasqChain} { ensureChain(chain, tx, createdChains) } @@ -449,17 +429,6 @@ func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) { Rule: "masquerade fully-random", }) - // Drop the packets in INVALID state, which would potentially cause - // unexpected connection reset if nf_conntrack_tcp_be_liberal is not set. - // Ref: https://github.com/kubernetes/kubernetes/issues/74839 - // Ref: https://github.com/kubernetes/kubernetes/issues/117924 - if !proxier.conntrackTCPLiberal { - tx.Add(&knftables.Rule{ - Chain: kubeForwardChain, - Rule: "ct state invalid drop", - }) - } - // Fill in nodeport-ips set if needed (or delete it if not). (We do "add+delete" // rather than just "delete" when we want to ensure the set doesn't exist, because // doing just "delete" would return an error if the set didn't exist.) diff --git a/pkg/proxy/nftables/proxier_test.go b/pkg/proxy/nftables/proxier_test.go index 1173789d3ff..5f91b8acb5f 100644 --- a/pkg/proxy/nftables/proxier_test.go +++ b/pkg/proxy/nftables/proxier_test.go @@ -499,8 +499,6 @@ func TestOverallNFTablesRules(t *testing.T) { expected := dedent.Dedent(` add table ip kube-proxy { comment "rules for kube-proxy" ; } - add chain ip kube-proxy forward - add rule ip kube-proxy forward ct state invalid drop add chain ip kube-proxy mark-for-masquerade add rule ip kube-proxy mark-for-masquerade mark set mark or 0x4000 add chain ip kube-proxy masquerading @@ -512,7 +510,6 @@ func TestOverallNFTablesRules(t *testing.T) { add rule ip kube-proxy filter-prerouting ct state new jump firewall-check add chain ip kube-proxy filter-forward { type filter hook forward priority -110 ; } add rule ip kube-proxy filter-forward ct state new jump endpoints-check - add rule ip kube-proxy filter-forward jump forward add chain ip kube-proxy filter-input { type filter hook input priority -110 ; } add rule ip kube-proxy filter-input ct state new jump endpoints-check add chain ip kube-proxy filter-output { type filter hook output priority -110 ; } @@ -1226,23 +1223,6 @@ func TestNodePorts(t *testing.T) { } } -func TestDropInvalidRule(t *testing.T) { - for _, tcpLiberal := range []bool{false, true} { - t.Run(fmt.Sprintf("tcpLiberal %t", tcpLiberal), func(t *testing.T) { - nft, fp := NewFakeProxier(v1.IPv4Protocol) - fp.conntrackTCPLiberal = tcpLiberal - fp.syncProxyRules() - - var expected string - if !tcpLiberal { - expected = "ct state invalid drop" - } - - assertNFTablesChainEqual(t, getLine(), nft, kubeForwardChain, expected) - }) - } -} - // TestExternalTrafficPolicyLocal tests that traffic to externally-facing IPs does not get // masqueraded when using Local traffic policy. For traffic from external sources, that // means it can also only be routed to local endpoints, but for traffic from internal @@ -4268,7 +4248,6 @@ func TestSyncProxyRulesRepeated(t *testing.T) { add chain ip kube-proxy filter-input { type filter hook input priority -110 ; } add chain ip kube-proxy filter-output { type filter hook output priority -110 ; } add chain ip kube-proxy firewall-check - add chain ip kube-proxy forward add chain ip kube-proxy mark-for-masquerade add chain ip kube-proxy masquerading add chain ip kube-proxy nat-output { type nat hook output priority -100 ; } @@ -4281,12 +4260,10 @@ func TestSyncProxyRulesRepeated(t *testing.T) { add rule ip kube-proxy endpoints-check fib daddr type local ip daddr != 127.0.0.0/8 meta l4proto . th dport vmap @no-endpoint-nodeports add rule ip kube-proxy filter-prerouting ct state new jump firewall-check add rule ip kube-proxy filter-forward ct state new jump endpoints-check - add rule ip kube-proxy filter-forward jump forward add rule ip kube-proxy filter-input ct state new jump endpoints-check add rule ip kube-proxy filter-output ct state new jump endpoints-check add rule ip kube-proxy filter-output ct state new jump firewall-check add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport vmap @firewall-ips - add rule ip kube-proxy forward ct state invalid drop add rule ip kube-proxy mark-for-masquerade mark set mark or 0x4000 add rule ip kube-proxy masquerading mark and 0x4000 == 0 return add rule ip kube-proxy masquerading mark set mark xor 0x4000 From b0e929264fb151cd5d2c502ba0bebf2e218e9e4b Mon Sep 17 00:00:00 2001 From: Daman Arora Date: Thu, 11 Jan 2024 03:11:42 +0530 Subject: [PATCH 2/2] e2e/network/conntrack: rename invalid conntrack state test Signed-off-by: Daman Arora --- test/e2e/network/conntrack.go | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/test/e2e/network/conntrack.go b/test/e2e/network/conntrack.go index 9764f3312cd..cc53a456e68 100644 --- a/test/e2e/network/conntrack.go +++ b/test/e2e/network/conntrack.go @@ -433,13 +433,15 @@ var _ = common.SIGDescribe("Conntrack", func() { }) - // Regression test for #74839, where: - // Packets considered INVALID by conntrack are now dropped. In particular, this fixes - // a problem where spurious retransmits in a long-running TCP connection to a service - // IP could result in the connection being closed with the error "Connection reset by - // peer" + // Regression test for #74839 + // Packets considered INVALID by conntrack are not NATed, this can result + // in a problem where spurious retransmits in a long-running TCP connection + // to a service IP ends up with "Connection reset by peer" error. + // Proxy implementations (which leverage conntrack) can either drop packets + // marked INVALID by conntrack or enforce `nf_conntrack_tcp_be_liberal` to + // overcome this. // xref: https://kubernetes.io/blog/2019/03/29/kube-proxy-subtleties-debugging-an-intermittent-connection-reset/ - ginkgo.It("should drop INVALID conntrack entries [Privileged]", func(ctx context.Context) { + ginkgo.It("proxy implementation should not be vulnerable to the invalid conntrack state bug [Privileged]", func(ctx context.Context) { serverLabel := map[string]string{ "app": "boom-server", } @@ -535,22 +537,21 @@ var _ = common.SIGDescribe("Conntrack", func() { e2epod.NewPodClient(fr).CreateSync(ctx, pod) ginkgo.By("Client pod created") - // The client will open connections against the server - // The server will inject invalid packets - // if conntrack does not drop the invalid packets it will go through without NAT - // so the client will receive an unexpected TCP connection and RST the connection - // the server will log ERROR if that happens - ginkgo.By("checking client pod does not RST the TCP connection because it receives an INVALID packet") + // The client will open connections against the server. + // The server will inject packets with out-of-window sequence numbers and + // if these packets go without NAT client will receive an unexpected TCP + // packet and RST the connection, the server will log ERROR if that happens. + ginkgo.By("checking client pod does not RST the TCP connection because it receives an out-of-window packet") if err := wait.PollUntilContextTimeout(ctx, 5*time.Second, time.Minute, true, logContainsFn("ERROR", "boom-server")); err == nil { logs, err := e2epod.GetPodLogs(ctx, cs, ns, "boom-server", "boom-server") framework.ExpectNoError(err) framework.Logf("boom-server pod logs: %s", logs) - framework.Failf("boom-server pod received a RST from the client") + framework.Failf("boom-server pod received a RST from the client, enabling nf_conntrack_tcp_be_liberal or dropping packets marked invalid by conntrack might help here.") } logs, err := e2epod.GetPodLogs(ctx, cs, ns, "boom-server", "boom-server") framework.ExpectNoError(err) - if !strings.Contains(string(logs), "connection established") { + if !strings.Contains(logs, "connection established") { framework.Logf("boom-server pod logs: %s", logs) framework.Failf("boom-server pod did not send any bad packet to the client") }