mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-30 15:05:27 +00:00
Merge pull request #122663 from aroradaman/drop-ct-state-invalid-rule
pkg/proxy/nftables: drop conntrack state invalid rule
This commit is contained in:
commit
12fc215656
@ -144,14 +144,6 @@ func diffNFTablesChain(nft *knftables.Fake, chain, expected string) string {
|
||||
return cmp.Diff(expected, result)
|
||||
}
|
||||
|
||||
// assertNFTablesChainEqual asserts that the indicated chain in nft's table contains
|
||||
// exactly the rules in expected (in that order).
|
||||
func assertNFTablesChainEqual(t *testing.T, line string, nft *knftables.Fake, chain, expected string) {
|
||||
if diff := diffNFTablesChain(nft, chain, expected); diff != "" {
|
||||
t.Errorf("rules do not match%s:\ndiff:\n%s", line, diff)
|
||||
}
|
||||
}
|
||||
|
||||
// nftablesTracer holds data used while virtually tracing a packet through a set of
|
||||
// iptables rules
|
||||
type nftablesTracer struct {
|
||||
@ -310,10 +302,6 @@ var ignoredRegexp = regexp.MustCompile(strings.Join(
|
||||
// The trace tests only check new connections, so for our purposes, this
|
||||
// check always succeeds (and thus can be ignored).
|
||||
`^ct state new`,
|
||||
|
||||
// Likewise, this rule never matches and thus never drops anything, and so
|
||||
// can be ignored.
|
||||
`^ct state invalid drop$`,
|
||||
},
|
||||
"|",
|
||||
))
|
||||
@ -640,8 +628,6 @@ func runPacketFlowTests(t *testing.T, line string, nft *knftables.Fake, nodeIPs
|
||||
var testInput = dedent.Dedent(`
|
||||
add table ip testing { comment "rules for kube-proxy" ; }
|
||||
|
||||
add chain ip testing forward
|
||||
add rule ip testing forward ct state invalid drop
|
||||
add chain ip testing mark-for-masquerade
|
||||
add rule ip testing mark-for-masquerade mark set mark or 0x4000
|
||||
add chain ip testing masquerading
|
||||
@ -697,7 +683,6 @@ var testExpected = dedent.Dedent(`
|
||||
add chain ip testing external-42NFTM6N-ns2/svc2/tcp/p80
|
||||
add chain ip testing firewall-allow-check
|
||||
add chain ip testing firewall-check
|
||||
add chain ip testing forward
|
||||
add chain ip testing mark-for-masquerade
|
||||
add chain ip testing masquerading
|
||||
add chain ip testing service-42NFTM6N-ns2/svc2/tcp/p80
|
||||
@ -712,7 +697,6 @@ var testExpected = dedent.Dedent(`
|
||||
add rule ip testing firewall-allow-check ip daddr . meta l4proto . th dport . ip saddr @firewall-allow return
|
||||
add rule ip testing firewall-allow-check drop
|
||||
add rule ip testing firewall-check ip daddr . meta l4proto . th dport @firewall jump firewall-allow-check
|
||||
add rule ip testing forward ct state invalid drop
|
||||
add rule ip testing mark-for-masquerade mark set mark or 0x4000
|
||||
add rule ip testing masquerading mark and 0x4000 == 0 return
|
||||
add rule ip testing masquerading mark set mark xor 0x4000
|
||||
|
@ -84,13 +84,8 @@ const (
|
||||
// masquerading
|
||||
kubeMarkMasqChain = "mark-for-masquerade"
|
||||
kubeMasqueradingChain = "masquerading"
|
||||
|
||||
// chain for special filtering rules
|
||||
kubeForwardChain = "forward"
|
||||
)
|
||||
|
||||
const sysctlNFConntrackTCPBeLiberal = "net/netfilter/nf_conntrack_tcp_be_liberal"
|
||||
|
||||
// internal struct for string service information
|
||||
type servicePortInfo struct {
|
||||
*proxy.BaseServicePortInfo
|
||||
@ -176,9 +171,6 @@ type Proxier struct {
|
||||
serviceHealthServer healthcheck.ServiceHealthServer
|
||||
healthzServer *healthcheck.ProxierHealthServer
|
||||
|
||||
// conntrackTCPLiberal indicates whether the system sets the kernel nf_conntrack_tcp_be_liberal
|
||||
conntrackTCPLiberal bool
|
||||
|
||||
// nodePortAddresses selects the interfaces where nodePort works.
|
||||
nodePortAddresses *proxyutil.NodePortAddresses
|
||||
// networkInterfacer defines an interface for several net library functions.
|
||||
@ -211,15 +203,6 @@ func NewProxier(ipFamily v1.IPFamily,
|
||||
) (*Proxier, error) {
|
||||
nodePortAddresses := proxyutil.NewNodePortAddresses(ipFamily, nodePortAddressStrings)
|
||||
|
||||
// Be conservative in what you do, be liberal in what you accept from others.
|
||||
// If it's non-zero, we mark only out of window RST segments as INVALID.
|
||||
// Ref: https://docs.kernel.org/networking/nf_conntrack-sysctl.html
|
||||
conntrackTCPLiberal := false
|
||||
if val, err := sysctl.GetSysctl(sysctlNFConntrackTCPBeLiberal); err == nil && val != 0 {
|
||||
conntrackTCPLiberal = true
|
||||
klog.InfoS("nf_conntrack_tcp_be_liberal set, not installing DROP rules for INVALID packets")
|
||||
}
|
||||
|
||||
if initOnly {
|
||||
klog.InfoS("System initialized and --init-only specified")
|
||||
return nil, nil
|
||||
@ -262,7 +245,6 @@ func NewProxier(ipFamily v1.IPFamily,
|
||||
healthzServer: healthzServer,
|
||||
nodePortAddresses: nodePortAddresses,
|
||||
networkInterfacer: proxyutil.RealNetwork{},
|
||||
conntrackTCPLiberal: conntrackTCPLiberal,
|
||||
staleChains: make(map[string]time.Time),
|
||||
}
|
||||
|
||||
@ -353,8 +335,6 @@ var nftablesJumpChains = []nftablesJumpChain{
|
||||
{kubeEndpointsCheckChain, "filter-forward", "ct state new"},
|
||||
{kubeEndpointsCheckChain, "filter-output", "ct state new"},
|
||||
|
||||
{kubeForwardChain, "filter-forward", ""},
|
||||
|
||||
{kubeFirewallCheckChain, "filter-prerouting", "ct state new"},
|
||||
{kubeFirewallCheckChain, "filter-output", "ct state new"},
|
||||
|
||||
@ -419,7 +399,7 @@ func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) {
|
||||
}
|
||||
|
||||
// Ensure all of our other "top-level" chains exist
|
||||
for _, chain := range []string{kubeServicesChain, kubeForwardChain, kubeMasqueradingChain, kubeMarkMasqChain} {
|
||||
for _, chain := range []string{kubeServicesChain, kubeMasqueradingChain, kubeMarkMasqChain} {
|
||||
ensureChain(chain, tx, createdChains)
|
||||
}
|
||||
|
||||
@ -449,17 +429,6 @@ func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) {
|
||||
Rule: "masquerade fully-random",
|
||||
})
|
||||
|
||||
// Drop the packets in INVALID state, which would potentially cause
|
||||
// unexpected connection reset if nf_conntrack_tcp_be_liberal is not set.
|
||||
// Ref: https://github.com/kubernetes/kubernetes/issues/74839
|
||||
// Ref: https://github.com/kubernetes/kubernetes/issues/117924
|
||||
if !proxier.conntrackTCPLiberal {
|
||||
tx.Add(&knftables.Rule{
|
||||
Chain: kubeForwardChain,
|
||||
Rule: "ct state invalid drop",
|
||||
})
|
||||
}
|
||||
|
||||
// Fill in nodeport-ips set if needed (or delete it if not). (We do "add+delete"
|
||||
// rather than just "delete" when we want to ensure the set doesn't exist, because
|
||||
// doing just "delete" would return an error if the set didn't exist.)
|
||||
|
@ -499,8 +499,6 @@ func TestOverallNFTablesRules(t *testing.T) {
|
||||
expected := dedent.Dedent(`
|
||||
add table ip kube-proxy { comment "rules for kube-proxy" ; }
|
||||
|
||||
add chain ip kube-proxy forward
|
||||
add rule ip kube-proxy forward ct state invalid drop
|
||||
add chain ip kube-proxy mark-for-masquerade
|
||||
add rule ip kube-proxy mark-for-masquerade mark set mark or 0x4000
|
||||
add chain ip kube-proxy masquerading
|
||||
@ -512,7 +510,6 @@ func TestOverallNFTablesRules(t *testing.T) {
|
||||
add rule ip kube-proxy filter-prerouting ct state new jump firewall-check
|
||||
add chain ip kube-proxy filter-forward { type filter hook forward priority -110 ; }
|
||||
add rule ip kube-proxy filter-forward ct state new jump endpoints-check
|
||||
add rule ip kube-proxy filter-forward jump forward
|
||||
add chain ip kube-proxy filter-input { type filter hook input priority -110 ; }
|
||||
add rule ip kube-proxy filter-input ct state new jump endpoints-check
|
||||
add chain ip kube-proxy filter-output { type filter hook output priority -110 ; }
|
||||
@ -1226,23 +1223,6 @@ func TestNodePorts(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestDropInvalidRule(t *testing.T) {
|
||||
for _, tcpLiberal := range []bool{false, true} {
|
||||
t.Run(fmt.Sprintf("tcpLiberal %t", tcpLiberal), func(t *testing.T) {
|
||||
nft, fp := NewFakeProxier(v1.IPv4Protocol)
|
||||
fp.conntrackTCPLiberal = tcpLiberal
|
||||
fp.syncProxyRules()
|
||||
|
||||
var expected string
|
||||
if !tcpLiberal {
|
||||
expected = "ct state invalid drop"
|
||||
}
|
||||
|
||||
assertNFTablesChainEqual(t, getLine(), nft, kubeForwardChain, expected)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestExternalTrafficPolicyLocal tests that traffic to externally-facing IPs does not get
|
||||
// masqueraded when using Local traffic policy. For traffic from external sources, that
|
||||
// means it can also only be routed to local endpoints, but for traffic from internal
|
||||
@ -4268,7 +4248,6 @@ func TestSyncProxyRulesRepeated(t *testing.T) {
|
||||
add chain ip kube-proxy filter-input { type filter hook input priority -110 ; }
|
||||
add chain ip kube-proxy filter-output { type filter hook output priority -110 ; }
|
||||
add chain ip kube-proxy firewall-check
|
||||
add chain ip kube-proxy forward
|
||||
add chain ip kube-proxy mark-for-masquerade
|
||||
add chain ip kube-proxy masquerading
|
||||
add chain ip kube-proxy nat-output { type nat hook output priority -100 ; }
|
||||
@ -4281,12 +4260,10 @@ func TestSyncProxyRulesRepeated(t *testing.T) {
|
||||
add rule ip kube-proxy endpoints-check fib daddr type local ip daddr != 127.0.0.0/8 meta l4proto . th dport vmap @no-endpoint-nodeports
|
||||
add rule ip kube-proxy filter-prerouting ct state new jump firewall-check
|
||||
add rule ip kube-proxy filter-forward ct state new jump endpoints-check
|
||||
add rule ip kube-proxy filter-forward jump forward
|
||||
add rule ip kube-proxy filter-input ct state new jump endpoints-check
|
||||
add rule ip kube-proxy filter-output ct state new jump endpoints-check
|
||||
add rule ip kube-proxy filter-output ct state new jump firewall-check
|
||||
add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport vmap @firewall-ips
|
||||
add rule ip kube-proxy forward ct state invalid drop
|
||||
add rule ip kube-proxy mark-for-masquerade mark set mark or 0x4000
|
||||
add rule ip kube-proxy masquerading mark and 0x4000 == 0 return
|
||||
add rule ip kube-proxy masquerading mark set mark xor 0x4000
|
||||
|
@ -433,13 +433,15 @@ var _ = common.SIGDescribe("Conntrack", func() {
|
||||
|
||||
})
|
||||
|
||||
// Regression test for #74839, where:
|
||||
// Packets considered INVALID by conntrack are now dropped. In particular, this fixes
|
||||
// a problem where spurious retransmits in a long-running TCP connection to a service
|
||||
// IP could result in the connection being closed with the error "Connection reset by
|
||||
// peer"
|
||||
// Regression test for #74839
|
||||
// Packets considered INVALID by conntrack are not NATed, this can result
|
||||
// in a problem where spurious retransmits in a long-running TCP connection
|
||||
// to a service IP ends up with "Connection reset by peer" error.
|
||||
// Proxy implementations (which leverage conntrack) can either drop packets
|
||||
// marked INVALID by conntrack or enforce `nf_conntrack_tcp_be_liberal` to
|
||||
// overcome this.
|
||||
// xref: https://kubernetes.io/blog/2019/03/29/kube-proxy-subtleties-debugging-an-intermittent-connection-reset/
|
||||
ginkgo.It("should drop INVALID conntrack entries [Privileged]", func(ctx context.Context) {
|
||||
ginkgo.It("proxy implementation should not be vulnerable to the invalid conntrack state bug [Privileged]", func(ctx context.Context) {
|
||||
serverLabel := map[string]string{
|
||||
"app": "boom-server",
|
||||
}
|
||||
@ -535,22 +537,21 @@ var _ = common.SIGDescribe("Conntrack", func() {
|
||||
e2epod.NewPodClient(fr).CreateSync(ctx, pod)
|
||||
ginkgo.By("Client pod created")
|
||||
|
||||
// The client will open connections against the server
|
||||
// The server will inject invalid packets
|
||||
// if conntrack does not drop the invalid packets it will go through without NAT
|
||||
// so the client will receive an unexpected TCP connection and RST the connection
|
||||
// the server will log ERROR if that happens
|
||||
ginkgo.By("checking client pod does not RST the TCP connection because it receives an INVALID packet")
|
||||
// The client will open connections against the server.
|
||||
// The server will inject packets with out-of-window sequence numbers and
|
||||
// if these packets go without NAT client will receive an unexpected TCP
|
||||
// packet and RST the connection, the server will log ERROR if that happens.
|
||||
ginkgo.By("checking client pod does not RST the TCP connection because it receives an out-of-window packet")
|
||||
if err := wait.PollUntilContextTimeout(ctx, 5*time.Second, time.Minute, true, logContainsFn("ERROR", "boom-server")); err == nil {
|
||||
logs, err := e2epod.GetPodLogs(ctx, cs, ns, "boom-server", "boom-server")
|
||||
framework.ExpectNoError(err)
|
||||
framework.Logf("boom-server pod logs: %s", logs)
|
||||
framework.Failf("boom-server pod received a RST from the client")
|
||||
framework.Failf("boom-server pod received a RST from the client, enabling nf_conntrack_tcp_be_liberal or dropping packets marked invalid by conntrack might help here.")
|
||||
}
|
||||
|
||||
logs, err := e2epod.GetPodLogs(ctx, cs, ns, "boom-server", "boom-server")
|
||||
framework.ExpectNoError(err)
|
||||
if !strings.Contains(string(logs), "connection established") {
|
||||
if !strings.Contains(logs, "connection established") {
|
||||
framework.Logf("boom-server pod logs: %s", logs)
|
||||
framework.Failf("boom-server pod did not send any bad packet to the client")
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user