From c12534d8b4724424405f848fc7a9afe830859891 Mon Sep 17 00:00:00 2001 From: Dan Winship Date: Thu, 11 Jun 2020 09:11:03 -0400 Subject: [PATCH] kubelet, kube-proxy: unmark packets before masquerading them It seems that if you set the packet mark on a packet and then route that packet through a kernel VXLAN interface, the VXLAN-encapsulated packet will still have the mark from the original packet. Since our NAT rules are based on the packet mark, this was causing us to double-NAT some packets, which then triggered a kernel checksumming bug. But even without the checksum bug, there are reasons to avoid double-NATting, so fix the rules to unmark the packets before masquerading them. --- pkg/kubelet/kubelet_network_linux.go | 26 ++++++++++++++++++++------ pkg/kubelet/kubelet_network_test.go | 4 ++-- pkg/proxy/iptables/proxier.go | 18 ++++++++++++++---- pkg/proxy/iptables/proxier_test.go | 9 ++++++--- pkg/proxy/ipvs/proxier.go | 18 ++++++++++++++---- 5 files changed, 56 insertions(+), 19 deletions(-) diff --git a/pkg/kubelet/kubelet_network_linux.go b/pkg/kubelet/kubelet_network_linux.go index 4b5ca87674c..6b0ab9a4df9 100644 --- a/pkg/kubelet/kubelet_network_linux.go +++ b/pkg/kubelet/kubelet_network_linux.go @@ -62,7 +62,7 @@ func (kl *Kubelet) syncNetworkUtil() { klog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableNAT, KubeMarkDropChain, err) return } - if _, err := kl.iptClient.EnsureRule(utiliptables.Append, utiliptables.TableNAT, KubeMarkDropChain, "-j", "MARK", "--set-xmark", dropMark); err != nil { + if _, err := kl.iptClient.EnsureRule(utiliptables.Append, utiliptables.TableNAT, KubeMarkDropChain, "-j", "MARK", "--or-mark", dropMark); err != nil { klog.Errorf("Failed to ensure marking rule for %v: %v", KubeMarkDropChain, err) return } @@ -72,7 +72,7 @@ func (kl *Kubelet) syncNetworkUtil() { } if _, err := kl.iptClient.EnsureRule(utiliptables.Append, utiliptables.TableFilter, KubeFirewallChain, "-m", "comment", "--comment", "kubernetes firewall for dropping marked packets", - "-m", "mark", "--mark", dropMark, + "-m", "mark", "--mark", fmt.Sprintf("%s/%s", dropMark, dropMark), "-j", "DROP"); err != nil { klog.Errorf("Failed to ensure rule to drop packet marked by %v in %v chain %v: %v", KubeMarkDropChain, utiliptables.TableFilter, KubeFirewallChain, err) return @@ -112,7 +112,7 @@ func (kl *Kubelet) syncNetworkUtil() { klog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableNAT, KubePostroutingChain, err) return } - if _, err := kl.iptClient.EnsureRule(utiliptables.Append, utiliptables.TableNAT, KubeMarkMasqChain, "-j", "MARK", "--set-xmark", masqueradeMark); err != nil { + if _, err := kl.iptClient.EnsureRule(utiliptables.Append, utiliptables.TableNAT, KubeMarkMasqChain, "-j", "MARK", "--or-mark", masqueradeMark); err != nil { klog.Errorf("Failed to ensure marking rule for %v: %v", KubeMarkMasqChain, err) return } @@ -121,12 +121,26 @@ func (kl *Kubelet) syncNetworkUtil() { klog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", utiliptables.TableNAT, utiliptables.ChainPostrouting, KubePostroutingChain, err) return } - // Establish the masquerading rule. + + // Set up KUBE-POSTROUTING to unmark and masquerade marked packets // NB: THIS MUST MATCH the corresponding code in the iptables and ipvs // modes of kube-proxy + if _, err := kl.iptClient.EnsureRule(utiliptables.Append, utiliptables.TableNAT, KubePostroutingChain, + "-m", "mark", "!", "--mark", fmt.Sprintf("%s/%s", masqueradeMark, masqueradeMark), + "-j", "RETURN"); err != nil { + klog.Errorf("Failed to ensure filtering rule for %v: %v", KubePostroutingChain, err) + return + } + // Clear the mark to avoid re-masquerading if the packet re-traverses the network stack. + // We know the mark bit is currently set so we can use --xor-mark to clear it (without needing + // to Sprintf another bitmask). + if _, err := kl.iptClient.EnsureRule(utiliptables.Append, utiliptables.TableNAT, KubePostroutingChain, + "-j", "MARK", "--xor-mark", masqueradeMark); err != nil { + klog.Errorf("Failed to ensure unmarking rule for %v: %v", KubePostroutingChain, err) + return + } masqRule := []string{ "-m", "comment", "--comment", "kubernetes service traffic requiring SNAT", - "-m", "mark", "--mark", masqueradeMark, "-j", "MASQUERADE", } if kl.iptClient.HasRandomFully() { @@ -141,5 +155,5 @@ func (kl *Kubelet) syncNetworkUtil() { // getIPTablesMark returns the fwmark given the bit func getIPTablesMark(bit int) string { value := 1 << uint(bit) - return fmt.Sprintf("%#08x/%#08x", value, value) + return fmt.Sprintf("%#08x", value) } diff --git a/pkg/kubelet/kubelet_network_test.go b/pkg/kubelet/kubelet_network_test.go index 2dd0a5ffcf3..8ba8e4415e1 100644 --- a/pkg/kubelet/kubelet_network_test.go +++ b/pkg/kubelet/kubelet_network_test.go @@ -31,11 +31,11 @@ func TestGetIPTablesMark(t *testing.T) { }{ { 14, - "0x00004000/0x00004000", + "0x00004000", }, { 15, - "0x00008000/0x00008000", + "0x00008000", }, } for _, tc := range tests { diff --git a/pkg/proxy/iptables/proxier.go b/pkg/proxy/iptables/proxier.go index eb121ac304a..6ad2af83002 100644 --- a/pkg/proxy/iptables/proxier.go +++ b/pkg/proxy/iptables/proxier.go @@ -282,7 +282,7 @@ func NewProxier(ipt utiliptables.Interface, // Generate the masquerade mark to use for SNAT rules. masqueradeValue := 1 << uint(masqueradeBit) - masqueradeMark := fmt.Sprintf("%#08x/%#08x", masqueradeValue, masqueradeValue) + masqueradeMark := fmt.Sprintf("%#08x", masqueradeValue) klog.V(2).Infof("iptables(%s) masquerade mark: %s", ipt.Protocol(), masqueradeMark) endpointSlicesEnabled := utilfeature.DefaultFeatureGate.Enabled(features.EndpointSliceProxying) @@ -919,10 +919,20 @@ func (proxier *Proxier) syncProxyRules() { // this so that it is easier to flush and change, for example if the mark // value should ever change. // NB: THIS MUST MATCH the corresponding code in the kubelet + writeLine(proxier.natRules, []string{ + "-A", string(kubePostroutingChain), + "-m", "mark", "!", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark), + "-j", "RETURN", + }...) + // Clear the mark to avoid re-masquerading if the packet re-traverses the network stack. + writeLine(proxier.natRules, []string{ + "-A", string(kubePostroutingChain), + // XOR proxier.masqueradeMark to unset it + "-j", "MARK", "--xor-mark", proxier.masqueradeMark, + }...) masqRule := []string{ "-A", string(kubePostroutingChain), "-m", "comment", "--comment", `"kubernetes service traffic requiring SNAT"`, - "-m", "mark", "--mark", proxier.masqueradeMark, "-j", "MASQUERADE", } if proxier.iptables.HasRandomFully() { @@ -935,7 +945,7 @@ func (proxier *Proxier) syncProxyRules() { // value should ever change. writeLine(proxier.natRules, []string{ "-A", string(KubeMarkMasqChain), - "-j", "MARK", "--set-xmark", proxier.masqueradeMark, + "-j", "MARK", "--or-mark", proxier.masqueradeMark, }...) // Accumulate NAT chains to keep. @@ -1514,7 +1524,7 @@ func (proxier *Proxier) syncProxyRules() { writeLine(proxier.filterRules, "-A", string(kubeForwardChain), "-m", "comment", "--comment", `"kubernetes forwarding rules"`, - "-m", "mark", "--mark", proxier.masqueradeMark, + "-m", "mark", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark), "-j", "ACCEPT", ) diff --git a/pkg/proxy/iptables/proxier_test.go b/pkg/proxy/iptables/proxier_test.go index ff77764bb7c..0b12b4f7bf7 100644 --- a/pkg/proxy/iptables/proxier_test.go +++ b/pkg/proxy/iptables/proxier_test.go @@ -357,6 +357,7 @@ func NewFakeProxier(ipt utiliptables.Interface, endpointSlicesEnabled bool) *Pro endpointsMap: make(proxy.EndpointsMap), endpointsChanges: proxy.NewEndpointChangeTracker(testHostname, newEndpointInfo, nil, nil, endpointSlicesEnabled), iptables: ipt, + masqueradeMark: "0x4000", localDetector: detectLocal, hostname: testHostname, portsMap: make(map[utilproxy.LocalPort]utilproxy.Closeable), @@ -2418,7 +2419,7 @@ func TestEndpointSliceE2E(t *testing.T) { :KUBE-EXTERNAL-SERVICES - [0:0] :KUBE-FORWARD - [0:0] -A KUBE-FORWARD -m conntrack --ctstate INVALID -j DROP --A KUBE-FORWARD -m comment --comment "kubernetes forwarding rules" -m mark --mark -j ACCEPT +-A KUBE-FORWARD -m comment --comment "kubernetes forwarding rules" -m mark --mark 0x4000/0x4000 -j ACCEPT -A KUBE-FORWARD -m comment --comment "kubernetes forwarding conntrack pod source rule" -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT -A KUBE-FORWARD -m comment --comment "kubernetes forwarding conntrack pod destination rule" -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT COMMIT @@ -2431,8 +2432,10 @@ COMMIT :KUBE-SEP-3JOIVZTXZZRGORX4 - [0:0] :KUBE-SEP-IO5XOSKPAXIFQXAJ - [0:0] :KUBE-SEP-XGJFVO3L2O5SRFNT - [0:0] --A KUBE-POSTROUTING -m comment --comment "kubernetes service traffic requiring SNAT" -m mark --mark -j MASQUERADE --A KUBE-MARK-MASQ -j MARK --set-xmark +-A KUBE-POSTROUTING -m mark ! --mark 0x4000/0x4000 -j RETURN +-A KUBE-POSTROUTING -j MARK --xor-mark 0x4000 +-A KUBE-POSTROUTING -m comment --comment "kubernetes service traffic requiring SNAT" -j MASQUERADE +-A KUBE-MARK-MASQ -j MARK --or-mark 0x4000 -A KUBE-SERVICES -m comment --comment "ns1/svc1 cluster IP" -m tcp -p tcp -d 172.20.1.1/32 --dport 0 ! -s 10.0.0.0/24 -j KUBE-MARK-MASQ -A KUBE-SERVICES -m comment --comment "ns1/svc1 cluster IP" -m tcp -p tcp -d 172.20.1.1/32 --dport 0 -j KUBE-SVC-AQI2S6QIMU7PVVRP -A KUBE-SVC-AQI2S6QIMU7PVVRP -m comment --comment ns1/svc1 -m statistic --mode random --probability 0.3333333333 -j KUBE-SEP-3JOIVZTXZZRGORX4 diff --git a/pkg/proxy/ipvs/proxier.go b/pkg/proxy/ipvs/proxier.go index 77270dae477..1d4433d77e6 100644 --- a/pkg/proxy/ipvs/proxier.go +++ b/pkg/proxy/ipvs/proxier.go @@ -418,7 +418,7 @@ func NewProxier(ipt utiliptables.Interface, // Generate the masquerade mark to use for SNAT rules. masqueradeValue := 1 << uint(masqueradeBit) - masqueradeMark := fmt.Sprintf("%#08x/%#08x", masqueradeValue, masqueradeValue) + masqueradeMark := fmt.Sprintf("%#08x", masqueradeValue) isIPv6 := utilnet.IsIPv6(nodeIP) @@ -1758,7 +1758,7 @@ func (proxier *Proxier) writeIptablesRules() { writeLine(proxier.filterRules, "-A", string(KubeForwardChain), "-m", "comment", "--comment", `"kubernetes forwarding rules"`, - "-m", "mark", "--mark", proxier.masqueradeMark, + "-m", "mark", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark), "-j", "ACCEPT", ) @@ -1842,10 +1842,20 @@ func (proxier *Proxier) createAndLinkeKubeChain() { // this so that it is easier to flush and change, for example if the mark // value should ever change. // NB: THIS MUST MATCH the corresponding code in the kubelet + writeLine(proxier.natRules, []string{ + "-A", string(kubePostroutingChain), + "-m", "mark", "!", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark), + "-j", "RETURN", + }...) + // Clear the mark to avoid re-masquerading if the packet re-traverses the network stack. + writeLine(proxier.natRules, []string{ + "-A", string(kubePostroutingChain), + // XOR proxier.masqueradeMark to unset it + "-j", "MARK", "--xor-mark", proxier.masqueradeMark, + }...) masqRule := []string{ "-A", string(kubePostroutingChain), "-m", "comment", "--comment", `"kubernetes service traffic requiring SNAT"`, - "-m", "mark", "--mark", proxier.masqueradeMark, "-j", "MASQUERADE", } if proxier.iptables.HasRandomFully() { @@ -1858,7 +1868,7 @@ func (proxier *Proxier) createAndLinkeKubeChain() { // value should ever change. writeLine(proxier.natRules, []string{ "-A", string(KubeMarkMasqChain), - "-j", "MARK", "--set-xmark", proxier.masqueradeMark, + "-j", "MARK", "--or-mark", proxier.masqueradeMark, }...) }