Merge pull request #122296 from tnqn/nftables-kernel-requirement

kube-proxy: change implementation of LoadBalancerSourceRanges for wider kernel support
Kubernetes Prow Robot 2024-01-08 17:30:27 +01:00 committed by GitHub
commit f538feed8c
3 changed files with 96 additions and 85 deletions


@@ -210,6 +210,17 @@ func (tracer *nftablesTracer) addressMatches(ipStr, not, ruleAddress string) bool {
 	}
 }
 
+func (tracer *nftablesTracer) noneAddressesMatch(ipStr, ruleAddress string) bool {
+	ruleAddress = strings.ReplaceAll(ruleAddress, " ", "")
+	addresses := strings.Split(ruleAddress, ",")
+	for _, address := range addresses {
+		if tracer.addressMatches(ipStr, "", address) {
+			return false
+		}
+	}
+	return true
+}
+
 // matchDestIPOnly checks an "ip daddr" against a set/map, and returns the matching
 // Element, if found.
 func (tracer *nftablesTracer) matchDestIPOnly(elements []*knftables.Element, destIP string) *knftables.Element {
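
Note on the new helper: noneAddressesMatch returns true only when ipStr matches none of the comma-separated elements, which is exactly how nftables evaluates a negated anonymous set such as "ip saddr != { ... }". A minimal standalone sketch of the same semantics using net/netip instead of the tracer's addressMatches (the names and main function here are illustrative, not part of this PR):

package main

import (
	"fmt"
	"net/netip"
	"strings"
)

// noneMatch reports whether ip matches none of the comma-separated
// addresses/CIDRs, i.e. the nftables "saddr != { ... }" semantics.
func noneMatch(ip, ruleAddress string) bool {
	addr := netip.MustParseAddr(ip)
	for _, a := range strings.Split(strings.ReplaceAll(ruleAddress, " ", ""), ",") {
		if strings.Contains(a, "/") {
			if netip.MustParsePrefix(a).Contains(addr) {
				return false
			}
		} else if netip.MustParseAddr(a) == addr {
			return false
		}
	}
	return true
}

func main() {
	fmt.Println(noneMatch("10.1.2.3", "203.0.113.0/25, 5.6.7.8")) // true
	fmt.Println(noneMatch("5.6.7.8", "203.0.113.0/25, 5.6.7.8"))  // false
}

Splitting on "," after stripping spaces matches how the tracer receives the set contents from the regexp captures added below.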
@@ -267,6 +278,7 @@ func (tracer *nftablesTracer) matchDestPort(elements []*knftables.Element, proto
 // match verdictRegexp.
 var destAddrRegexp = regexp.MustCompile(`^ip6* daddr (!= )?(\S+)`)
+var destAddrLookupRegexp = regexp.MustCompile(`^ip6* daddr != \{([^}]*)\}`)
 var destAddrLocalRegexp = regexp.MustCompile(`^fib daddr type local`)
 var destPortRegexp = regexp.MustCompile(`^(tcp|udp|sctp) dport (\d+)`)
 var destIPOnlyLookupRegexp = regexp.MustCompile(`^ip6* daddr @(\S+)`)
@@ -278,6 +290,7 @@ var destDispatchRegexp = regexp.MustCompile(`^ip6* daddr \. meta l4proto \. th d
 var destPortDispatchRegexp = regexp.MustCompile(`^meta l4proto \. th dport vmap @(\S+)$`)
 var sourceAddrRegexp = regexp.MustCompile(`^ip6* saddr (!= )?(\S+)`)
+var sourceAddrLookupRegexp = regexp.MustCompile(`^ip6* saddr != \{([^}]*)\}`)
 var sourceAddrLocalRegexp = regexp.MustCompile(`^fib saddr type local`)
 var endpointVMAPRegexp = regexp.MustCompile(`^numgen random mod \d+ vmap \{(.*)\}$`)
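
The two new lookup regexps capture the raw contents of a negated anonymous set so they can be handed to noneAddressesMatch. A quick standalone check of what the source-address pattern captures (hypothetical demo program, not part of the PR):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Same pattern as the new sourceAddrLookupRegexp above.
	sourceAddrLookup := regexp.MustCompile(`^ip6* saddr != \{([^}]*)\}`)
	rule := "ip saddr != { 203.0.113.0/25, 5.6.7.8 } drop"
	m := sourceAddrLookup.FindStringSubmatch(rule)
	fmt.Printf("matched %q, set contents %q\n", m[0], m[1])
	// matched "ip saddr != { 203.0.113.0/25, 5.6.7.8 }", set contents " 203.0.113.0/25, 5.6.7.8 "
}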
@@ -400,6 +413,16 @@ func (tracer *nftablesTracer) runChain(chname, sourceIP, protocol, destIP, destP
 				rule = element.Value[0]
 			}
 
+		case destAddrLookupRegexp.MatchString(rule):
+			// `^ip6* daddr != \{([^}]*)\}`
+			// Tests whether destIP doesn't match an anonymous set.
+			match := destAddrLookupRegexp.FindStringSubmatch(rule)
+			rule = strings.TrimPrefix(rule, match[0])
+			if !tracer.noneAddressesMatch(destIP, match[1]) {
+				rule = ""
+				break
+			}
+
 		case destAddrRegexp.MatchString(rule):
 			// `^ip6* daddr (!= )?(\S+)`
 			// Tests whether destIP does/doesn't match a literal.
@@ -432,6 +455,16 @@ func (tracer *nftablesTracer) runChain(chname, sourceIP, protocol, destIP, destP
 					break
 				}
 
+		case sourceAddrLookupRegexp.MatchString(rule):
+			// `^ip6* saddr != \{([^}]*)\}`
+			// Tests whether sourceIP doesn't match an anonymous set.
+			match := sourceAddrLookupRegexp.FindStringSubmatch(rule)
+			rule = strings.TrimPrefix(rule, match[0])
+			if !tracer.noneAddressesMatch(sourceIP, match[1]) {
+				rule = ""
+				break
+			}
+
 		case sourceAddrRegexp.MatchString(rule):
 			// `^ip6* saddr (!= )?(\S+)`
 			// Tests whether sourceIP does/doesn't match a literal.
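
Both new cases follow the tracer's usual pattern: strip the matched predicate off the front of the rule string, then clear the rule entirely when the negated-set test fails (i.e. when the packet's address is inside the set). A condensed, self-contained sketch of that flow, under the same illustrative assumptions as the snippets above:

package main

import (
	"fmt"
	"net/netip"
	"regexp"
	"strings"
)

var sourceAddrLookup = regexp.MustCompile(`^ip6* saddr != \{([^}]*)\}`)

// inSet reports whether ip is covered by any element of the anonymous set.
func inSet(ip, set string) bool {
	addr := netip.MustParseAddr(ip)
	for _, a := range strings.Split(strings.ReplaceAll(set, " ", ""), ",") {
		if strings.Contains(a, "/") && netip.MustParsePrefix(a).Contains(addr) {
			return true
		}
		if !strings.Contains(a, "/") && netip.MustParseAddr(a) == addr {
			return true
		}
	}
	return false
}

// applies mimics the tracer's new case: trim the matched predicate, then
// decide whether the rest of the rule still applies to this packet.
func applies(rule, sourceIP string) (rest string, ok bool) {
	m := sourceAddrLookup.FindStringSubmatch(rule)
	if m == nil {
		return rule, false
	}
	rest = strings.TrimPrefix(rule, m[0])
	return rest, !inSet(sourceIP, m[1]) // "!=" succeeds only on a set miss
}

func main() {
	rule := "ip saddr != { 203.0.113.0/25 } drop"
	fmt.Println(applies(rule, "10.0.0.1"))     //  drop true  -> rule applies, packet dropped
	fmt.Println(applies(rule, "203.0.113.10")) //  drop false -> allowed source, rule skipped
}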


@@ -79,10 +79,8 @@ const (
 	kubeRejectChain = "reject-chain"
 
 	// LoadBalancerSourceRanges handling
-	kubeFirewallSet             = "firewall"
-	kubeFirewallCheckChain      = "firewall-check"
-	kubeFirewallAllowSet        = "firewall-allow"
-	kubeFirewallAllowCheckChain = "firewall-allow-check"
+	kubeFirewallIPsMap     = "firewall-ips"
+	kubeFirewallCheckChain = "firewall-check"
 
 	// masquerading
 	kubeMarkMasqChain = "mark-for-masquerade"
@@ -102,6 +100,7 @@ type servicePortInfo struct {
 	clusterPolicyChainName string
 	localPolicyChainName   string
 	externalChainName      string
+	firewallChainName      string
 }
 
 // returns a new proxy.ServicePort which abstracts a serviceInfo
@@ -117,6 +116,7 @@ func newServiceInfo(port *v1.ServicePort, service *v1.Service, bsvcPortInfo *pro
 	svcPort.clusterPolicyChainName = servicePortPolicyClusterChainNamePrefix + chainNameBase
 	svcPort.localPolicyChainName = servicePortPolicyLocalChainNamePrefix + chainNameBase
 	svcPort.externalChainName = serviceExternalChainNamePrefix + chainNameBase
+	svcPort.firewallChainName = servicePortFirewallChainNamePrefix + chainNameBase
 
 	return svcPort
 }
@@ -546,38 +546,20 @@ func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) {
 	}
 
 	// Set up LoadBalancerSourceRanges firewalling
-	tx.Add(&knftables.Set{
-		Name:    kubeFirewallSet,
-		Type:    ipvX_addr + " . inet_proto . inet_service",
+	tx.Add(&knftables.Map{
+		Name:    kubeFirewallIPsMap,
+		Type:    ipvX_addr + " . inet_proto . inet_service : verdict",
 		Comment: ptr.To("destinations that are subject to LoadBalancerSourceRanges"),
 	})
-	tx.Add(&knftables.Set{
-		Name:    kubeFirewallAllowSet,
-		Type:    ipvX_addr + " . inet_proto . inet_service . " + ipvX_addr,
-		Flags:   []knftables.SetFlag{knftables.IntervalFlag},
-		Comment: ptr.To("destinations+sources that are allowed by LoadBalancerSourceRanges"),
-	})
 
 	ensureChain(kubeFirewallCheckChain, tx, createdChains)
-	ensureChain(kubeFirewallAllowCheckChain, tx, createdChains)
 	tx.Add(&knftables.Rule{
 		Chain: kubeFirewallCheckChain,
 		Rule: knftables.Concat(
-			ipX, "daddr", ".", "meta l4proto", ".", "th dport", "@", kubeFirewallSet,
-			"jump", kubeFirewallAllowCheckChain,
+			ipX, "daddr", ".", "meta l4proto", ".", "th dport",
+			"vmap", "@", kubeFirewallIPsMap,
 		),
 	})
-	tx.Add(&knftables.Rule{
-		Chain: kubeFirewallAllowCheckChain,
-		Rule: knftables.Concat(
-			ipX, "daddr", ".", "meta l4proto", ".", "th dport", ".", ipX, "saddr", "@", kubeFirewallAllowSet,
-			"return",
-		),
-	})
-	tx.Add(&knftables.Rule{
-		Chain: kubeFirewallAllowCheckChain,
-		Rule:  "drop",
-	})
 
 	// Set up service dispatch
 	tx.Add(&knftables.Map{
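
The net effect (visible in the updated test fixtures below) is that two sets and a two-chain jump collapse into one verdict map and a single dispatch rule. Note that the removed firewall-allow set combined flags interval with a four-way concatenated type; per the PR title, avoiding that construct is what widens kernel support, though the exact kernel threshold is not stated in this diff. A hedged sketch of what the new Concat call renders to, assuming sigs.k8s.io/knftables produces the rule text these fixtures show:

package main

import (
	"fmt"

	"sigs.k8s.io/knftables"
)

func main() {
	// For the IPv4 family (ipX = "ip"), the new firewall-check rule is a
	// single vmap lookup against the firewall-ips verdict map.
	rule := knftables.Concat(
		"ip", "daddr", ".", "meta l4proto", ".", "th dport",
		"vmap", "@", "firewall-ips",
	)
	fmt.Println(rule)
	// Expected, per the test fixtures below:
	// ip daddr . meta l4proto . th dport vmap @firewall-ips
}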
@@ -827,6 +809,7 @@ const (
 	serviceExternalChainNamePrefix        = "external-"
 	servicePortEndpointChainNamePrefix    = "endpoint-"
 	servicePortEndpointAffinityNamePrefix = "affinity-"
+	servicePortFirewallChainNamePrefix    = "firewall-"
 )
 
 // hashAndTruncate prefixes name with a hash of itself and then truncates to
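
Together with the firewallChainName field added above, each service port now derives a dedicated firewall chain from this prefix; the test output below shows the resulting name for svc5. A trivial illustration (the chainNameBase value is copied from the svc5 fixture, not computed here):

package main

import "fmt"

func main() {
	const servicePortFirewallChainNamePrefix = "firewall-"
	// In the proxier, chainNameBase is a hash-prefixed
	// "<ns>/<svc>/<proto>/<port>" string; this value is from the fixtures.
	chainNameBase := "HVFWP5L3-ns5/svc5/tcp/p80"
	fmt.Println(servicePortFirewallChainNamePrefix + chainNameBase)
	// firewall-HVFWP5L3-ns5/svc5/tcp/p80
}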
@@ -1001,11 +984,8 @@ func (proxier *Proxier) syncProxyRules() {
 	}
 
 	// We currently fully-rebuild our sets and maps on each resync
-	tx.Flush(&knftables.Set{
-		Name: kubeFirewallSet,
-	})
-	tx.Flush(&knftables.Set{
-		Name: kubeFirewallAllowSet,
+	tx.Flush(&knftables.Map{
+		Name: kubeFirewallIPsMap,
 	})
 	tx.Flush(&knftables.Map{
 		Name: kubeNoEndpointServicesMap,
@@ -1208,6 +1188,44 @@
 			}
 		}
 
+		usesFWChain := len(svcInfo.LoadBalancerVIPStrings()) > 0 && len(svcInfo.LoadBalancerSourceRanges()) > 0
+		fwChain := svcInfo.firewallChainName
+		if usesFWChain {
+			ensureChain(fwChain, tx, activeChains)
+			var sources []string
+			allowFromNode := false
+			for _, src := range svcInfo.LoadBalancerSourceRanges() {
+				_, cidr, _ := netutils.ParseCIDRSloppy(src)
+				if cidr == nil {
+					continue
+				}
+				if len(sources) > 0 {
+					sources = append(sources, ",")
+				}
+				sources = append(sources, src)
+				if cidr.Contains(proxier.nodeIP) {
+					allowFromNode = true
+				}
+			}
+			// For VIP-like LBs, the VIP is often added as a local
+			// address (via an IP route rule). In that case, a request
+			// from a node to the VIP will not hit the loadbalancer but
+			// will loop back with the source IP set to the VIP. We
+			// need the following rules to allow requests from this node.
+			if allowFromNode {
+				for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
+					sources = append(sources, ",", lbip)
+				}
+			}
+			tx.Add(&knftables.Rule{
+				Chain: fwChain,
+				Rule: knftables.Concat(
+					ipX, "saddr", "!=", "{", sources, "}",
+					"drop",
+				),
+			})
+		}
+
 		// Capture load-balancer ingress.
 		for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
 			if hasEndpoints {
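
The sources slice interleaves its entries with "," separators so that Concat can splice it straight into the anonymous set between "{" and "}". The node-IP special case is worth spelling out: if the node's own IP falls inside an allowed range, the LB VIPs are appended as additional allowed sources, because hairpinned node traffic arrives with the VIP as its source address. A self-contained sketch of just that decision, using the same k8s.io/utils/net helpers (the nodeIP and ranges are hypothetical):

package main

import (
	"fmt"

	netutils "k8s.io/utils/net"
)

func main() {
	nodeIP := netutils.ParseIPSloppy("203.0.113.7") // hypothetical node IP
	ranges := []string{"203.0.113.0/25"}
	lbVIPs := []string{"5.6.7.8"}

	var sources []string
	allowFromNode := false
	for _, src := range ranges {
		_, cidr, _ := netutils.ParseCIDRSloppy(src)
		if cidr == nil {
			continue // skip unparseable ranges, as the proxier does
		}
		if len(sources) > 0 {
			sources = append(sources, ",")
		}
		sources = append(sources, src)
		if cidr.Contains(nodeIP) {
			allowFromNode = true
		}
	}
	// Hairpin case: node-originated traffic to the VIP loops back with the
	// VIP as source, so the VIPs themselves must be allowed through.
	if allowFromNode {
		for _, lbip := range lbVIPs {
			sources = append(sources, ",", lbip)
		}
	}
	fmt.Println(sources) // [203.0.113.0/25 , 5.6.7.8]
}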
@@ -1224,53 +1242,19 @@
 				})
 			}
 
-			if len(svcInfo.LoadBalancerSourceRanges()) > 0 {
+			if usesFWChain {
 				tx.Add(&knftables.Element{
-					Set: kubeFirewallSet,
+					Map: kubeFirewallIPsMap,
 					Key: []string{
 						lbip,
 						protocol,
 						strconv.Itoa(svcInfo.Port()),
 					},
+					Value: []string{
+						fmt.Sprintf("goto %s", fwChain),
+					},
 					Comment: &svcPortNameString,
 				})
-				allowFromNode := false
-				for _, src := range svcInfo.LoadBalancerSourceRanges() {
-					_, cidr, _ := netutils.ParseCIDRSloppy(src)
-					if cidr == nil {
-						continue
-					}
-					tx.Add(&knftables.Element{
-						Set: kubeFirewallAllowSet,
-						Key: []string{
-							lbip,
-							protocol,
-							strconv.Itoa(svcInfo.Port()),
-							src,
-						},
-						Comment: &svcPortNameString,
-					})
-					if cidr.Contains(proxier.nodeIP) {
-						allowFromNode = true
-					}
-				}
-				// For VIP-like LBs, the VIP is often added as a local
-				// address (via an IP route rule). In that case, a request
-				// from a node to the VIP will not hit the loadbalancer but
-				// will loop back with the source IP set to the VIP. We
-				// need the following rules to allow requests from this node.
-				if allowFromNode {
-					tx.Add(&knftables.Element{
-						Set: kubeFirewallAllowSet,
-						Key: []string{
-							lbip,
-							protocol,
-							strconv.Itoa(svcInfo.Port()),
-							lbip,
-						},
-					})
-				}
 			}
 		}
 
 		if !hasExternalEndpoints {
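
Each covered LoadBalancer IP/protocol/port tuple now maps to a goto verdict into that service's firewall chain, instead of populating two sets. A sketch of the resulting transaction against knftables' in-memory fake, assuming the fake API (NewFake, Run, Dump) behaves as it does in these tests; the setup is simplified and the element values are copied from the svc5 fixture below:

package main

import (
	"context"
	"fmt"

	"sigs.k8s.io/knftables"
)

func main() {
	nft := knftables.NewFake(knftables.IPv4Family, "kube-proxy")
	tx := nft.NewTransaction()
	tx.Add(&knftables.Table{})
	tx.Add(&knftables.Map{
		Name: "firewall-ips",
		Type: "ipv4_addr . inet_proto . inet_service : verdict",
	})
	tx.Add(&knftables.Chain{Name: "firewall-HVFWP5L3-ns5/svc5/tcp/p80"})
	// One element per LB IP/protocol/port tuple, dispatching to the
	// per-service firewall chain.
	tx.Add(&knftables.Element{
		Map:   "firewall-ips",
		Key:   []string{"5.6.7.8", "tcp", "80"},
		Value: []string{"goto firewall-HVFWP5L3-ns5/svc5/tcp/p80"},
	})
	if err := nft.Run(context.Background(), tx); err != nil {
		panic(err)
	}
	fmt.Println(nft.Dump())
}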


@@ -524,13 +524,9 @@ func TestOverallNFTablesRules(t *testing.T) {
 		add chain ip kube-proxy nat-prerouting { type nat hook prerouting priority -100 ; }
 		add rule ip kube-proxy nat-prerouting jump services
 
-		add set ip kube-proxy firewall { type ipv4_addr . inet_proto . inet_service ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
-		add set ip kube-proxy firewall-allow { type ipv4_addr . inet_proto . inet_service . ipv4_addr ; flags interval ; comment "destinations+sources that are allowed by LoadBalancerSourceRanges" ; }
+		add map ip kube-proxy firewall-ips { type ipv4_addr . inet_proto . inet_service : verdict ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
 		add chain ip kube-proxy firewall-check
-		add chain ip kube-proxy firewall-allow-check
-		add rule ip kube-proxy firewall-allow-check ip daddr . meta l4proto . th dport . ip saddr @firewall-allow return
-		add rule ip kube-proxy firewall-allow-check drop
-		add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport @firewall jump firewall-allow-check
+		add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport vmap @firewall-ips
 
 		add chain ip kube-proxy reject-chain { comment "helper for @no-endpoint-services / @no-endpoint-nodeports" ; }
 		add rule ip kube-proxy reject-chain reject
@@ -625,11 +621,13 @@ func TestOverallNFTablesRules(t *testing.T) {
 		add rule ip kube-proxy endpoint-GTK6MW7G-ns5/svc5/tcp/p80__10.180.0.3/80 update @affinity-GTK6MW7G-ns5/svc5/tcp/p80__10.180.0.3/80 { ip saddr }
 		add rule ip kube-proxy endpoint-GTK6MW7G-ns5/svc5/tcp/p80__10.180.0.3/80 meta l4proto tcp dnat to 10.180.0.3:80
 
+		add chain ip kube-proxy firewall-HVFWP5L3-ns5/svc5/tcp/p80
+		add rule ip kube-proxy firewall-HVFWP5L3-ns5/svc5/tcp/p80 ip saddr != { 203.0.113.0/25 } drop
+
 		add element ip kube-proxy service-ips { 172.30.0.45 . tcp . 80 : goto service-HVFWP5L3-ns5/svc5/tcp/p80 }
 		add element ip kube-proxy service-ips { 5.6.7.8 . tcp . 80 : goto external-HVFWP5L3-ns5/svc5/tcp/p80 }
 		add element ip kube-proxy service-nodeports { tcp . 3002 : goto external-HVFWP5L3-ns5/svc5/tcp/p80 }
-		add element ip kube-proxy firewall { 5.6.7.8 . tcp . 80 comment "ns5/svc5:p80" }
-		add element ip kube-proxy firewall-allow { 5.6.7.8 . tcp . 80 . 203.0.113.0/25 comment "ns5/svc5:p80" }
+		add element ip kube-proxy firewall-ips { 5.6.7.8 . tcp . 80 comment "ns5/svc5:p80" : goto firewall-HVFWP5L3-ns5/svc5/tcp/p80 }
 
 		# svc6
 		add element ip kube-proxy no-endpoint-services { 172.30.0.46 . tcp . 80 comment "ns6/svc6:p80" : goto reject-chain }
@@ -4267,7 +4265,6 @@ func TestSyncProxyRulesRepeated(t *testing.T) {
 		add chain ip kube-proxy filter-forward { type filter hook forward priority -101 ; }
 		add chain ip kube-proxy filter-input { type filter hook input priority -101 ; }
 		add chain ip kube-proxy filter-output { type filter hook output priority -101 ; }
-		add chain ip kube-proxy firewall-allow-check
 		add chain ip kube-proxy firewall-check
 		add chain ip kube-proxy forward
 		add chain ip kube-proxy mark-for-masquerade
@@ -4287,9 +4284,7 @@ func TestSyncProxyRulesRepeated(t *testing.T) {
 		add rule ip kube-proxy filter-input ct state new jump firewall-check
 		add rule ip kube-proxy filter-output ct state new jump endpoints-check
 		add rule ip kube-proxy filter-output ct state new jump firewall-check
-		add rule ip kube-proxy firewall-allow-check ip daddr . meta l4proto . th dport . ip saddr @firewall-allow return
-		add rule ip kube-proxy firewall-allow-check drop
-		add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport @firewall jump firewall-allow-check
+		add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport vmap @firewall-ips
 		add rule ip kube-proxy forward ct state invalid drop
 		add rule ip kube-proxy mark-for-masquerade mark set mark or 0x4000
 		add rule ip kube-proxy masquerading mark and 0x4000 == 0 return
@@ -4302,8 +4297,7 @@ func TestSyncProxyRulesRepeated(t *testing.T) {
 		add rule ip kube-proxy services ip daddr . meta l4proto . th dport vmap @service-ips
 		add rule ip kube-proxy services fib daddr type local ip daddr != 127.0.0.0/8 meta l4proto . th dport vmap @service-nodeports
 
-		add set ip kube-proxy firewall { type ipv4_addr . inet_proto . inet_service ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
-		add set ip kube-proxy firewall-allow { type ipv4_addr . inet_proto . inet_service . ipv4_addr ; flags interval ; comment "destinations+sources that are allowed by LoadBalancerSourceRanges" ; }
+		add map ip kube-proxy firewall-ips { type ipv4_addr . inet_proto . inet_service : verdict ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
 		add map ip kube-proxy no-endpoint-nodeports { type inet_proto . inet_service : verdict ; comment "vmap to drop or reject packets to service nodeports with no endpoints" ; }
 		add map ip kube-proxy no-endpoint-services { type ipv4_addr . inet_proto . inet_service : verdict ; comment "vmap to drop or reject packets to services with no endpoints" ; }
 		add map ip kube-proxy service-ips { type ipv4_addr . inet_proto . inet_service : verdict ; comment "ClusterIP, ExternalIP and LoadBalancer IP traffic" ; }