kube-proxy: change implementation of LoadBalancerSourceRanges for wider kernel support

The nftables implementation made use of concatenation of ranges when
creating the set "firewall-allow", but the support was not available
before kernel 5.6. Therefore, nftables mode couldn't run on earlier
kernels, while 5.4 is still widely used.

An alternative to concatenation of ranges is to create a separate
firewall chain for every service port that needs firewalling, and jump
to the service's firewall chain from the common firewall chain via a
rule with vmap.

Renaming "firewall" to "firewall-ips" is required when changing the set
to a map so that existing clusters can upgrade; otherwise it would fail
to create the map. Besides, "firewall-ips" corresponds to the
"service-ips" map, and later we can add "firewall-nodeports" if it's
determined that NodePort traffic should be subject to
LoadBalancerSourceRanges.

Signed-off-by: Quan Tian <qtian@vmware.com>
This commit is contained in:
Quan Tian 2023-12-11 17:38:11 +08:00
parent 40c729c680
commit 377f521038
3 changed files with 96 additions and 85 deletions

View File

@ -207,6 +207,17 @@ func (tracer *nftablesTracer) addressMatches(ipStr, not, ruleAddress string) boo
}
}
// noneAddressesMatch reports whether ipStr matches none of the addresses
// listed in ruleAddress. ruleAddress is a comma-separated list (the contents
// of an anonymous set); all spaces are removed before it is split. Each entry
// is tested with addressMatches using an empty "not" argument, and the first
// entry that matches makes the result false.
func (tracer *nftablesTracer) noneAddressesMatch(ipStr, ruleAddress string) bool {
	compact := strings.ReplaceAll(ruleAddress, " ", "")
	for _, candidate := range strings.Split(compact, ",") {
		if tracer.addressMatches(ipStr, "", candidate) {
			// One hit is enough: the "none match" condition fails.
			return false
		}
	}
	return true
}
// matchDestIPOnly checks an "ip daddr" against a set/map, and returns the matching
// Element, if found.
func (tracer *nftablesTracer) matchDestIPOnly(elements []*knftables.Element, destIP string) *knftables.Element {
@ -264,6 +275,7 @@ func (tracer *nftablesTracer) matchDestPort(elements []*knftables.Element, proto
// match verdictRegexp.
var destAddrRegexp = regexp.MustCompile(`^ip6* daddr (!= )?(\S+)`)

// destAddrLookupRegexp matches a negated destination-address test against an
// anonymous set, e.g. "ip daddr != { a, b }". The capture group holds the
// text between the braces.
var destAddrLookupRegexp = regexp.MustCompile(`^ip6* daddr != \{([^}]*)\}`)

// destAddrLocalRegexp matches the literal prefix "fib daddr type local".
var destAddrLocalRegexp = regexp.MustCompile(`^fib daddr type local`)

// destPortRegexp matches "<tcp|udp|sctp> dport <digits>", capturing the
// protocol and the port number.
var destPortRegexp = regexp.MustCompile(`^(tcp|udp|sctp) dport (\d+)`)

// destIPOnlyLookupRegexp matches "ip daddr @<name>" (or "ip6 daddr @<name>"),
// capturing the name that follows the "@".
var destIPOnlyLookupRegexp = regexp.MustCompile(`^ip6* daddr @(\S+)`)
@ -275,6 +287,7 @@ var destDispatchRegexp = regexp.MustCompile(`^ip6* daddr \. meta l4proto \. th d
// destPortDispatchRegexp matches "meta l4proto . th dport vmap @<name>" and is
// anchored at both ends, so nothing may follow the name. The capture group is
// the name after the "@".
var destPortDispatchRegexp = regexp.MustCompile(`^meta l4proto \. th dport vmap @(\S+)$`)

// sourceAddrRegexp matches "ip saddr [!= ]<token>" (or "ip6 saddr ..."),
// capturing the optional "!= " and the following non-space token.
var sourceAddrRegexp = regexp.MustCompile(`^ip6* saddr (!= )?(\S+)`)

// sourceAddrLookupRegexp matches a negated source-address test against an
// anonymous set, e.g. "ip saddr != { a, b }". The capture group holds the
// text between the braces.
var sourceAddrLookupRegexp = regexp.MustCompile(`^ip6* saddr != \{([^}]*)\}`)

// sourceAddrLocalRegexp matches the literal prefix "fib saddr type local".
var sourceAddrLocalRegexp = regexp.MustCompile(`^fib saddr type local`)

// endpointVMAPRegexp matches "numgen random mod <digits> vmap { ... }" through
// to the end of the string, capturing the text between the braces.
var endpointVMAPRegexp = regexp.MustCompile(`^numgen random mod \d+ vmap \{(.*)\}$`)
@ -397,6 +410,16 @@ func (tracer *nftablesTracer) runChain(chname, sourceIP, protocol, destIP, destP
rule = element.Value[0]
}
case destAddrLookupRegexp.MatchString(rule):
// `^ip6* daddr != \{([^}]*)\}`
// Tests whether destIP doesn't match an anonymous set.
match := destAddrLookupRegexp.FindStringSubmatch(rule)
rule = strings.TrimPrefix(rule, match[0])
if !tracer.noneAddressesMatch(destIP, match[1]) {
rule = ""
break
}
case destAddrRegexp.MatchString(rule):
// `^ip6* daddr (!= )?(\S+)`
// Tests whether destIP does/doesn't match a literal.
@ -429,6 +452,16 @@ func (tracer *nftablesTracer) runChain(chname, sourceIP, protocol, destIP, destP
break
}
case sourceAddrLookupRegexp.MatchString(rule):
// `^ip6* saddr != \{([^}]*)\}`
// Tests whether sourceIP doesn't match an anonymous set.
match := sourceAddrLookupRegexp.FindStringSubmatch(rule)
rule = strings.TrimPrefix(rule, match[0])
if !tracer.noneAddressesMatch(sourceIP, match[1]) {
rule = ""
break
}
case sourceAddrRegexp.MatchString(rule):
// `^ip6* saddr (!= )?(\S+)`
// Tests whether sourceIP does/doesn't match a literal.

View File

@ -76,10 +76,8 @@ const (
kubeRejectChain = "reject-chain"
// LoadBalancerSourceRanges handling
kubeFirewallSet = "firewall"
kubeFirewallCheckChain = "firewall-check"
kubeFirewallAllowSet = "firewall-allow"
kubeFirewallAllowCheckChain = "firewall-allow-check"
kubeFirewallIPsMap = "firewall-ips"
kubeFirewallCheckChain = "firewall-check"
// masquerading
kubeMarkMasqChain = "mark-for-masquerade"
@ -99,6 +97,7 @@ type servicePortInfo struct {
clusterPolicyChainName string
localPolicyChainName string
externalChainName string
firewallChainName string
}
// returns a new proxy.ServicePort which abstracts a serviceInfo
@ -114,6 +113,7 @@ func newServiceInfo(port *v1.ServicePort, service *v1.Service, bsvcPortInfo *pro
svcPort.clusterPolicyChainName = servicePortPolicyClusterChainNamePrefix + chainNameBase
svcPort.localPolicyChainName = servicePortPolicyLocalChainNamePrefix + chainNameBase
svcPort.externalChainName = serviceExternalChainNamePrefix + chainNameBase
svcPort.firewallChainName = servicePortFirewallChainNamePrefix + chainNameBase
return svcPort
}
@ -543,38 +543,20 @@ func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) {
}
// Set up LoadBalancerSourceRanges firewalling
tx.Add(&knftables.Set{
Name: kubeFirewallSet,
Type: ipvX_addr + " . inet_proto . inet_service",
tx.Add(&knftables.Map{
Name: kubeFirewallIPsMap,
Type: ipvX_addr + " . inet_proto . inet_service : verdict",
Comment: ptr.To("destinations that are subject to LoadBalancerSourceRanges"),
})
tx.Add(&knftables.Set{
Name: kubeFirewallAllowSet,
Type: ipvX_addr + " . inet_proto . inet_service . " + ipvX_addr,
Flags: []knftables.SetFlag{knftables.IntervalFlag},
Comment: ptr.To("destinations+sources that are allowed by LoadBalancerSourceRanges"),
})
ensureChain(kubeFirewallCheckChain, tx, createdChains)
ensureChain(kubeFirewallAllowCheckChain, tx, createdChains)
tx.Add(&knftables.Rule{
Chain: kubeFirewallCheckChain,
Rule: knftables.Concat(
ipX, "daddr", ".", "meta l4proto", ".", "th dport", "@", kubeFirewallSet,
"jump", kubeFirewallAllowCheckChain,
ipX, "daddr", ".", "meta l4proto", ".", "th dport",
"vmap", "@", kubeFirewallIPsMap,
),
})
tx.Add(&knftables.Rule{
Chain: kubeFirewallAllowCheckChain,
Rule: knftables.Concat(
ipX, "daddr", ".", "meta l4proto", ".", "th dport", ".", ipX, "saddr", "@", kubeFirewallAllowSet,
"return",
),
})
tx.Add(&knftables.Rule{
Chain: kubeFirewallAllowCheckChain,
Rule: "drop",
})
// Set up service dispatch
tx.Add(&knftables.Map{
@ -824,6 +806,7 @@ const (
serviceExternalChainNamePrefix = "external-"
servicePortEndpointChainNamePrefix = "endpoint-"
servicePortEndpointAffinityNamePrefix = "affinity-"
servicePortFirewallChainNamePrefix = "firewall-"
)
// hashAndTruncate prefixes name with a hash of itself and then truncates to
@ -998,11 +981,8 @@ func (proxier *Proxier) syncProxyRules() {
}
// We currently fully-rebuild our sets and maps on each resync
tx.Flush(&knftables.Set{
Name: kubeFirewallSet,
})
tx.Flush(&knftables.Set{
Name: kubeFirewallAllowSet,
tx.Flush(&knftables.Map{
Name: kubeFirewallIPsMap,
})
tx.Flush(&knftables.Map{
Name: kubeNoEndpointServicesMap,
@ -1205,6 +1185,44 @@ func (proxier *Proxier) syncProxyRules() {
}
}
usesFWChain := len(svcInfo.LoadBalancerVIPStrings()) > 0 && len(svcInfo.LoadBalancerSourceRanges()) > 0
fwChain := svcInfo.firewallChainName
if usesFWChain {
ensureChain(fwChain, tx, activeChains)
var sources []string
allowFromNode := false
for _, src := range svcInfo.LoadBalancerSourceRanges() {
_, cidr, _ := netutils.ParseCIDRSloppy(src)
if cidr == nil {
continue
}
if len(sources) > 0 {
sources = append(sources, ",")
}
sources = append(sources, src)
if cidr.Contains(proxier.nodeIP) {
allowFromNode = true
}
}
// For VIP-like LBs, the VIP is often added as a local
// address (via an IP route rule). In that case, a request
// from a node to the VIP will not hit the loadbalancer but
// will loop back with the source IP set to the VIP. We
// need the following rules to allow requests from this node.
if allowFromNode {
for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
sources = append(sources, ",", lbip)
}
}
tx.Add(&knftables.Rule{
Chain: fwChain,
Rule: knftables.Concat(
ipX, "saddr", "!=", "{", sources, "}",
"drop",
),
})
}
// Capture load-balancer ingress.
for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
if hasEndpoints {
@ -1221,53 +1239,19 @@ func (proxier *Proxier) syncProxyRules() {
})
}
if len(svcInfo.LoadBalancerSourceRanges()) > 0 {
if usesFWChain {
tx.Add(&knftables.Element{
Set: kubeFirewallSet,
Map: kubeFirewallIPsMap,
Key: []string{
lbip,
protocol,
strconv.Itoa(svcInfo.Port()),
},
Value: []string{
fmt.Sprintf("goto %s", fwChain),
},
Comment: &svcPortNameString,
})
allowFromNode := false
for _, src := range svcInfo.LoadBalancerSourceRanges() {
_, cidr, _ := netutils.ParseCIDRSloppy(src)
if cidr == nil {
continue
}
tx.Add(&knftables.Element{
Set: kubeFirewallAllowSet,
Key: []string{
lbip,
protocol,
strconv.Itoa(svcInfo.Port()),
src,
},
Comment: &svcPortNameString,
})
if cidr.Contains(proxier.nodeIP) {
allowFromNode = true
}
}
// For VIP-like LBs, the VIP is often added as a local
// address (via an IP route rule). In that case, a request
// from a node to the VIP will not hit the loadbalancer but
// will loop back with the source IP set to the VIP. We
// need the following rules to allow requests from this node.
if allowFromNode {
tx.Add(&knftables.Element{
Set: kubeFirewallAllowSet,
Key: []string{
lbip,
protocol,
strconv.Itoa(svcInfo.Port()),
lbip,
},
})
}
}
}
if !hasExternalEndpoints {

View File

@ -521,13 +521,9 @@ func TestOverallNFTablesRules(t *testing.T) {
add chain ip kube-proxy nat-prerouting { type nat hook prerouting priority -100 ; }
add rule ip kube-proxy nat-prerouting jump services
add set ip kube-proxy firewall { type ipv4_addr . inet_proto . inet_service ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
add set ip kube-proxy firewall-allow { type ipv4_addr . inet_proto . inet_service . ipv4_addr ; flags interval ; comment "destinations+sources that are allowed by LoadBalancerSourceRanges" ; }
add map ip kube-proxy firewall-ips { type ipv4_addr . inet_proto . inet_service : verdict ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
add chain ip kube-proxy firewall-check
add chain ip kube-proxy firewall-allow-check
add rule ip kube-proxy firewall-allow-check ip daddr . meta l4proto . th dport . ip saddr @firewall-allow return
add rule ip kube-proxy firewall-allow-check drop
add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport @firewall jump firewall-allow-check
add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport vmap @firewall-ips
add chain ip kube-proxy reject-chain { comment "helper for @no-endpoint-services / @no-endpoint-nodeports" ; }
add rule ip kube-proxy reject-chain reject
@ -622,11 +618,13 @@ func TestOverallNFTablesRules(t *testing.T) {
add rule ip kube-proxy endpoint-GTK6MW7G-ns5/svc5/tcp/p80__10.180.0.3/80 update @affinity-GTK6MW7G-ns5/svc5/tcp/p80__10.180.0.3/80 { ip saddr }
add rule ip kube-proxy endpoint-GTK6MW7G-ns5/svc5/tcp/p80__10.180.0.3/80 meta l4proto tcp dnat to 10.180.0.3:80
add chain ip kube-proxy firewall-HVFWP5L3-ns5/svc5/tcp/p80
add rule ip kube-proxy firewall-HVFWP5L3-ns5/svc5/tcp/p80 ip saddr != { 203.0.113.0/25 } drop
add element ip kube-proxy service-ips { 172.30.0.45 . tcp . 80 : goto service-HVFWP5L3-ns5/svc5/tcp/p80 }
add element ip kube-proxy service-ips { 5.6.7.8 . tcp . 80 : goto external-HVFWP5L3-ns5/svc5/tcp/p80 }
add element ip kube-proxy service-nodeports { tcp . 3002 : goto external-HVFWP5L3-ns5/svc5/tcp/p80 }
add element ip kube-proxy firewall { 5.6.7.8 . tcp . 80 comment "ns5/svc5:p80" }
add element ip kube-proxy firewall-allow { 5.6.7.8 . tcp . 80 . 203.0.113.0/25 comment "ns5/svc5:p80" }
add element ip kube-proxy firewall-ips { 5.6.7.8 . tcp . 80 comment "ns5/svc5:p80" : goto firewall-HVFWP5L3-ns5/svc5/tcp/p80 }
# svc6
add element ip kube-proxy no-endpoint-services { 172.30.0.46 . tcp . 80 comment "ns6/svc6:p80" : goto reject-chain }
@ -4264,7 +4262,6 @@ func TestSyncProxyRulesRepeated(t *testing.T) {
add chain ip kube-proxy filter-forward { type filter hook forward priority -101 ; }
add chain ip kube-proxy filter-input { type filter hook input priority -101 ; }
add chain ip kube-proxy filter-output { type filter hook output priority -101 ; }
add chain ip kube-proxy firewall-allow-check
add chain ip kube-proxy firewall-check
add chain ip kube-proxy forward
add chain ip kube-proxy mark-for-masquerade
@ -4284,9 +4281,7 @@ func TestSyncProxyRulesRepeated(t *testing.T) {
add rule ip kube-proxy filter-input ct state new jump firewall-check
add rule ip kube-proxy filter-output ct state new jump endpoints-check
add rule ip kube-proxy filter-output ct state new jump firewall-check
add rule ip kube-proxy firewall-allow-check ip daddr . meta l4proto . th dport . ip saddr @firewall-allow return
add rule ip kube-proxy firewall-allow-check drop
add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport @firewall jump firewall-allow-check
add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport vmap @firewall-ips
add rule ip kube-proxy forward ct state invalid drop
add rule ip kube-proxy mark-for-masquerade mark set mark or 0x4000
add rule ip kube-proxy masquerading mark and 0x4000 == 0 return
@ -4299,8 +4294,7 @@ func TestSyncProxyRulesRepeated(t *testing.T) {
add rule ip kube-proxy services ip daddr . meta l4proto . th dport vmap @service-ips
add rule ip kube-proxy services fib daddr type local ip daddr != 127.0.0.0/8 meta l4proto . th dport vmap @service-nodeports
add set ip kube-proxy firewall { type ipv4_addr . inet_proto . inet_service ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
add set ip kube-proxy firewall-allow { type ipv4_addr . inet_proto . inet_service . ipv4_addr ; flags interval ; comment "destinations+sources that are allowed by LoadBalancerSourceRanges" ; }
add map ip kube-proxy firewall-ips { type ipv4_addr . inet_proto . inet_service : verdict ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
add map ip kube-proxy no-endpoint-nodeports { type inet_proto . inet_service : verdict ; comment "vmap to drop or reject packets to service nodeports with no endpoints" ; }
add map ip kube-proxy no-endpoint-services { type ipv4_addr . inet_proto . inet_service : verdict ; comment "vmap to drop or reject packets to services with no endpoints" ; }
add map ip kube-proxy service-ips { type ipv4_addr . inet_proto . inet_service : verdict ; comment "ClusterIP, ExternalIP and LoadBalancer IP traffic" ; }