kube-proxy: change implementation of LoadBalancerSourceRanges for wider kernel support
The nftables implementation used concatenation of ranges when creating the "firewall-allow" set, but that feature is not available before kernel 5.6, so nftables mode couldn't run on earlier kernels, even though 5.4 is still widely used.

An alternative to concatenation of ranges is to create a separate firewall chain for every service port that needs firewalling, and to jump to the service's firewall chain from the common firewall chain via a rule with a verdict map (vmap).

Renaming "firewall" to "firewall-ips" is required when changing the set to a map so that existing clusters can upgrade; otherwise creating the map would fail. Besides, "firewall-ips" corresponds to the "service-ips" map, and we can later add "firewall-nodeports" if it's determined that NodePort traffic should be subject to LoadBalancerSourceRanges.

Signed-off-by: Quan Tian <qtian@vmware.com>
This commit is contained in: parent 40c729c680, commit 377f521038
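For illustration, here is a minimal, self-contained Go sketch of the dispatch scheme the commit describes, using the knftables package (the same library the proxier uses). It is not the proxier's actual code: the table setup is reduced to the bare minimum, and the per-service chain name, VIP, and source range are made-up examples. The point is that the "firewall-ips" verdict map and the per-chain anonymous saddr set use only plain key types and whole CIDRs, so no concatenated-ranges support (kernel >= 5.6) is needed.

    package main

    import (
        "context"
        "fmt"

        "sigs.k8s.io/knftables"
    )

    func main() {
        // Bind to a table; "kube-proxy" matches the test fixtures below,
        // but any table name works for this demo.
        nft, err := knftables.New(knftables.IPv4Family, "kube-proxy")
        if err != nil {
            panic(err)
        }
        tx := nft.NewTransaction()
        tx.Add(&knftables.Table{})

        // Verdict map keyed on plain (non-interval) types: destination IP,
        // protocol, and port. No concatenation of ranges is involved.
        tx.Add(&knftables.Map{
            Name: "firewall-ips",
            Type: "ipv4_addr . inet_proto . inet_service : verdict",
        })

        // The shared check chain dispatches to per-service chains via the vmap.
        tx.Add(&knftables.Chain{Name: "firewall-check"})
        tx.Add(&knftables.Rule{
            Chain: "firewall-check",
            Rule: knftables.Concat(
                "ip", "daddr", ".", "meta l4proto", ".", "th dport",
                "vmap", "@", "firewall-ips",
            ),
        })

        // One chain per firewalled service port (hypothetical name). Its
        // anonymous set holds whole CIDRs, which pre-5.6 kernels handle fine.
        fwChain := "firewall-EXAMPLE-ns1/svc1/tcp/p80"
        tx.Add(&knftables.Chain{Name: fwChain})
        tx.Add(&knftables.Rule{
            Chain: fwChain,
            Rule:  "ip saddr != { 203.0.113.0/25 } drop",
        })

        // Map a (made-up) LoadBalancer VIP/port to the service's firewall chain.
        tx.Add(&knftables.Element{
            Map:   "firewall-ips",
            Key:   []string{"192.0.2.10", "tcp", "80"},
            Value: []string{fmt.Sprintf("goto %s", fwChain)},
        })

        if err := nft.Run(context.Background(), tx); err != nil {
            panic(err)
        }
    }

The resulting ruleset has the same shape as the updated test fixtures in the diff: one "vmap @firewall-ips" rule in firewall-check, plus one "ip saddr != { ... } drop" rule per firewalled service port. And as the commit message notes, creating a map under the old set's name "firewall" would fail on upgraded clusters, hence the rename to "firewall-ips".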
@@ -207,6 +207,17 @@ func (tracer *nftablesTracer) addressMatches(ipStr, not, ruleAddress string) boo
     }
 }
 
+func (tracer *nftablesTracer) noneAddressesMatch(ipStr, ruleAddress string) bool {
+    ruleAddress = strings.ReplaceAll(ruleAddress, " ", "")
+    addresses := strings.Split(ruleAddress, ",")
+    for _, address := range addresses {
+        if tracer.addressMatches(ipStr, "", address) {
+            return false
+        }
+    }
+    return true
+}
+
 // matchDestIPOnly checks an "ip daddr" against a set/map, and returns the matching
 // Element, if found.
 func (tracer *nftablesTracer) matchDestIPOnly(elements []*knftables.Element, destIP string) *knftables.Element {
@@ -264,6 +275,7 @@ func (tracer *nftablesTracer) matchDestPort(elements []*knftables.Element, proto
 // match verdictRegexp.
 
 var destAddrRegexp = regexp.MustCompile(`^ip6* daddr (!= )?(\S+)`)
+var destAddrLookupRegexp = regexp.MustCompile(`^ip6* daddr != \{([^}]*)\}`)
 var destAddrLocalRegexp = regexp.MustCompile(`^fib daddr type local`)
 var destPortRegexp = regexp.MustCompile(`^(tcp|udp|sctp) dport (\d+)`)
 var destIPOnlyLookupRegexp = regexp.MustCompile(`^ip6* daddr @(\S+)`)
@@ -275,6 +287,7 @@ var destDispatchRegexp = regexp.MustCompile(`^ip6* daddr \. meta l4proto \. th d
 var destPortDispatchRegexp = regexp.MustCompile(`^meta l4proto \. th dport vmap @(\S+)$`)
 
 var sourceAddrRegexp = regexp.MustCompile(`^ip6* saddr (!= )?(\S+)`)
+var sourceAddrLookupRegexp = regexp.MustCompile(`^ip6* saddr != \{([^}]*)\}`)
 var sourceAddrLocalRegexp = regexp.MustCompile(`^fib saddr type local`)
 
 var endpointVMAPRegexp = regexp.MustCompile(`^numgen random mod \d+ vmap \{(.*)\}$`)
@@ -397,6 +410,16 @@ func (tracer *nftablesTracer) runChain(chname, sourceIP, protocol, destIP, destP
                 rule = element.Value[0]
             }
 
+        case destAddrLookupRegexp.MatchString(rule):
+            // `^ip6* daddr != \{([^}]*)\}`
+            // Tests whether destIP doesn't match an anonymous set.
+            match := destAddrLookupRegexp.FindStringSubmatch(rule)
+            rule = strings.TrimPrefix(rule, match[0])
+            if !tracer.noneAddressesMatch(destIP, match[1]) {
+                rule = ""
+                break
+            }
+
         case destAddrRegexp.MatchString(rule):
             // `^ip6* daddr (!= )?(\S+)`
             // Tests whether destIP does/doesn't match a literal.
@@ -429,6 +452,16 @@ func (tracer *nftablesTracer) runChain(chname, sourceIP, protocol, destIP, destP
                 break
             }
 
+        case sourceAddrLookupRegexp.MatchString(rule):
+            // `^ip6* saddr != \{([^}]*)\}`
+            // Tests whether sourceIP doesn't match an anonymous set.
+            match := sourceAddrLookupRegexp.FindStringSubmatch(rule)
+            rule = strings.TrimPrefix(rule, match[0])
+            if !tracer.noneAddressesMatch(sourceIP, match[1]) {
+                rule = ""
+                break
+            }
+
         case sourceAddrRegexp.MatchString(rule):
             // `^ip6* saddr (!= )?(\S+)`
             // Tests whether sourceIP does/doesn't match a literal.
@@ -76,10 +76,8 @@ const (
     kubeRejectChain = "reject-chain"
 
     // LoadBalancerSourceRanges handling
-    kubeFirewallSet             = "firewall"
-    kubeFirewallCheckChain      = "firewall-check"
-    kubeFirewallAllowSet        = "firewall-allow"
-    kubeFirewallAllowCheckChain = "firewall-allow-check"
+    kubeFirewallIPsMap     = "firewall-ips"
+    kubeFirewallCheckChain = "firewall-check"
 
     // masquerading
     kubeMarkMasqChain = "mark-for-masquerade"
@@ -99,6 +97,7 @@ type servicePortInfo struct {
     clusterPolicyChainName string
     localPolicyChainName   string
     externalChainName      string
+    firewallChainName      string
 }
 
 // returns a new proxy.ServicePort which abstracts a serviceInfo
@@ -114,6 +113,7 @@ func newServiceInfo(port *v1.ServicePort, service *v1.Service, bsvcPortInfo *pro
     svcPort.clusterPolicyChainName = servicePortPolicyClusterChainNamePrefix + chainNameBase
     svcPort.localPolicyChainName = servicePortPolicyLocalChainNamePrefix + chainNameBase
     svcPort.externalChainName = serviceExternalChainNamePrefix + chainNameBase
+    svcPort.firewallChainName = servicePortFirewallChainNamePrefix + chainNameBase
 
     return svcPort
 }
@@ -543,38 +543,20 @@ func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) {
     }
 
     // Set up LoadBalancerSourceRanges firewalling
-    tx.Add(&knftables.Set{
-        Name:    kubeFirewallSet,
-        Type:    ipvX_addr + " . inet_proto . inet_service",
+    tx.Add(&knftables.Map{
+        Name:    kubeFirewallIPsMap,
+        Type:    ipvX_addr + " . inet_proto . inet_service : verdict",
         Comment: ptr.To("destinations that are subject to LoadBalancerSourceRanges"),
     })
-    tx.Add(&knftables.Set{
-        Name:    kubeFirewallAllowSet,
-        Type:    ipvX_addr + " . inet_proto . inet_service . " + ipvX_addr,
-        Flags:   []knftables.SetFlag{knftables.IntervalFlag},
-        Comment: ptr.To("destinations+sources that are allowed by LoadBalancerSourceRanges"),
-    })
 
     ensureChain(kubeFirewallCheckChain, tx, createdChains)
-    ensureChain(kubeFirewallAllowCheckChain, tx, createdChains)
     tx.Add(&knftables.Rule{
         Chain: kubeFirewallCheckChain,
         Rule: knftables.Concat(
-            ipX, "daddr", ".", "meta l4proto", ".", "th dport", "@", kubeFirewallSet,
-            "jump", kubeFirewallAllowCheckChain,
+            ipX, "daddr", ".", "meta l4proto", ".", "th dport",
+            "vmap", "@", kubeFirewallIPsMap,
         ),
     })
-    tx.Add(&knftables.Rule{
-        Chain: kubeFirewallAllowCheckChain,
-        Rule: knftables.Concat(
-            ipX, "daddr", ".", "meta l4proto", ".", "th dport", ".", ipX, "saddr", "@", kubeFirewallAllowSet,
-            "return",
-        ),
-    })
-    tx.Add(&knftables.Rule{
-        Chain: kubeFirewallAllowCheckChain,
-        Rule:  "drop",
-    })
 
     // Set up service dispatch
     tx.Add(&knftables.Map{
@@ -824,6 +806,7 @@ const (
     serviceExternalChainNamePrefix         = "external-"
     servicePortEndpointChainNamePrefix     = "endpoint-"
     servicePortEndpointAffinityNamePrefix  = "affinity-"
+    servicePortFirewallChainNamePrefix     = "firewall-"
 )
 
 // hashAndTruncate prefixes name with a hash of itself and then truncates to
@@ -998,11 +981,8 @@ func (proxier *Proxier) syncProxyRules() {
     }
 
     // We currently fully-rebuild our sets and maps on each resync
-    tx.Flush(&knftables.Set{
-        Name: kubeFirewallSet,
-    })
-    tx.Flush(&knftables.Set{
-        Name: kubeFirewallAllowSet,
+    tx.Flush(&knftables.Map{
+        Name: kubeFirewallIPsMap,
     })
     tx.Flush(&knftables.Map{
         Name: kubeNoEndpointServicesMap,
@@ -1205,6 +1185,44 @@
             }
         }
 
+        usesFWChain := len(svcInfo.LoadBalancerVIPStrings()) > 0 && len(svcInfo.LoadBalancerSourceRanges()) > 0
+        fwChain := svcInfo.firewallChainName
+        if usesFWChain {
+            ensureChain(fwChain, tx, activeChains)
+            var sources []string
+            allowFromNode := false
+            for _, src := range svcInfo.LoadBalancerSourceRanges() {
+                _, cidr, _ := netutils.ParseCIDRSloppy(src)
+                if cidr == nil {
+                    continue
+                }
+                if len(sources) > 0 {
+                    sources = append(sources, ",")
+                }
+                sources = append(sources, src)
+                if cidr.Contains(proxier.nodeIP) {
+                    allowFromNode = true
+                }
+            }
+            // For VIP-like LBs, the VIP is often added as a local
+            // address (via an IP route rule). In that case, a request
+            // from a node to the VIP will not hit the loadbalancer but
+            // will loop back with the source IP set to the VIP. We
+            // need the following rules to allow requests from this node.
+            if allowFromNode {
+                for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
+                    sources = append(sources, ",", lbip)
+                }
+            }
+            tx.Add(&knftables.Rule{
+                Chain: fwChain,
+                Rule: knftables.Concat(
+                    ipX, "saddr", "!=", "{", sources, "}",
+                    "drop",
+                ),
+            })
+        }
+
         // Capture load-balancer ingress.
         for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
             if hasEndpoints {
@@ -1221,53 +1239,19 @@
                 })
             }
 
-            if len(svcInfo.LoadBalancerSourceRanges()) > 0 {
+            if usesFWChain {
                 tx.Add(&knftables.Element{
-                    Set: kubeFirewallSet,
+                    Map: kubeFirewallIPsMap,
                     Key: []string{
                         lbip,
                         protocol,
                         strconv.Itoa(svcInfo.Port()),
                     },
+                    Value: []string{
+                        fmt.Sprintf("goto %s", fwChain),
+                    },
                     Comment: &svcPortNameString,
                 })
-
-                allowFromNode := false
-                for _, src := range svcInfo.LoadBalancerSourceRanges() {
-                    _, cidr, _ := netutils.ParseCIDRSloppy(src)
-                    if cidr == nil {
-                        continue
-                    }
-                    tx.Add(&knftables.Element{
-                        Set: kubeFirewallAllowSet,
-                        Key: []string{
-                            lbip,
-                            protocol,
-                            strconv.Itoa(svcInfo.Port()),
-                            src,
-                        },
-                        Comment: &svcPortNameString,
-                    })
-                    if cidr.Contains(proxier.nodeIP) {
-                        allowFromNode = true
-                    }
-                }
-                // For VIP-like LBs, the VIP is often added as a local
-                // address (via an IP route rule). In that case, a request
-                // from a node to the VIP will not hit the loadbalancer but
-                // will loop back with the source IP set to the VIP. We
-                // need the following rules to allow requests from this node.
-                if allowFromNode {
-                    tx.Add(&knftables.Element{
-                        Set: kubeFirewallAllowSet,
-                        Key: []string{
-                            lbip,
-                            protocol,
-                            strconv.Itoa(svcInfo.Port()),
-                            lbip,
-                        },
-                    })
-                }
             }
         }
         if !hasExternalEndpoints {
@@ -521,13 +521,9 @@ func TestOverallNFTablesRules(t *testing.T) {
         add chain ip kube-proxy nat-prerouting { type nat hook prerouting priority -100 ; }
         add rule ip kube-proxy nat-prerouting jump services
 
-        add set ip kube-proxy firewall { type ipv4_addr . inet_proto . inet_service ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
-        add set ip kube-proxy firewall-allow { type ipv4_addr . inet_proto . inet_service . ipv4_addr ; flags interval ; comment "destinations+sources that are allowed by LoadBalancerSourceRanges" ; }
+        add map ip kube-proxy firewall-ips { type ipv4_addr . inet_proto . inet_service : verdict ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
         add chain ip kube-proxy firewall-check
-        add chain ip kube-proxy firewall-allow-check
-        add rule ip kube-proxy firewall-allow-check ip daddr . meta l4proto . th dport . ip saddr @firewall-allow return
-        add rule ip kube-proxy firewall-allow-check drop
-        add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport @firewall jump firewall-allow-check
+        add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport vmap @firewall-ips
 
         add chain ip kube-proxy reject-chain { comment "helper for @no-endpoint-services / @no-endpoint-nodeports" ; }
         add rule ip kube-proxy reject-chain reject
@@ -622,11 +618,13 @@
         add rule ip kube-proxy endpoint-GTK6MW7G-ns5/svc5/tcp/p80__10.180.0.3/80 update @affinity-GTK6MW7G-ns5/svc5/tcp/p80__10.180.0.3/80 { ip saddr }
         add rule ip kube-proxy endpoint-GTK6MW7G-ns5/svc5/tcp/p80__10.180.0.3/80 meta l4proto tcp dnat to 10.180.0.3:80
 
+        add chain ip kube-proxy firewall-HVFWP5L3-ns5/svc5/tcp/p80
+        add rule ip kube-proxy firewall-HVFWP5L3-ns5/svc5/tcp/p80 ip saddr != { 203.0.113.0/25 } drop
+
         add element ip kube-proxy service-ips { 172.30.0.45 . tcp . 80 : goto service-HVFWP5L3-ns5/svc5/tcp/p80 }
         add element ip kube-proxy service-ips { 5.6.7.8 . tcp . 80 : goto external-HVFWP5L3-ns5/svc5/tcp/p80 }
         add element ip kube-proxy service-nodeports { tcp . 3002 : goto external-HVFWP5L3-ns5/svc5/tcp/p80 }
-        add element ip kube-proxy firewall { 5.6.7.8 . tcp . 80 comment "ns5/svc5:p80" }
-        add element ip kube-proxy firewall-allow { 5.6.7.8 . tcp . 80 . 203.0.113.0/25 comment "ns5/svc5:p80" }
+        add element ip kube-proxy firewall-ips { 5.6.7.8 . tcp . 80 comment "ns5/svc5:p80" : goto firewall-HVFWP5L3-ns5/svc5/tcp/p80 }
 
         # svc6
         add element ip kube-proxy no-endpoint-services { 172.30.0.46 . tcp . 80 comment "ns6/svc6:p80" : goto reject-chain }
@@ -4264,7 +4262,6 @@ func TestSyncProxyRulesRepeated(t *testing.T) {
         add chain ip kube-proxy filter-forward { type filter hook forward priority -101 ; }
         add chain ip kube-proxy filter-input { type filter hook input priority -101 ; }
         add chain ip kube-proxy filter-output { type filter hook output priority -101 ; }
-        add chain ip kube-proxy firewall-allow-check
         add chain ip kube-proxy firewall-check
         add chain ip kube-proxy forward
         add chain ip kube-proxy mark-for-masquerade
@@ -4284,9 +4281,7 @@
         add rule ip kube-proxy filter-input ct state new jump firewall-check
         add rule ip kube-proxy filter-output ct state new jump endpoints-check
         add rule ip kube-proxy filter-output ct state new jump firewall-check
-        add rule ip kube-proxy firewall-allow-check ip daddr . meta l4proto . th dport . ip saddr @firewall-allow return
-        add rule ip kube-proxy firewall-allow-check drop
-        add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport @firewall jump firewall-allow-check
+        add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport vmap @firewall-ips
         add rule ip kube-proxy forward ct state invalid drop
         add rule ip kube-proxy mark-for-masquerade mark set mark or 0x4000
         add rule ip kube-proxy masquerading mark and 0x4000 == 0 return
@@ -4299,8 +4294,7 @@
         add rule ip kube-proxy services ip daddr . meta l4proto . th dport vmap @service-ips
         add rule ip kube-proxy services fib daddr type local ip daddr != 127.0.0.0/8 meta l4proto . th dport vmap @service-nodeports
 
-        add set ip kube-proxy firewall { type ipv4_addr . inet_proto . inet_service ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
-        add set ip kube-proxy firewall-allow { type ipv4_addr . inet_proto . inet_service . ipv4_addr ; flags interval ; comment "destinations+sources that are allowed by LoadBalancerSourceRanges" ; }
+        add map ip kube-proxy firewall-ips { type ipv4_addr . inet_proto . inet_service : verdict ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
         add map ip kube-proxy no-endpoint-nodeports { type inet_proto . inet_service : verdict ; comment "vmap to drop or reject packets to service nodeports with no endpoints" ; }
         add map ip kube-proxy no-endpoint-services { type ipv4_addr . inet_proto . inet_service : verdict ; comment "vmap to drop or reject packets to services with no endpoints" ; }
         add map ip kube-proxy service-ips { type ipv4_addr . inet_proto . inet_service : verdict ; comment "ClusterIP, ExternalIP and LoadBalancer IP traffic" ; }