diff --git a/pkg/proxy/nftables/helpers_test.go b/pkg/proxy/nftables/helpers_test.go index faa3443942e..a85b873ccd1 100644 --- a/pkg/proxy/nftables/helpers_test.go +++ b/pkg/proxy/nftables/helpers_test.go @@ -62,8 +62,6 @@ var objectOrder = map[string]int{ // with per-service rules, we don't know what order syncProxyRules is going to output them // in, but the order doesn't matter anyway. So we sort the rules in those chains. var sortedChains = sets.New( - kubeServicesFilterChain, - kubeExternalServicesChain, kubeServicesChain, kubeNodePortsChain, ) @@ -256,6 +254,17 @@ func (tracer *nftablesTracer) matchDestAndSource(elements []*knftables.Element, return nil } +// matchDestPort checks an "meta l4proto . th dport" against a set/map, and returns the +// matching Element, if found. +func (tracer *nftablesTracer) matchDestPort(elements []*knftables.Element, protocol, destPort string) *knftables.Element { + for _, element := range elements { + if element.Key[0] == protocol && element.Key[1] == destPort { + return element + } + } + return nil +} + // We intentionally don't try to parse arbitrary nftables rules, as the syntax is quite // complicated and context sensitive. (E.g., "ip daddr" could be the start of an address // comparison, or it could be the start of a set/map lookup.) Instead, we just have @@ -273,6 +282,10 @@ var destPortRegexp = regexp.MustCompile(`^(tcp|udp|sctp) dport (\d+)`) var destIPOnlyLookupRegexp = regexp.MustCompile(`^ip6* daddr @(\S+)`) var destLookupRegexp = regexp.MustCompile(`^ip6* daddr \. meta l4proto \. th dport @(\S+)`) var destSourceLookupRegexp = regexp.MustCompile(`^ip6* daddr \. meta l4proto \. th dport \. ip6* saddr @(\S+)`) +var destPortLookupRegexp = regexp.MustCompile(`^meta l4proto \. th dport @(\S+)`) + +var destDispatchRegexp = regexp.MustCompile(`^ip6* daddr \. meta l4proto \. th dport vmap @(\S+)$`) +var destPortDispatchRegexp = regexp.MustCompile(`^meta l4proto \. th dport vmap @(\S+)$`) var sourceAddrRegexp = regexp.MustCompile(`^ip6* saddr (!= )?(\S+)`) var sourceAddrLocalRegexp = regexp.MustCompile(`^fib saddr type local`) @@ -357,6 +370,46 @@ func (tracer *nftablesTracer) runChain(chname, sourceIP, protocol, destIP, destP break } + case destPortLookupRegexp.MatchString(rule): + // `^meta l4proto . th dport @(\S+)` + // Tests whether "protocol . destPort" is a member of the + // indicated set. + match := destPortLookupRegexp.FindStringSubmatch(rule) + rule = strings.TrimPrefix(rule, match[0]) + set := match[1] + if tracer.matchDestPort(tracer.nft.Table.Sets[set].Elements, protocol, destPort) == nil { + rule = "" + break + } + + case destDispatchRegexp.MatchString(rule): + // `^ip6* daddr \. meta l4proto \. th dport vmap @(\S+)$` + // Looks up "destIP . protocol . destPort" in the indicated + // verdict map, and if found, runs the assocated verdict. + match := destDispatchRegexp.FindStringSubmatch(rule) + mapName := match[1] + element := tracer.matchDest(tracer.nft.Table.Maps[mapName].Elements, destIP, protocol, destPort) + if element == nil { + rule = "" + break + } else { + rule = element.Value[0] + } + + case destPortDispatchRegexp.MatchString(rule): + // `^meta l4proto \. th dport vmap @(\S+)$` + // Looks up "protocol . destPort" in the indicated verdict map, + // and if found, runs the assocated verdict. + match := destPortDispatchRegexp.FindStringSubmatch(rule) + mapName := match[1] + element := tracer.matchDestPort(tracer.nft.Table.Maps[mapName].Elements, protocol, destPort) + if element == nil { + rule = "" + break + } else { + rule = element.Value[0] + } + case destAddrRegexp.MatchString(rule): // `^ip6* daddr (!= )?(\S+)` // Tests whether destIP does/doesn't match a literal. diff --git a/pkg/proxy/nftables/proxier.go b/pkg/proxy/nftables/proxier.go index de5ca9b6e18..cc1a4449854 100644 --- a/pkg/proxy/nftables/proxier.go +++ b/pkg/proxy/nftables/proxier.go @@ -69,8 +69,10 @@ const ( kubeNodePortIPsSet = "nodeport-ips" // handling for services with no endpoints - kubeServicesFilterChain = "services-filter" - kubeExternalServicesChain = "external-services" + kubeEndpointsCheckChain = "endpoints-check" + kubeNoEndpointServicesMap = "no-endpoint-services" + kubeNoEndpointNodePortsMap = "no-endpoint-nodeports" + kubeRejectChain = "reject-chain" // LoadBalancerSourceRanges handling kubeFirewallSet = "firewall" @@ -340,10 +342,10 @@ type nftablesJumpChain struct { } var nftablesJumpChains = []nftablesJumpChain{ - {kubeExternalServicesChain, "filter-input", "ct state new"}, - {kubeExternalServicesChain, "filter-forward", "ct state new"}, - {kubeServicesFilterChain, "filter-forward", "ct state new"}, - {kubeServicesFilterChain, "filter-output", "ct state new"}, + {kubeEndpointsCheckChain, "filter-input", "ct state new"}, + {kubeEndpointsCheckChain, "filter-forward", "ct state new"}, + {kubeEndpointsCheckChain, "filter-output", "ct state new"}, + {kubeForwardChain, "filter-forward", ""}, {kubeFirewallCheckChain, "filter-input", "ct state new"}, @@ -374,9 +376,11 @@ func ensureChain(chain string, tx *knftables.Transaction, createdChains sets.Set func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) { ipX := "ip" ipvX_addr := "ipv4_addr" //nolint:stylecheck // var name intentionally resembles value + noLocalhost := "ip daddr != 127.0.0.0/8" if proxier.ipFamily == v1.IPv6Protocol { ipX = "ip6" ipvX_addr = "ipv6_addr" + noLocalhost = "ip6 daddr != ::1" } tx.Add(&knftables.Table{ @@ -409,7 +413,7 @@ func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) { } // Ensure all of our other "top-level" chains exist - for _, chain := range []string{kubeServicesFilterChain, kubeServicesChain, kubeExternalServicesChain, kubeForwardChain, kubeNodePortsChain, kubeMasqueradingChain, kubeMarkMasqChain} { + for _, chain := range []string{kubeServicesChain, kubeForwardChain, kubeNodePortsChain, kubeMasqueradingChain, kubeMarkMasqChain} { ensureChain(chain, tx, createdChains) } @@ -484,6 +488,59 @@ func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) { } } + // Set up "no endpoints" drop/reject handling + tx.Add(&knftables.Map{ + Name: kubeNoEndpointServicesMap, + Type: ipvX_addr + " . inet_proto . inet_service : verdict", + Comment: ptr.To("vmap to drop or reject packets to services with no endpoints"), + }) + tx.Add(&knftables.Map{ + Name: kubeNoEndpointNodePortsMap, + Type: "inet_proto . inet_service : verdict", + Comment: ptr.To("vmap to drop or reject packets to service nodeports with no endpoints"), + }) + + tx.Add(&knftables.Chain{ + Name: kubeRejectChain, + Comment: ptr.To("helper for @no-endpoint-services / @no-endpoint-nodeports"), + }) + tx.Flush(&knftables.Chain{ + Name: kubeRejectChain, + }) + tx.Add(&knftables.Rule{ + Chain: kubeRejectChain, + Rule: "reject", + }) + + tx.Add(&knftables.Rule{ + Chain: kubeEndpointsCheckChain, + Rule: knftables.Concat( + ipX, "daddr", ".", "meta l4proto", ".", "th dport", + "vmap", "@", kubeNoEndpointServicesMap, + ), + }) + + if proxier.nodePortAddresses.MatchAll() { + tx.Add(&knftables.Rule{ + Chain: kubeEndpointsCheckChain, + Rule: knftables.Concat( + "fib daddr type local", + noLocalhost, + "meta l4proto . th dport", + "vmap", "@", kubeNoEndpointNodePortsMap, + ), + }) + } else { + tx.Add(&knftables.Rule{ + Chain: kubeEndpointsCheckChain, + Rule: knftables.Concat( + ipX, "daddr", "@", kubeNodePortIPsSet, + "meta l4proto . th dport", + "vmap", "@", kubeNoEndpointNodePortsMap, + ), + }) + } + // Set up LoadBalancerSourceRanges firewalling tx.Add(&knftables.Set{ Name: kubeFirewallSet, @@ -907,6 +964,12 @@ func (proxier *Proxier) syncProxyRules() { tx.Flush(&knftables.Set{ Name: kubeFirewallAllowSet, }) + tx.Flush(&knftables.Map{ + Name: kubeNoEndpointServicesMap, + }) + tx.Flush(&knftables.Map{ + Name: kubeNoEndpointNodePortsMap, + }) // Accumulate service/endpoint chains and affinity sets to keep. activeChains := sets.New[string]() @@ -1004,25 +1067,21 @@ func (proxier *Proxier) syncProxyRules() { ensureChain(externalTrafficChain, tx, activeChains) } - var internalTrafficFilterVerdict, internalTrafficFilterComment string - var externalTrafficFilterVerdict, externalTrafficFilterComment string + var internalTrafficFilterVerdict, externalTrafficFilterVerdict string if !hasEndpoints { // The service has no endpoints at all; hasInternalEndpoints and // hasExternalEndpoints will also be false, and we will not // generate any chains in the "nat" table for the service; only // rules in the "filter" table rejecting incoming packets for // the service's IPs. - internalTrafficFilterVerdict = "reject" - internalTrafficFilterComment = fmt.Sprintf("%s has no endpoints", svcPortNameString) - externalTrafficFilterVerdict = "reject" - externalTrafficFilterComment = internalTrafficFilterComment + internalTrafficFilterVerdict = fmt.Sprintf("goto %s", kubeRejectChain) + externalTrafficFilterVerdict = fmt.Sprintf("goto %s", kubeRejectChain) } else { if !hasInternalEndpoints { // The internalTrafficPolicy is "Local" but there are no local // endpoints. Traffic to the clusterIP will be dropped, but // external traffic may still be accepted. internalTrafficFilterVerdict = "drop" - internalTrafficFilterComment = fmt.Sprintf("%s has no local endpoints", svcPortNameString) serviceNoLocalEndpointsTotalInternal++ } if !hasExternalEndpoints { @@ -1031,7 +1090,6 @@ func (proxier *Proxier) syncProxyRules() { // the cluster will be dropped, but traffic from inside // the cluster may still be accepted. externalTrafficFilterVerdict = "drop" - externalTrafficFilterComment = fmt.Sprintf("%s has no local endpoints", svcPortNameString) serviceNoLocalEndpointsTotalExternal++ } } @@ -1048,14 +1106,17 @@ func (proxier *Proxier) syncProxyRules() { }) } else { // No endpoints. - tx.Add(&knftables.Rule{ - Chain: kubeServicesFilterChain, - Rule: knftables.Concat( - ipX, "daddr", svcInfo.ClusterIP(), - protocol, "dport", svcInfo.Port(), + tx.Add(&knftables.Element{ + Map: kubeNoEndpointServicesMap, + Key: []string{ + svcInfo.ClusterIP().String(), + protocol, + strconv.Itoa(svcInfo.Port()), + }, + Value: []string{ internalTrafficFilterVerdict, - ), - Comment: &internalTrafficFilterComment, + }, + Comment: &svcPortNameString, }) } @@ -1077,14 +1138,17 @@ func (proxier *Proxier) syncProxyRules() { // Either no endpoints at all (REJECT) or no endpoints for // external traffic (DROP anything that didn't get // short-circuited by the EXT chain.) - tx.Add(&knftables.Rule{ - Chain: kubeExternalServicesChain, - Rule: knftables.Concat( - ipX, "daddr", externalIP, - protocol, "dport", svcInfo.Port(), + tx.Add(&knftables.Element{ + Map: kubeNoEndpointServicesMap, + Key: []string{ + externalIP, + protocol, + strconv.Itoa(svcInfo.Port()), + }, + Value: []string{ externalTrafficFilterVerdict, - ), - Comment: &externalTrafficFilterComment, + }, + Comment: &svcPortNameString, }) } } @@ -1156,14 +1220,17 @@ func (proxier *Proxier) syncProxyRules() { // external traffic (DROP anything that didn't get short-circuited // by the EXT chain.) for _, lbip := range svcInfo.LoadBalancerVIPStrings() { - tx.Add(&knftables.Rule{ - Chain: kubeExternalServicesChain, - Rule: knftables.Concat( - ipX, "daddr", lbip, - protocol, "dport", svcInfo.Port(), + tx.Add(&knftables.Element{ + Map: kubeNoEndpointServicesMap, + Key: []string{ + lbip, + protocol, + strconv.Itoa(svcInfo.Port()), + }, + Value: []string{ externalTrafficFilterVerdict, - ), - Comment: &externalTrafficFilterComment, + }, + Comment: &svcPortNameString, }) } } @@ -1186,14 +1253,16 @@ func (proxier *Proxier) syncProxyRules() { // Either no endpoints at all (REJECT) or no endpoints for // external traffic (DROP anything that didn't get // short-circuited by the EXT chain.) - tx.Add(&knftables.Rule{ - Chain: kubeExternalServicesChain, - Rule: knftables.Concat( - "fib daddr type local", - protocol, "dport", svcInfo.NodePort(), + tx.Add(&knftables.Element{ + Map: kubeNoEndpointNodePortsMap, + Key: []string{ + protocol, + strconv.Itoa(svcInfo.NodePort()), + }, + Value: []string{ externalTrafficFilterVerdict, - ), - Comment: &externalTrafficFilterComment, + }, + Comment: &svcPortNameString, }) } } diff --git a/pkg/proxy/nftables/proxier_test.go b/pkg/proxy/nftables/proxier_test.go index a66c46d44ce..7e81c4771ac 100644 --- a/pkg/proxy/nftables/proxier_test.go +++ b/pkg/proxy/nftables/proxier_test.go @@ -495,7 +495,6 @@ func TestOverallNFTablesRules(t *testing.T) { expected := dedent.Dedent(` add table ip kube-proxy { comment "rules for kube-proxy" ; } - add chain ip kube-proxy external-services add chain ip kube-proxy forward add rule ip kube-proxy forward ct state invalid drop add chain ip kube-proxy mark-for-masquerade @@ -506,17 +505,16 @@ func TestOverallNFTablesRules(t *testing.T) { add rule ip kube-proxy masquerading mark set mark xor 0x4000 add rule ip kube-proxy masquerading masquerade fully-random add chain ip kube-proxy services - add chain ip kube-proxy services-filter - add chain ip kube-proxy firewall-check - add chain ip kube-proxy firewall-allow-check add chain ip kube-proxy filter-forward { type filter hook forward priority -101 ; } - add rule ip kube-proxy filter-forward ct state new jump external-services - add rule ip kube-proxy filter-forward ct state new jump services-filter + add rule ip kube-proxy filter-forward ct state new jump endpoints-check add rule ip kube-proxy filter-forward jump forward + add rule ip kube-proxy filter-forward ct state new jump firewall-check add chain ip kube-proxy filter-input { type filter hook input priority -101 ; } - add rule ip kube-proxy filter-input ct state new jump external-services + add rule ip kube-proxy filter-input ct state new jump endpoints-check + add rule ip kube-proxy filter-input ct state new jump firewall-check add chain ip kube-proxy filter-output { type filter hook output priority -101 ; } - add rule ip kube-proxy filter-output ct state new jump services-filter + add rule ip kube-proxy filter-output ct state new jump endpoints-check + add rule ip kube-proxy filter-output ct state new jump firewall-check add chain ip kube-proxy nat-output { type nat hook output priority -100 ; } add rule ip kube-proxy nat-output jump services add chain ip kube-proxy nat-postrouting { type nat hook postrouting priority 100 ; } @@ -526,12 +524,21 @@ func TestOverallNFTablesRules(t *testing.T) { add set ip kube-proxy firewall { type ipv4_addr . inet_proto . inet_service ; comment "destinations that are subject to LoadBalancerSourceRanges" ; } add set ip kube-proxy firewall-allow { type ipv4_addr . inet_proto . inet_service . ipv4_addr ; flags interval ; comment "destinations+sources that are allowed by LoadBalancerSourceRanges" ; } + add chain ip kube-proxy firewall-check + add chain ip kube-proxy firewall-allow-check add rule ip kube-proxy firewall-allow-check ip daddr . meta l4proto . th dport . ip saddr @firewall-allow return add rule ip kube-proxy firewall-allow-check drop add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport @firewall jump firewall-allow-check - add rule ip kube-proxy filter-forward ct state new jump firewall-check - add rule ip kube-proxy filter-input ct state new jump firewall-check - add rule ip kube-proxy filter-output ct state new jump firewall-check + + add chain ip kube-proxy reject-chain { comment "helper for @no-endpoint-services / @no-endpoint-nodeports" ; } + add rule ip kube-proxy reject-chain reject + + add map ip kube-proxy no-endpoint-services { type ipv4_addr . inet_proto . inet_service : verdict ; comment "vmap to drop or reject packets to services with no endpoints" ; } + add map ip kube-proxy no-endpoint-nodeports { type inet_proto . inet_service : verdict ; comment "vmap to drop or reject packets to service nodeports with no endpoints" ; } + + add chain ip kube-proxy endpoints-check + add rule ip kube-proxy endpoints-check ip daddr . meta l4proto . th dport vmap @no-endpoint-services + add rule ip kube-proxy endpoints-check fib daddr type local ip daddr != 127.0.0.0/8 meta l4proto . th dport vmap @no-endpoint-nodeports # svc1 add chain ip kube-proxy service-ULMVA6XW-ns1/svc1/tcp/p80 @@ -558,11 +565,12 @@ func TestOverallNFTablesRules(t *testing.T) { add rule ip kube-proxy services ip daddr 172.30.0.42 tcp dport 80 goto service-42NFTM6N-ns2/svc2/tcp/p80 add rule ip kube-proxy services ip daddr 192.168.99.22 tcp dport 80 goto external-42NFTM6N-ns2/svc2/tcp/p80 add rule ip kube-proxy services ip daddr 1.2.3.4 tcp dport 80 goto external-42NFTM6N-ns2/svc2/tcp/p80 - add rule ip kube-proxy external-services ip daddr 192.168.99.22 tcp dport 80 drop comment "ns2/svc2:p80 has no local endpoints" - add rule ip kube-proxy external-services ip daddr 1.2.3.4 tcp dport 80 drop comment "ns2/svc2:p80 has no local endpoints" - add rule ip kube-proxy external-services fib daddr type local tcp dport 3001 drop comment "ns2/svc2:p80 has no local endpoints" add rule ip kube-proxy nodeports tcp dport 3001 goto external-42NFTM6N-ns2/svc2/tcp/p80 + add element ip kube-proxy no-endpoint-nodeports { tcp . 3001 comment "ns2/svc2:p80" : drop } + add element ip kube-proxy no-endpoint-services { 1.2.3.4 . tcp . 80 comment "ns2/svc2:p80" : drop } + add element ip kube-proxy no-endpoint-services { 192.168.99.22 . tcp . 80 comment "ns2/svc2:p80" : drop } + # svc3 add chain ip kube-proxy service-4AT6LBPK-ns3/svc3/tcp/p80 add rule ip kube-proxy service-4AT6LBPK-ns3/svc3/tcp/p80 ip daddr 172.30.0.43 tcp dport 80 ip saddr != 10.0.0.0/8 jump mark-for-masquerade @@ -613,7 +621,7 @@ func TestOverallNFTablesRules(t *testing.T) { add element ip kube-proxy firewall-allow { 5.6.7.8 . tcp . 80 . 203.0.113.0/25 comment "ns5/svc5:p80" } # svc6 - add rule ip kube-proxy services-filter ip daddr 172.30.0.46 tcp dport 80 reject comment "ns6/svc6:p80 has no endpoints" + add element ip kube-proxy no-endpoint-services { 172.30.0.46 . tcp . 80 comment "ns6/svc6:p80" : goto reject-chain } add rule ip kube-proxy services fib daddr type local ip daddr != 127.0.0.0/8 jump nodeports comment "kubernetes service nodeports; NOTE: this must be the last rule in this chain" `) @@ -4246,7 +4254,7 @@ func TestSyncProxyRulesRepeated(t *testing.T) { baseRules := dedent.Dedent(` add table ip kube-proxy { comment "rules for kube-proxy" ; } - add chain ip kube-proxy external-services + add chain ip kube-proxy endpoints-check add chain ip kube-proxy filter-forward { type filter hook forward priority -101 ; } add chain ip kube-proxy filter-input { type filter hook input priority -101 ; } add chain ip kube-proxy filter-output { type filter hook output priority -101 ; } @@ -4259,16 +4267,17 @@ func TestSyncProxyRulesRepeated(t *testing.T) { add chain ip kube-proxy nat-postrouting { type nat hook postrouting priority 100 ; } add chain ip kube-proxy nat-prerouting { type nat hook prerouting priority -100 ; } add chain ip kube-proxy nodeports + add chain ip kube-proxy reject-chain { comment "helper for @no-endpoint-services / @no-endpoint-nodeports" ; } add chain ip kube-proxy services - add chain ip kube-proxy services-filter - add rule ip kube-proxy filter-forward ct state new jump external-services - add rule ip kube-proxy filter-forward ct state new jump services-filter + add rule ip kube-proxy endpoints-check ip daddr . meta l4proto . th dport vmap @no-endpoint-services + add rule ip kube-proxy endpoints-check fib daddr type local ip daddr != 127.0.0.0/8 meta l4proto . th dport vmap @no-endpoint-nodeports + add rule ip kube-proxy filter-forward ct state new jump endpoints-check add rule ip kube-proxy filter-forward jump forward add rule ip kube-proxy filter-forward ct state new jump firewall-check - add rule ip kube-proxy filter-input ct state new jump external-services + add rule ip kube-proxy filter-input ct state new jump endpoints-check add rule ip kube-proxy filter-input ct state new jump firewall-check - add rule ip kube-proxy filter-output ct state new jump services-filter + add rule ip kube-proxy filter-output ct state new jump endpoints-check add rule ip kube-proxy filter-output ct state new jump firewall-check add rule ip kube-proxy firewall-allow-check ip daddr . meta l4proto . th dport . ip saddr @firewall-allow return add rule ip kube-proxy firewall-allow-check drop @@ -4281,9 +4290,12 @@ func TestSyncProxyRulesRepeated(t *testing.T) { add rule ip kube-proxy nat-output jump services add rule ip kube-proxy nat-postrouting jump masquerading add rule ip kube-proxy nat-prerouting jump services + add rule ip kube-proxy reject-chain reject add set ip kube-proxy firewall { type ipv4_addr . inet_proto . inet_service ; comment "destinations that are subject to LoadBalancerSourceRanges" ; } add set ip kube-proxy firewall-allow { type ipv4_addr . inet_proto . inet_service . ipv4_addr ; flags interval ; comment "destinations+sources that are allowed by LoadBalancerSourceRanges" ; } + add map ip kube-proxy no-endpoint-nodeports { type inet_proto . inet_service : verdict ; comment "vmap to drop or reject packets to service nodeports with no endpoints" ; } + add map ip kube-proxy no-endpoint-services { type ipv4_addr . inet_proto . inet_service : verdict ; comment "vmap to drop or reject packets to services with no endpoints" ; } `) // Helper function to make it look like time has passed (from the point of view of @@ -4508,7 +4520,7 @@ func TestSyncProxyRulesRepeated(t *testing.T) { add rule ip kube-proxy endpoint-2OCDJSZQ-ns3/svc3/tcp/p80__10.0.3.1/80 ip saddr 10.0.3.1 jump mark-for-masquerade add rule ip kube-proxy endpoint-2OCDJSZQ-ns3/svc3/tcp/p80__10.0.3.1/80 meta l4proto tcp dnat to 10.0.3.1:80 - add rule ip kube-proxy services-filter ip daddr 172.30.0.44 tcp dport 80 reject comment "ns4/svc4:p80 has no endpoints" + add element ip kube-proxy no-endpoint-services { 172.30.0.44 . tcp . 80 comment "ns4/svc4:p80" : goto reject-chain } `) assertNFTablesTransactionEqual(t, getLine(), expected, nft.Dump())