Merge pull request #56164 from danwinship/proxier-chain-split

Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

Split KUBE-SERVICES chain to re-shrink the INPUT chain

**What this PR does / why we need it**:
#43972 added an iptables rule "`-A INPUT -j KUBE-SERVICES`" to make NodePort ICMP rejection work. (Previously the KUBE-SERVICES chain was only run from OUTPUT, not INPUT.) #44547 extended that patch for ExternalIP rejection as well.

However, the KUBE-SERVICES chain may contain a very large number of ICMP reject rules for plain ClusterIP services (the ones that get run from OUTPUT), and, for reasons that are not yet clear, the kernel appears to be much more sensitive to the length of the INPUT chain than to the length of the OUTPUT chain. So a node that worked fine with kube 1.6 (when KUBE-SERVICES was only run from OUTPUT) might fall over with kube 1.7 (with KUBE-SERVICES being run from both INPUT and OUTPUT).

(Specifically, a node with about 5000 ClusterIP reject rules that ran fine with OpenShift 3.6 [kube 1.6] slowed almost to a complete halt with OpenShift 3.7 [kube 1.7].)

This PR fixes things by splitting out the "new" part of KUBE-SERVICES (NodePort and ExternalIP reject rules) into a separate KUBE-EXTERNAL-SERVICES chain run from INPUT, and by moving KUBE-SERVICES back to being run only from OUTPUT. (So, yes, this assumes that you don't have 5000 NodePort/ExternalIP services, but, if you do, there's not much we can do, since those rules *have* to be run on the INPUT side.)

Also, I left in the code that cleans up the "`-A INPUT -j KUBE-SERVICES`" rule even though we no longer generate that rule, so that a stale rule left behind by an older version gets removed on upgrade.

**Release note**:
```release-note
Reorganized iptables rules to fix a performance regression on clusters with thousands of services.
```

@kubernetes/sig-network-bugs @kubernetes/rh-networking
This commit is contained in:
Kubernetes Submit Queue 2018-02-22 18:52:53 -08:00 committed by GitHub
commit f0ca996274
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 16 additions and 8 deletions

View File

@ -65,6 +65,9 @@ const (
// the services chain
kubeServicesChain utiliptables.Chain = "KUBE-SERVICES"
// the external services chain
kubeExternalServicesChain utiliptables.Chain = "KUBE-EXTERNAL-SERVICES"
// the nodeports chain
kubeNodePortsChain utiliptables.Chain = "KUBE-NODEPORTS"
@ -433,7 +436,7 @@ type iptablesJumpChain struct {
}
var iptablesJumpChains = []iptablesJumpChain{
{utiliptables.TableFilter, kubeServicesChain, utiliptables.ChainInput, "kubernetes service portals"},
{utiliptables.TableFilter, kubeExternalServicesChain, utiliptables.ChainInput, "kubernetes externally-visible service portals"},
{utiliptables.TableFilter, kubeServicesChain, utiliptables.ChainOutput, "kubernetes service portals"},
{utiliptables.TableNAT, kubeServicesChain, utiliptables.ChainOutput, "kubernetes service portals"},
{utiliptables.TableNAT, kubeServicesChain, utiliptables.ChainPrerouting, "kubernetes service portals"},
@ -441,11 +444,16 @@ var iptablesJumpChains = []iptablesJumpChain{
{utiliptables.TableFilter, kubeForwardChain, utiliptables.ChainForward, "kubernetes forwarding rules"},
}
// iptablesCleanupOnlyChains lists jump rules that are no longer part of
// iptablesJumpChains but that CleanupLeftovers still unlinks, so that rules
// left behind by an older version are removed.
var iptablesCleanupOnlyChains = []iptablesJumpChain{
// Present in kube 1.6 - 1.9. Removed by #56164 in favor of kubeExternalServicesChain
// NOTE(review): the description of #56164 says kube 1.6 ran KUBE-SERVICES only
// from OUTPUT, which suggests this range may be 1.7 - 1.9 — confirm.
{utiliptables.TableFilter, kubeServicesChain, utiliptables.ChainInput, "kubernetes service portals"},
}
// CleanupLeftovers removes all iptables rules and chains created by the Proxier
// It returns true if an error was encountered. Errors are logged.
func CleanupLeftovers(ipt utiliptables.Interface) (encounteredError bool) {
// Unlink our chains
for _, chain := range iptablesJumpChains {
for _, chain := range append(iptablesJumpChains, iptablesCleanupOnlyChains...) {
args := []string{
"-m", "comment", "--comment", chain.comment,
"-j", string(chain.chain),
@ -504,7 +512,7 @@ func CleanupLeftovers(ipt utiliptables.Interface) (encounteredError bool) {
filterChains := bytes.NewBuffer(nil)
filterRules := bytes.NewBuffer(nil)
writeLine(filterChains, "*filter")
for _, chain := range []utiliptables.Chain{kubeServicesChain, kubeForwardChain} {
for _, chain := range []utiliptables.Chain{kubeServicesChain, kubeExternalServicesChain, kubeForwardChain} {
if _, found := existingFilterChains[chain]; found {
chainString := string(chain)
writeLine(filterChains, existingFilterChains[chain])
@ -770,7 +778,7 @@ func (proxier *Proxier) syncProxyRules() {
// Make sure we keep stats for the top-level chains, if they existed
// (which most should have because we created them above).
for _, chainName := range []utiliptables.Chain{kubeServicesChain, kubeForwardChain} {
for _, chainName := range []utiliptables.Chain{kubeServicesChain, kubeExternalServicesChain, kubeForwardChain} {
if chain, ok := existingFilterChains[chainName]; ok {
writeLine(proxier.filterChains, chain)
} else {
@ -941,7 +949,7 @@ func (proxier *Proxier) syncProxyRules() {
// Install ICMP Reject rule in filter table for destination=externalIP and dport=svcport
if len(proxier.endpointsMap[svcName]) == 0 {
writeLine(proxier.filterRules,
"-A", string(kubeServicesChain),
"-A", string(kubeExternalServicesChain),
"-m", "comment", "--comment", fmt.Sprintf(`"%s has no endpoints"`, svcNameString),
"-m", protocol, "-p", protocol,
"-d", utilproxy.ToCIDR(net.ParseIP(externalIP)),
@ -1082,7 +1090,7 @@ func (proxier *Proxier) syncProxyRules() {
// chain.
if len(proxier.endpointsMap[svcName]) == 0 {
writeLine(proxier.filterRules,
"-A", string(kubeServicesChain),
"-A", string(kubeExternalServicesChain),
"-m", "comment", "--comment", fmt.Sprintf(`"%s has no endpoints"`, svcNameString),
"-m", "addrtype", "--dst-type", "LOCAL",
"-m", protocol, "-p", protocol,

View File

@ -807,7 +807,7 @@ func TestExternalIPsReject(t *testing.T) {
fp.syncProxyRules()
kubeSvcRules := ipt.GetRules(string(kubeServicesChain))
kubeSvcRules := ipt.GetRules(string(kubeExternalServicesChain))
if !hasJump(kubeSvcRules, iptablestest.Reject, svcExternalIPs, svcPort) {
errorf(fmt.Sprintf("Failed to a %v rule for externalIP %v with no endpoints", iptablestest.Reject, svcPortName), kubeSvcRules, t)
}
@ -840,7 +840,7 @@ func TestNodePortReject(t *testing.T) {
fp.syncProxyRules()
kubeSvcRules := ipt.GetRules(string(kubeServicesChain))
kubeSvcRules := ipt.GetRules(string(kubeExternalServicesChain))
if !hasJump(kubeSvcRules, iptablestest.Reject, svcIP, svcNodePort) {
errorf(fmt.Sprintf("Failed to find a %v rule for service %v with no endpoints", iptablestest.Reject, svcPortName), kubeSvcRules, t)
}