Use ipset for SNAT and packet filtering in the IPVS proxy
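The gist of the change: instead of emitting one iptables rule per service, the IPVS proxier now records service addresses as entries in a handful of ipsets during each sync, and a fixed set of iptables rules matches against those sets. A minimal sketch of the pattern as it appears throughout the diff (the rule in the trailing comment is illustrative):

    // Record a service's ClusterIP:Port as an active entry in the cluster-IP set.
    entry := &utilipset.Entry{
        IP:       svcInfo.clusterIP.String(),
        Port:     svcInfo.port,
        Protocol: protocol,             // "tcp" or "udp"
        SetType:  utilipset.HashIPPort, // backed by an ipset of type hash:ip,port
    }
    proxier.clusterIPSet.activeEntries.Insert(entry.String())

    // After the sets are synced, a single rule covers every member, e.g.:
    //   -A KUBE-SERVICES -m set --match-set <clusterIPSet.Name> dst,dst -j KUBE-MARK-MASQ

This keeps the nat table at a roughly constant size regardless of the number of services.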
@@ -49,6 +49,7 @@ import (
    "k8s.io/kubernetes/pkg/proxy/metrics"
    utilproxy "k8s.io/kubernetes/pkg/proxy/util"
    "k8s.io/kubernetes/pkg/util/async"
    utilipset "k8s.io/kubernetes/pkg/util/ipset"
    utiliptables "k8s.io/kubernetes/pkg/util/iptables"
    utilipvs "k8s.io/kubernetes/pkg/util/ipvs"
    utilsysctl "k8s.io/kubernetes/pkg/util/sysctl"
@@ -59,6 +60,12 @@ const (
    // kubeServicesChain is the services portal chain
    kubeServicesChain utiliptables.Chain = "KUBE-SERVICES"

    // KubeServiceIPSetsChain is the services access IP chain
    KubeServiceIPSetsChain utiliptables.Chain = "KUBE-SVC-IPSETS"

    // KubeFireWallChain is the kubernetes firewall chain.
    KubeFireWallChain utiliptables.Chain = "KUBE-FIRE-WALL"

    // kubePostroutingChain is the kubernetes postrouting chain
    kubePostroutingChain utiliptables.Chain = "KUBE-POSTROUTING"

@@ -67,11 +74,10 @@ const (

    // KubeMarkDropChain is the mark-for-drop chain
    KubeMarkDropChain utiliptables.Chain = "KUBE-MARK-DROP"
)

const (
    // DefaultScheduler is the default ipvs scheduler algorithm - round robin.
    DefaultScheduler = "rr"

    // DefaultDummyDevice is the default dummy interface to which ipvs service addresses are bound.
    DefaultDummyDevice = "kube-ipvs0"
)
@@ -117,6 +123,7 @@ type Proxier struct {
    minSyncPeriod  time.Duration
    iptables       utiliptables.Interface
    ipvs           utilipvs.Interface
    ipset          utilipset.Interface
    exec           utilexec.Interface
    masqueradeAll  bool
    masqueradeMark string
@@ -137,6 +144,26 @@ type Proxier struct {
    natRules *bytes.Buffer
    // Added as a member to the struct to allow injection for testing.
    netlinkHandle NetLinkHandle
    // loopbackSet is the ipset that stores all endpoint IP:Port,IP entries, used to handle hairpin traffic.
    loopbackSet *IPSet
    // clusterIPSet is the ipset that stores all service ClusterIP:Port entries.
    clusterIPSet *IPSet
    // nodePortSetTCP is the bitmap:port type ipset that stores all TCP node ports.
    nodePortSetTCP *IPSet
    // nodePortSetUDP is the bitmap:port type ipset that stores all UDP node ports.
    nodePortSetUDP *IPSet
    // externalIPSet is the hash:ip,port type ipset that stores all service ExternalIP:Port entries.
    externalIPSet *IPSet
    // lbIngressSet is the hash:ip,port type ipset that stores all service load balancer ingress IP:Port entries.
    lbIngressSet *IPSet
    // lbMasqSet is the hash:ip,port type ipset that stores all service load balancer ingress IP:Port entries which need masquerade.
    lbMasqSet *IPSet
    // lbWhiteListIPSet is the hash:ip,port,ip type ipset that stores all service load balancer ingress IP:Port,sourceIP pairs; any packet
    // with a matching source IP visiting ingress IP:Port can pass through.
    lbWhiteListIPSet *IPSet
    // lbWhiteListCIDRSet is the hash:ip,port,net type ipset that stores all service load balancer ingress IP:Port,sourceCIDR pairs; any packet
    // from a matching source CIDR visiting ingress IP:Port can pass through.
    lbWhiteListCIDRSet *IPSet
}

// IPGetter helps get node network interface IP
@@ -184,7 +211,9 @@ var _ proxy.ProxyProvider = &Proxier{}
// An error will be returned if it fails to update or acquire the initial lock.
// Once a proxier is created, it will keep iptables and ipvs rules up to date in the background and
// will not terminate if a particular iptables or ipvs call fails.
func NewProxier(ipt utiliptables.Interface, ipvs utilipvs.Interface,
func NewProxier(ipt utiliptables.Interface,
    ipvs utilipvs.Interface,
    ipset utilipset.Interface,
    sysctl utilsysctl.Interface,
    exec utilexec.Interface,
    syncPeriod time.Duration,
@@ -248,32 +277,46 @@ func NewProxier(ipt utiliptables.Interface, ipvs utilipvs.Interface,

    healthChecker := healthcheck.NewServer(hostname, recorder, nil, nil) // use default implementations of deps

    isIPv6 := utilproxy.IsIPv6(nodeIP)

    glog.V(2).Infof("nodeIP: %v, isIPv6: %v", nodeIP, isIPv6)

    proxier := &Proxier{
        portsMap:         make(map[utilproxy.LocalPort]utilproxy.Closeable),
        serviceMap:       make(proxyServiceMap),
        serviceChanges:   newServiceChangeMap(),
        endpointsMap:     make(proxyEndpointsMap),
        endpointsChanges: newEndpointsChangeMap(hostname),
        syncPeriod:       syncPeriod,
        minSyncPeriod:    minSyncPeriod,
        iptables:         ipt,
        masqueradeAll:    masqueradeAll,
        masqueradeMark:   masqueradeMark,
        exec:             exec,
        clusterCIDR:      clusterCIDR,
        hostname:         hostname,
        nodeIP:           nodeIP,
        portMapper:       &listenPortOpener{},
        recorder:         recorder,
        healthChecker:    healthChecker,
        healthzServer:    healthzServer,
        ipvs:             ipvs,
        ipvsScheduler:    scheduler,
        ipGetter:         &realIPGetter{},
        iptablesData:     bytes.NewBuffer(nil),
        natChains:        bytes.NewBuffer(nil),
        natRules:         bytes.NewBuffer(nil),
        netlinkHandle:    NewNetLinkHandle(),
        portsMap:           make(map[utilproxy.LocalPort]utilproxy.Closeable),
        serviceMap:         make(proxyServiceMap),
        serviceChanges:     newServiceChangeMap(),
        endpointsMap:       make(proxyEndpointsMap),
        endpointsChanges:   newEndpointsChangeMap(hostname),
        syncPeriod:         syncPeriod,
        minSyncPeriod:      minSyncPeriod,
        iptables:           ipt,
        masqueradeAll:      masqueradeAll,
        masqueradeMark:     masqueradeMark,
        exec:               exec,
        clusterCIDR:        clusterCIDR,
        hostname:           hostname,
        nodeIP:             nodeIP,
        portMapper:         &listenPortOpener{},
        recorder:           recorder,
        healthChecker:      healthChecker,
        healthzServer:      healthzServer,
        ipvs:               ipvs,
        ipvsScheduler:      scheduler,
        ipGetter:           &realIPGetter{},
        iptablesData:       bytes.NewBuffer(nil),
        natChains:          bytes.NewBuffer(nil),
        natRules:           bytes.NewBuffer(nil),
        netlinkHandle:      NewNetLinkHandle(),
        ipset:              ipset,
        loopbackSet:        NewIPSet(ipset, KubeLoopBackIPSet, utilipset.HashIPPortIP, isIPv6),
        clusterIPSet:       NewIPSet(ipset, KubeClusterIPSet, utilipset.HashIPPort, isIPv6),
        externalIPSet:      NewIPSet(ipset, KubeExternalIPSet, utilipset.HashIPPort, isIPv6),
        lbIngressSet:       NewIPSet(ipset, KubeLoadBalancerSet, utilipset.HashIPPort, isIPv6),
        lbMasqSet:          NewIPSet(ipset, KubeLoadBalancerMasqSet, utilipset.HashIPPort, isIPv6),
        lbWhiteListIPSet:   NewIPSet(ipset, KubeLoadBalancerSourceIPSet, utilipset.HashIPPortIP, isIPv6),
        lbWhiteListCIDRSet: NewIPSet(ipset, KubeLoadBalancerSourceCIDRSet, utilipset.HashIPPortNet, isIPv6),
        nodePortSetTCP:     NewIPSet(ipset, KubeNodePortSetTCP, utilipset.BitmapPort, false),
        nodePortSetUDP:     NewIPSet(ipset, KubeNodePortSetUDP, utilipset.BitmapPort, false),
    }
    burstSyncs := 2
    glog.V(3).Infof("minSyncPeriod: %v, syncPeriod: %v, burstSyncs: %d", minSyncPeriod, syncPeriod, burstSyncs)
@@ -485,6 +528,11 @@ func (e *endpointsInfo) IPPart() string {
    return utilproxy.IPPart(e.endpoint)
}

// PortPart returns just the Port part of the endpoint.
func (e *endpointsInfo) PortPart() (int, error) {
    return utilproxy.PortPart(e.endpoint)
}
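IPPart and PortPart split the stored endpoint string, which has the joined "IP:port" form. Roughly equivalent standard-library code (illustrative only; the real utilproxy helpers also handle edge cases such as endpoints without a port):

    // e.endpoint is e.g. "10.244.1.5:8080" (hypothetical value).
    host, portStr, _ := net.SplitHostPort("10.244.1.5:8080") // host = "10.244.1.5"
    port, _ := strconv.Atoi(portStr)                         // port = 8080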

type endpointServicePair struct {
    endpoint        string
    servicePortName proxy.ServicePortName
@@ -652,7 +700,7 @@ func (em proxyEndpointsMap) unmerge(other proxyEndpointsMap) {
// This is determined by checking if all the required kernel modules can be loaded. It may
// return an error if it fails to get the kernel modules information, in which
// case it will also return false.
func CanUseIPVSProxier() (bool, error) {
func CanUseIPVSProxier(ipsetver IPSetVersioner) (bool, error) {
    // Try to load IPVS required kernel modules using modprobe
    for _, kmod := range ipvsModules {
        err := utilexec.New().Command("modprobe", "--", kmod).Run()
@@ -677,6 +725,15 @@ func CanUseIPVSProxier() (bool, error) {
    if len(modules) != 0 {
        return false, fmt.Errorf("IPVS proxier will not be used because the following required kernel modules are not loaded: %v", modules)
    }

    // Check ipset version
    versionString, err := ipsetver.GetVersion()
    if err != nil {
        return false, fmt.Errorf("error getting ipset version, error: %v", err)
    }
    if !checkMinVersion(versionString) {
        return false, fmt.Errorf("ipset version: %s is less than min required version: %s", versionString, MinIPSetCheckVersion)
    }
    return true, nil
}
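Because CanUseIPVSProxier now takes the version probe as a parameter, it can be exercised in tests without a real ipset binary. A minimal sketch, assuming IPSetVersioner is the one-method interface implied by the GetVersion() call above (the fake type is hypothetical):

    type fakeIPSetVersioner struct {
        version string
        err     error
    }

    func (f *fakeIPSetVersioner) GetVersion() (string, error) { return f.version, f.err }

    // ok, err := CanUseIPVSProxier(&fakeIPSetVersioner{version: "6.19"})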

@@ -728,7 +785,7 @@ func cleanupIptablesLeftovers(ipt utiliptables.Interface) (encounteredError bool
    natRules := bytes.NewBuffer(nil)
    writeLine(natChains, "*nat")
    // Start with chains we know we need to remove.
    for _, chain := range []utiliptables.Chain{kubeServicesChain, kubePostroutingChain, KubeMarkMasqChain} {
    for _, chain := range []utiliptables.Chain{kubeServicesChain, kubePostroutingChain, KubeMarkMasqChain, KubeServiceIPSetsChain} {
        if _, found := existingNATChains[chain]; found {
            chainString := string(chain)
            writeLine(natChains, existingNATChains[chain]) // flush
@@ -748,7 +805,7 @@ func cleanupIptablesLeftovers(ipt utiliptables.Interface) (encounteredError bool
}

// CleanupLeftovers cleans up all ipvs and iptables rules created by the ipvs Proxier.
func CleanupLeftovers(ipvs utilipvs.Interface, ipt utiliptables.Interface) (encounteredError bool) {
func CleanupLeftovers(ipvs utilipvs.Interface, ipt utiliptables.Interface, ipset utilipset.Interface) (encounteredError bool) {
    // Return immediately when the ipvs interface is nil - probably initialization failed somewhere.
    if ipvs == nil {
        return true
@@ -768,6 +825,16 @@ func CleanupLeftovers(ipvs utilipvs.Interface, ipt utiliptables.Interface) (enco
    }
    // Clear iptables created by ipvs Proxier.
    encounteredError = cleanupIptablesLeftovers(ipt) || encounteredError
    // Destroy ip sets created by ipvs Proxier. We should call it after cleaning up
    // iptables since we can NOT delete an ip set which is still referenced by iptables.
    ipSetsToDestroy := []string{KubeLoopBackIPSet, KubeClusterIPSet, KubeLoadBalancerSet, KubeNodePortSetTCP, KubeNodePortSetUDP,
        KubeExternalIPSet, KubeLoadBalancerSourceIPSet, KubeLoadBalancerSourceCIDRSet, KubeLoadBalancerMasqSet}
    for _, set := range ipSetsToDestroy {
        err = ipset.DestroySet(set)
        if err != nil {
            encounteredError = true
        }
    }
    return encounteredError
}
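Call sites of CleanupLeftovers must now supply an ipset handle as well, so the leftover sets can be destroyed once nothing references them. A sketch of the updated call (handle construction elided; variable names assumed):

    // ipvsInterface, iptInterface and ipsetInterface are assumed to have been
    // built from pkg/util/ipvs, pkg/util/iptables and pkg/util/ipset.
    if encounteredError := CleanupLeftovers(ipvsInterface, iptInterface, ipsetInterface); encounteredError {
        glog.Errorf("Encountered error while cleaning up ipvs leftovers")
    }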

@@ -957,6 +1024,16 @@ func (proxier *Proxier) syncProxyRules() {
        return
    }

    // Make sure the ip sets exist in the system.
    ipSets := []*IPSet{proxier.loopbackSet, proxier.clusterIPSet, proxier.externalIPSet, proxier.nodePortSetUDP, proxier.nodePortSetTCP,
        proxier.lbIngressSet, proxier.lbMasqSet, proxier.lbWhiteListCIDRSet, proxier.lbWhiteListIPSet}
    if err := ensureIPSets(ipSets...); err != nil {
        return
    }
    for i := range ipSets {
        ipSets[i].resetEntries()
    }

    // Accumulate the set of local ports that we will be holding open once this update is complete
    replacementPortsMap := map[utilproxy.LocalPort]utilproxy.Closeable{}
    // activeIPVSServices represents IPVS services successfully created in this round of sync
@@ -976,6 +1053,17 @@ func (proxier *Proxier) syncProxyRules() {
    // is just for efficiency, not correctness.
    args := make([]string, 64)

    // Kube service portal
    if err := proxier.linkKubeServiceChain(existingNATChains, proxier.natChains); err != nil {
        glog.Errorf("Failed to link KUBE-SERVICES chain: %v", err)
        return
    }
    // Kube service ipset
    if err := proxier.createKubeFireWallChain(existingNATChains, proxier.natChains); err != nil {
        glog.Errorf("Failed to create KUBE-FIRE-WALL chain: %v", err)
        return
    }

    // Build IPVS rules for each service.
    for svcName, svcInfo := range proxier.serviceMap {
        protocol := strings.ToLower(string(svcInfo.protocol))

@@ -983,7 +1071,41 @@ func (proxier *Proxier) syncProxyRules() {
        // to ServicePortName.String() show up in CPU profiles.
        svcNameString := svcName.String()

        // Handle traffic that loops back to the originator with SNAT.
        for _, ep := range proxier.endpointsMap[svcName] {
            epIP := ep.IPPart()
            epPort, err := ep.PortPart()
            // Error parsing this endpoint has been logged. Skip to next endpoint.
            if epIP == "" || err != nil {
                continue
            }
            entry := &utilipset.Entry{
                IP:       epIP,
                Port:     epPort,
                Protocol: protocol,
                IP2:      epIP,
                SetType:  utilipset.HashIPPortIP,
            }
            proxier.loopbackSet.activeEntries.Insert(entry.String())
        }

        // Capture the clusterIP.
        // ipset call
        entry := &utilipset.Entry{
            IP:       svcInfo.clusterIP.String(),
            Port:     svcInfo.port,
            Protocol: protocol,
            SetType:  utilipset.HashIPPort,
        }
        // add service Cluster IP:Port to kubeServiceAccess ip set for the purpose of solving hairpin.
        // proxier.kubeServiceAccessSet.activeEntries.Insert(entry.String())
        // Install masquerade rules if 'masqueradeAll' or 'clusterCIDR' is specified.
        if proxier.masqueradeAll {
            proxier.clusterIPSet.activeEntries.Insert(entry.String())
        } else if len(proxier.clusterCIDR) > 0 {
            proxier.clusterIPSet.activeEntries.Insert(entry.String())
        }
        // ipvs call
        serv := &utilipvs.VirtualServer{
            Address: svcInfo.clusterIP,
            Port:    uint16(svcInfo.port),

@@ -1004,32 +1126,6 @@ func (proxier *Proxier) syncProxyRules() {
        } else {
            glog.Errorf("Failed to sync service: %v, err: %v", serv, err)
        }
        // Install masquerade rules if 'masqueradeAll' or 'clusterCIDR' is specified.
        args = append(args[:0],
            "-A", string(kubeServicesChain),
            "-m", "comment", "--comment", fmt.Sprintf(`"%s cluster IP"`, svcNameString),
            "-m", protocol, "-p", protocol,
            "-d", utilproxy.ToCIDR(svcInfo.clusterIP),
            "--dport", strconv.Itoa(svcInfo.port),
        )
        if proxier.masqueradeAll {
            err = proxier.linkKubeServiceChain(existingNATChains, proxier.natChains)
            if err != nil {
                glog.Errorf("Failed to link KUBE-SERVICES chain: %v", err)
            }
            writeLine(proxier.natRules, append(args, "-j", string(KubeMarkMasqChain))...)
        } else if len(proxier.clusterCIDR) > 0 {
            // This masquerades off-cluster traffic to a service VIP. The idea
            // is that you can establish a static route for your Service range,
            // routing to any node, and that node will bridge into the Service
            // for you. Since that might bounce off-node, we masquerade here.
            // If/when we support "Local" policy for VIPs, we should update this.
            err = proxier.linkKubeServiceChain(existingNATChains, proxier.natChains)
            if err != nil {
                glog.Errorf("Failed to link KUBE-SERVICES chain: %v", err)
            }
            writeLine(proxier.natRules, append(args, "! -s", proxier.clusterCIDR, "-j", string(KubeMarkMasqChain))...)
        }

        // Capture externalIPs.
        for _, externalIP := range svcInfo.externalIPs {
@@ -1064,6 +1160,17 @@ func (proxier *Proxier) syncProxyRules() {
            }
        } // We're holding the port, so it's OK to install IPVS rules.

        // ipset call
        entry := &utilipset.Entry{
            IP:       externalIP,
            Port:     svcInfo.port,
            Protocol: protocol,
            SetType:  utilipset.HashIPPort,
        }
        // We have to SNAT packets to external IPs.
        proxier.externalIPSet.activeEntries.Insert(entry.String())

        // ipvs call
        serv := &utilipvs.VirtualServer{
            Address: net.ParseIP(externalIP),
            Port:    uint16(svcInfo.port),
@@ -1088,25 +1195,39 @@ func (proxier *Proxier) syncProxyRules() {
        // Capture load-balancer ingress.
        for _, ingress := range svcInfo.loadBalancerStatus.Ingress {
            if ingress.IP != "" {
                // ipset call
                entry = &utilipset.Entry{
                    IP:       ingress.IP,
                    Port:     svcInfo.port,
                    Protocol: protocol,
                    SetType:  utilipset.HashIPPort,
                }
                // add service load balancer ingressIP:Port to kubeServiceAccess ip set for the purpose of solving hairpin.
                // proxier.kubeServiceAccessSet.activeEntries.Insert(entry.String())
                // If we are proxying globally, we need to masquerade in case we cross nodes.
                // If we are proxying only locally, we can retain the source IP.
                if !svcInfo.onlyNodeLocalEndpoints {
                    proxier.lbMasqSet.activeEntries.Insert(entry.String())
                }
                if len(svcInfo.loadBalancerSourceRanges) != 0 {
                    err = proxier.linkKubeServiceChain(existingNATChains, proxier.natChains)
                    if err != nil {
                        glog.Errorf("Failed to link KUBE-SERVICES chain: %v", err)
                    }
                    // The service firewall rules are created based on the ServiceSpec.loadBalancerSourceRanges field.
                    // This currently works for loadbalancers that preserve source IPs.
                    // For loadbalancers which direct traffic to the service NodePort, the firewall rules will not apply.
                    args = append(args[:0],
                        "-A", string(kubeServicesChain),
                        "-m", "comment", "--comment", fmt.Sprintf(`"%s loadbalancer IP"`, svcNameString),
                        "-m", string(svcInfo.protocol), "-p", string(svcInfo.protocol),
                        "-d", utilproxy.ToCIDR(net.ParseIP(ingress.IP)),
                        "--dport", fmt.Sprintf("%d", svcInfo.port),
                    )
                    proxier.lbIngressSet.activeEntries.Insert(entry.String())

                    allowFromNode := false
                    for _, src := range svcInfo.loadBalancerSourceRanges {
                        writeLine(proxier.natRules, append(args, "-s", src, "-j", "ACCEPT")...)
                        // ipset call
                        entry = &utilipset.Entry{
                            IP:       ingress.IP,
                            Port:     svcInfo.port,
                            Protocol: protocol,
                            Net:      src,
                            SetType:  utilipset.HashIPPortNet,
                        }
                        // enumerate all whitelisted source CIDRs
                        proxier.lbWhiteListCIDRSet.activeEntries.Insert(entry.String())

                        // ignore error because it has been validated
                        _, cidr, _ := net.ParseCIDR(src)
                        if cidr.Contains(proxier.nodeIP) {
@@ -1117,14 +1238,19 @@ func (proxier *Proxier) syncProxyRules() {
                    // loadbalancer's backend hosts. In this case, the request will not hit the loadbalancer but loop back directly.
                    // Need to add the following rule to allow requests on the host.
                    if allowFromNode {
                        writeLine(proxier.natRules, append(args, "-s", utilproxy.ToCIDR(net.ParseIP(ingress.IP)), "-j", "ACCEPT")...)
                        entry = &utilipset.Entry{
                            IP:       ingress.IP,
                            Port:     svcInfo.port,
                            Protocol: protocol,
                            IP2:      ingress.IP,
                            SetType:  utilipset.HashIPPortIP,
                        }
                        // enumerate all whitelisted source IPs
                        proxier.lbWhiteListIPSet.activeEntries.Insert(entry.String())
                    }

                    // If the packet was able to reach the end of firewall chain, then it did not get DNATed.
                    // It means the packet cannot go through the firewall, so DROP it.
                    writeLine(proxier.natRules, append(args, "-j", string(KubeMarkDropChain))...)
                }

                // ipvs call
                serv := &utilipvs.VirtualServer{
                    Address: net.ParseIP(ingress.IP),
                    Port:    uint16(svcInfo.port),
@@ -1170,12 +1296,33 @@ func (proxier *Proxier) syncProxyRules() {
            replacementPortsMap[lp] = socket
        } // We're holding the port, so it's OK to install ipvs rules.

        // Nodeports need SNAT, unless they're local.
        // ipset call
        if !svcInfo.onlyNodeLocalEndpoints {
            entry = &utilipset.Entry{
                // No need to provide ip info
                Port:     svcInfo.nodePort,
                Protocol: protocol,
                SetType:  utilipset.BitmapPort,
            }
            switch protocol {
            case "tcp":
                proxier.nodePortSetTCP.activeEntries.Insert(entry.String())
            case "udp":
                proxier.nodePortSetUDP.activeEntries.Insert(entry.String())
            default:
                // This should never be hit.
                glog.Errorf("Unsupported protocol type: %s", protocol)
            }
        }

        // Build ipvs kernel routes for each node ip address
        nodeIPs, err := proxier.ipGetter.NodeIPs()
        if err != nil {
            glog.Errorf("Failed to get node IP, err: %v", err)
        } else {
            for _, nodeIP := range nodeIPs {
                // ipvs call
                serv := &utilipvs.VirtualServer{
                    Address: nodeIP,
                    Port:    uint16(svcInfo.nodePort),
@@ -1200,6 +1347,119 @@ func (proxier *Proxier) syncProxyRules() {
        }
    }

    // sync ipset entries
    ipsetsToSync := []*IPSet{proxier.loopbackSet, proxier.clusterIPSet, proxier.lbIngressSet, proxier.lbMasqSet, proxier.nodePortSetTCP,
        proxier.nodePortSetUDP, proxier.externalIPSet, proxier.lbWhiteListIPSet, proxier.lbWhiteListCIDRSet}
    for i := range ipsetsToSync {
        ipsetsToSync[i].syncIPSetEntries()
    }

    // Tail-call the iptables rules for the ipsets, making sure iptables is called
    // only once per ip set in a single sync loop.
    if !proxier.loopbackSet.isEmpty() {
        args = append(args[:0],
            "-A", string(kubePostroutingChain),
            "-m", "set", "--match-set", proxier.loopbackSet.Name,
            "dst,dst,src",
        )
        writeLine(proxier.natRules, append(args, "-j", "MASQUERADE")...)
    }
    if !proxier.clusterIPSet.isEmpty() {
        args = append(args[:0],
            "-A", string(kubeServicesChain),
            "-m", "set", "--match-set", proxier.clusterIPSet.Name,
            "dst,dst",
        )
        if proxier.masqueradeAll {
            writeLine(proxier.natRules, append(args, "-j", string(KubeMarkMasqChain))...)
        } else if len(proxier.clusterCIDR) > 0 {
            // This masquerades off-cluster traffic to a service VIP. The idea
            // is that you can establish a static route for your Service range,
            // routing to any node, and that node will bridge into the Service
            // for you. Since that might bounce off-node, we masquerade here.
            // If/when we support "Local" policy for VIPs, we should update this.
            writeLine(proxier.natRules, append(args, "! -s", proxier.clusterCIDR, "-j", string(KubeMarkMasqChain))...)
        }
    }
    if !proxier.externalIPSet.isEmpty() {
        // Build masquerade rules for packets to external IPs.
        args = append(args[:0],
            "-A", string(kubeServicesChain),
            "-m", "set", "--match-set", proxier.externalIPSet.Name,
            "dst,dst",
        )
        writeLine(proxier.natRules, append(args, "-j", string(KubeMarkMasqChain))...)
        // Allow traffic for external IPs that does not come from a bridge (i.e. not from a container)
        // nor from a local process to be forwarded to the service.
        // This rule roughly translates to "all traffic from off-machine".
        // This is imperfect in the face of network plugins that might not use a bridge, but we can revisit that later.
        externalTrafficOnlyArgs := append(args,
            "-m", "physdev", "!", "--physdev-is-in",
            "-m", "addrtype", "!", "--src-type", "LOCAL")
        writeLine(proxier.natRules, append(externalTrafficOnlyArgs, "-j", "ACCEPT")...)
        dstLocalOnlyArgs := append(args, "-m", "addrtype", "--dst-type", "LOCAL")
        // Allow traffic bound for external IPs that happen to be recognized as local IPs to stay local.
        // This covers cases like GCE load-balancers which get added to the local routing table.
        writeLine(proxier.natRules, append(dstLocalOnlyArgs, "-j", "ACCEPT")...)
    }
    if !proxier.lbMasqSet.isEmpty() {
        // Build masquerade rules for packets that cross nodes to reach load balancer ingress IPs.
        args = append(args[:0],
            "-A", string(kubeServicesChain),
            "-m", "set", "--match-set", proxier.lbMasqSet.Name,
            "dst,dst",
        )
        writeLine(proxier.natRules, append(args, "-j", string(KubeMarkMasqChain))...)
    }
    if !proxier.lbWhiteListCIDRSet.isEmpty() || !proxier.lbWhiteListIPSet.isEmpty() {
        // link kube-services chain -> kube-fire-wall chain
        args := []string{"-m", "set", "--match-set", proxier.lbIngressSet.Name, "dst,dst", "-j", string(KubeFireWallChain)}
        if _, err := proxier.iptables.EnsureRule(utiliptables.Append, utiliptables.TableNAT, kubeServicesChain, args...); err != nil {
            glog.Errorf("Failed to ensure that ipset %s chain %s jumps to %s: %v", proxier.lbIngressSet.Name, kubeServicesChain, KubeFireWallChain, err)
        }
        if !proxier.lbWhiteListCIDRSet.isEmpty() {
            args = append(args[:0],
                "-A", string(KubeFireWallChain),
                "-m", "set", "--match-set", proxier.lbWhiteListCIDRSet.Name,
                "dst,dst,src",
            )
            writeLine(proxier.natRules, append(args, "-j", "ACCEPT")...)
        }
        if !proxier.lbWhiteListIPSet.isEmpty() {
            args = append(args[:0],
                "-A", string(KubeFireWallChain),
                "-m", "set", "--match-set", proxier.lbWhiteListIPSet.Name,
                "dst,dst,src",
            )
            writeLine(proxier.natRules, append(args, "-j", "ACCEPT")...)
        }
        args = append(args[:0],
            "-A", string(KubeFireWallChain),
        )
        // If the packet was able to reach the end of firewall chain, then it did not get DNATed.
        // It means the packet cannot go through the firewall, so mark it for DROP.
        writeLine(proxier.natRules, append(args, "-j", string(KubeMarkDropChain))...)
    }
    if !proxier.nodePortSetTCP.isEmpty() {
        // Build masquerade rules for packets that cross nodes to reach a nodeport.
        args = append(args[:0],
            "-A", string(kubeServicesChain),
            "-m", "tcp", "-p", "tcp",
            "-m", "set", "--match-set", proxier.nodePortSetTCP.Name,
            "dst",
        )
        writeLine(proxier.natRules, append(args, "-j", string(KubeMarkMasqChain))...)
    }
    if !proxier.nodePortSetUDP.isEmpty() {
        args = append(args[:0],
            "-A", string(kubeServicesChain),
            "-m", "udp", "-p", "udp",
            "-m", "set", "--match-set", proxier.nodePortSetUDP.Name,
            "dst",
        )
        writeLine(proxier.natRules, append(args, "-j", string(KubeMarkMasqChain))...)
    }

    // Write the end-of-table markers.
    writeLine(proxier.natRules, "COMMIT")
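Taken together, the writes above assemble a nat table along these lines (an approximate sketch in the listing style used elsewhere in this file; <...> stands for the runtime set names, and which rules appear depends on which sets are non-empty):

    // -A KUBE-SERVICES -m set --match-set <clusterIPSet> dst,dst ! -s <clusterCIDR> -j KUBE-MARK-MASQ
    // -A KUBE-SERVICES -m set --match-set <externalIPSet> dst,dst -j KUBE-MARK-MASQ
    // -A KUBE-SERVICES -m set --match-set <lbMasqSet> dst,dst -j KUBE-MARK-MASQ
    // -A KUBE-SERVICES -m set --match-set <lbIngressSet> dst,dst -j KUBE-FIRE-WALL
    // -A KUBE-FIRE-WALL -m set --match-set <lbWhiteListCIDRSet> dst,dst,src -j ACCEPT
    // -A KUBE-FIRE-WALL -m set --match-set <lbWhiteListIPSet> dst,dst,src -j ACCEPT
    // -A KUBE-FIRE-WALL -j KUBE-MARK-DROP
    // -A KUBE-SERVICES -m tcp -p tcp -m set --match-set <nodePortSetTCP> dst -j KUBE-MARK-MASQ
    // -A KUBE-POSTROUTING -m set --match-set <loopbackSet> dst,dst,src -j MASQUERADE
    // COMMIT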

@@ -1411,7 +1671,6 @@ func (proxier *Proxier) cleanLegacyService(atciveServices map[string]bool, curre
}

// linkKubeServiceChain will create chain KUBE-SERVICES and link the chain in PREROUTING and OUTPUT.
// If masqueradeAll, clusterCIDR, and LB source ranges are not specified, they won't be created.

// Chain PREROUTING (policy ACCEPT)
// target     prot opt source               destination
@@ -1451,6 +1710,55 @@ func (proxier *Proxier) linkKubeServiceChain(existingNATChains map[utiliptables.
    return nil
}

//// linkKubeIPSetsChain will create chain KUBE-SVC-IPSETS and link the chain in KUBE-SERVICES
//
//// Chain KUBE-SERVICES (policy ACCEPT)
//// target     prot opt source               destination
//// KUBE-SVC-IPSETS  all  --  0.0.0.0/0            0.0.0.0/0            match-set KUBE-SERVICE-ACCESS dst,dst
//
//// Chain KUBE-SVC-IPSETS (1 references)
//// target     prot opt source               destination
//// KUBE-MARK-MASQ  all  --  0.0.0.0/0            0.0.0.0/0            match-set KUBE-EXTERNAL-IP dst,dst
//// ACCEPT  all  --  0.0.0.0/0            0.0.0.0/0            match-set KUBE-EXTERNAL-IP dst,dst PHYSDEV match ! --physdev-is-in ADDRTYPE match src-type !LOCAL
//// ACCEPT  all  --  0.0.0.0/0            0.0.0.0/0            match-set KUBE-EXTERNAL-IP dst,dst ADDRTYPE match dst-type LOCAL
//// ...
//func (proxier *Proxier) linkKubeIPSetsChain(existingNATChains map[utiliptables.Chain]string, natChains *bytes.Buffer) error {
//    if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, KubeServiceIPSetsChain); err != nil {
//        return fmt.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableNAT, KubeServiceIPSetsChain, err)
//    }
//
//    // TODO: iptables comment message for ipset?
//    // The hash:ip,port type of sets require two src/dst parameters of the set match and SET target kernel modules.
//    args := []string{"-m", "set", "--match-set", proxier.kubeServiceAccessSet.Name, "dst,dst", "-j", string(KubeServiceIPSetsChain)}
//    if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableNAT, kubeServicesChain, args...); err != nil {
//        return fmt.Errorf("Failed to ensure that ipset %s chain %s jumps to %s: %v", proxier.kubeServiceAccessSet.Name, kubeServicesChain, KubeServiceIPSetsChain, err)
//    }
//
//    // equal to `iptables -t nat -N KUBE-SVC-IPSETS`
//    // write `:KUBE-SVC-IPSETS - [0:0]` in nat table
//    if chain, ok := existingNATChains[KubeServiceIPSetsChain]; ok {
//        writeLine(natChains, chain)
//    } else {
//        writeLine(natChains, utiliptables.MakeChainLine(KubeServiceIPSetsChain))
//    }
//    return nil
//}

func (proxier *Proxier) createKubeFireWallChain(existingNATChains map[utiliptables.Chain]string, natChains *bytes.Buffer) error {
    // `iptables -t nat -N KUBE-FIRE-WALL`
    if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, KubeFireWallChain); err != nil {
        return fmt.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableNAT, KubeFireWallChain, err)
    }

    // write `:KUBE-FIRE-WALL - [0:0]` in nat table
    if chain, ok := existingNATChains[KubeFireWallChain]; ok {
        writeLine(natChains, chain)
    } else {
        writeLine(natChains, utiliptables.MakeChainLine(KubeFireWallChain))
    }
    return nil
}

// Join all words with spaces, terminate with newline and write to buff.
func writeLine(buf *bytes.Buffer, words ...string) {
    // We avoid strings.Join for performance reasons.
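    // (The diff rendering is truncated at this point. For reference, the
    // remainder of this helper, as implemented by the iptables proxier it
    // mirrors, is sketched below; it is not part of the captured diff.)
    for i := range words {
        buf.WriteString(words[i])
        if i < len(words)-1 {
            buf.WriteByte(' ')
        } else {
            buf.WriteByte('\n')
        }
    }
}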