From 58407ca8a2c37b798be99dc91c5b871f4a18a23a Mon Sep 17 00:00:00 2001 From: James DeFelice Date: Tue, 5 Jan 2016 22:45:34 +0000 Subject: [PATCH] avoid updating nf_conntrack-related settings, by default, when running k8s on mesos --- contrib/mesos/docs/issues.md | 10 +++ contrib/mesos/pkg/minion/server.go | 20 +++--- .../mesos/pkg/scheduler/service/service.go | 72 +++++++++++-------- 3 files changed, 63 insertions(+), 39 deletions(-) diff --git a/contrib/mesos/docs/issues.md b/contrib/mesos/docs/issues.md index 1ee287af68f..5730127de60 100644 --- a/contrib/mesos/docs/issues.md +++ b/contrib/mesos/docs/issues.md @@ -22,6 +22,16 @@ It is **strongly** recommended that all of the Kubernetes-Mesos executors are de Not following the above steps prior to upgrading the scheduler can result in a cluster wherein pods will never again be scheduled upon one or more nodes. This issue is being tracked here: https://github.com/mesosphere/kubernetes-mesos/issues/572. +### Netfilter Connection Tracking + +The scheduler offers flags to tweak connection tracking for kube-proxy instances that are launched on slave nodes: + +- conntrack-max (do **NOT** set this to a non-zero value if the Mesos slave process is running in a non-root network namespace) +- conntrack-tcp-timeout-established + +By default both of these are set to 0 when running Kubernetes-Mesos; a value of 0 leaves the corresponding setting on the slave as-is. +Setting either of these flags to non-zero values may impact connection tracking for the entire slave.
+ ### Port Specifications In order for pods (replicated, or otherwise) to be scheduled on the cluster, it is strongly recommended that: diff --git a/contrib/mesos/pkg/minion/server.go b/contrib/mesos/pkg/minion/server.go index 7db66981a11..6c70e046bf6 100644 --- a/contrib/mesos/pkg/minion/server.go +++ b/contrib/mesos/pkg/minion/server.go @@ -23,6 +23,7 @@ import ( "os" "os/signal" "path" + "strconv" "strings" "syscall" @@ -66,10 +67,12 @@ type MinionServer struct { logMaxAgeInDays int logVerbosity int32 // see glog.Level - runProxy bool - proxyLogV int - proxyBindall bool - proxyMode string + runProxy bool + proxyLogV int + proxyBindall bool + proxyMode string + conntrackMax int + conntrackTCPTimeoutEstablished int } // NewMinionServer creates the MinionServer struct with default values to be used by hyperkube @@ -139,11 +142,8 @@ func (ms *MinionServer) launchProxyServer() { "--logtostderr=true", "--resource-container=" + path.Join("/", ms.mesosCgroup, "kube-proxy"), "--proxy-mode=" + ms.proxyMode, - // TODO(jdef) this is a temporary hack to fix failing smoke tests. a following PR - // will more properly fix the smoke tests as well as make these flags configrable - // at the framework level (as opposed to hardcoded here) - "--conntrack-max=0", - "--conntrack-tcp-timeout-established=0", + "--conntrack-max=" + strconv.Itoa(ms.conntrackMax), + "--conntrack-tcp-timeout-established=" + strconv.Itoa(ms.conntrackTCPTimeoutEstablished), } if ms.clientConfig.Host != "" { @@ -351,4 +351,6 @@ func (ms *MinionServer) AddMinionFlags(fs *pflag.FlagSet) { fs.IntVar(&ms.proxyLogV, "proxy-logv", ms.proxyLogV, "Log verbosity of the child kube-proxy.") fs.BoolVar(&ms.proxyBindall, "proxy-bindall", ms.proxyBindall, "When true will cause kube-proxy to bind to 0.0.0.0.") fs.StringVar(&ms.proxyMode, "proxy-mode", ms.proxyMode, "Which proxy mode to use: 'userspace' (older) or 'iptables' (faster). 
If the iptables proxy is selected, regardless of how, but the system's kernel or iptables versions are insufficient, this always falls back to the userspace proxy.") + fs.IntVar(&ms.conntrackMax, "conntrack-max", ms.conntrackMax, "Maximum number of NAT connections to track on agent nodes (0 to leave as-is)") + fs.IntVar(&ms.conntrackTCPTimeoutEstablished, "conntrack-tcp-timeout-established", ms.conntrackTCPTimeoutEstablished, "Idle timeout for established TCP connections on agent nodes (0 to leave as-is)") } diff --git a/contrib/mesos/pkg/scheduler/service/service.go b/contrib/mesos/pkg/scheduler/service/service.go index fc71ad4ff87..7fade99abaa 100644 --- a/contrib/mesos/pkg/scheduler/service/service.go +++ b/contrib/mesos/pkg/scheduler/service/service.go @@ -133,36 +133,38 @@ type SchedulerServer struct { minionLogMaxBackups int minionLogMaxAgeInDays int - mesosAuthProvider string - driverPort uint - hostnameOverride string - reconcileInterval int64 - reconcileCooldown time.Duration - defaultContainerCPULimit mresource.CPUShares - defaultContainerMemLimit mresource.MegaBytes - schedulerConfigFileName string - graceful bool - frameworkName string - frameworkWebURI string - ha bool - advertisedAddress string - serviceAddress net.IP - haDomain string - kmPath string - clusterDNS net.IP - clusterDomain string - kubeletRootDirectory string - kubeletDockerEndpoint string - kubeletPodInfraContainerImage string - kubeletCadvisorPort uint - kubeletHostNetworkSources string - kubeletSyncFrequency time.Duration - kubeletNetworkPluginName string - staticPodsConfigPath string - dockerCfgPath string - containPodResources bool - nodeRelistPeriod time.Duration - sandboxOverlay string + mesosAuthProvider string + driverPort uint + hostnameOverride string + reconcileInterval int64 + reconcileCooldown time.Duration + defaultContainerCPULimit mresource.CPUShares + defaultContainerMemLimit mresource.MegaBytes + schedulerConfigFileName string + graceful bool + frameworkName string + 
frameworkWebURI string + ha bool + advertisedAddress string + serviceAddress net.IP + haDomain string + kmPath string + clusterDNS net.IP + clusterDomain string + kubeletRootDirectory string + kubeletDockerEndpoint string + kubeletPodInfraContainerImage string + kubeletCadvisorPort uint + kubeletHostNetworkSources string + kubeletSyncFrequency time.Duration + kubeletNetworkPluginName string + staticPodsConfigPath string + dockerCfgPath string + containPodResources bool + nodeRelistPeriod time.Duration + sandboxOverlay string + conntrackMax int + conntrackTCPTimeoutEstablished int executable string // path to the binary running this service client *client.Client @@ -216,6 +218,12 @@ func NewSchedulerServer() *SchedulerServer { kubeletEnableDebuggingHandlers: true, containPodResources: true, nodeRelistPeriod: defaultNodeRelistPeriod, + conntrackTCPTimeoutEstablished: 0, // non-zero values may require hand-tuning other sysctls on the host; do so with caution + + // non-zero values can trigger failures when updating /sys/module/nf_conntrack/parameters/hashsize + // when kube-proxy is running in a non-root netns (i.e. not init_net); setting this to a non-zero value will + // impact connection tracking for the entire host on which kube-proxy is running. xref (k8s#19182) + conntrackMax: 0, } // cache this for later use. also useful in case the original binary gets deleted, e.g. // during upgrades, development deployments, etc.
@@ -294,6 +302,8 @@ func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) { fs.DurationVar(&s.kubeletSyncFrequency, "kubelet-sync-frequency", s.kubeletSyncFrequency, "Max period between synchronizing running containers and config") fs.StringVar(&s.kubeletNetworkPluginName, "kubelet-network-plugin", s.kubeletNetworkPluginName, " The name of the network plugin to be invoked for various events in kubelet/pod lifecycle") fs.BoolVar(&s.kubeletEnableDebuggingHandlers, "kubelet-enable-debugging-handlers", s.kubeletEnableDebuggingHandlers, "Enables kubelet endpoints for log collection and local running of containers and commands") + fs.IntVar(&s.conntrackMax, "conntrack-max", s.conntrackMax, "Maximum number of NAT connections to track on agent nodes (0 to leave as-is)") + fs.IntVar(&s.conntrackTCPTimeoutEstablished, "conntrack-tcp-timeout-established", s.conntrackTCPTimeoutEstablished, "Idle timeout for established TCP connections on agent nodes (0 to leave as-is)") //TODO(jdef) support this flag once we have a better handle on mesos-dns and k8s DNS integration //fs.StringVar(&s.HADomain, "ha-domain", s.HADomain, "Domain of the HA scheduler service, only used in HA mode. If specified may be used to construct artifact download URIs.") @@ -413,6 +423,8 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E ci.Arguments = append(ci.Arguments, fmt.Sprintf("--sync-frequency=%v", s.kubeletSyncFrequency)) ci.Arguments = append(ci.Arguments, fmt.Sprintf("--contain-pod-resources=%t", s.containPodResources)) ci.Arguments = append(ci.Arguments, fmt.Sprintf("--enable-debugging-handlers=%t", s.kubeletEnableDebuggingHandlers)) + ci.Arguments = append(ci.Arguments, fmt.Sprintf("--conntrack-max=%d", s.conntrackMax)) + ci.Arguments = append(ci.Arguments, fmt.Sprintf("--conntrack-tcp-timeout-established=%d", s.conntrackTCPTimeoutEstablished)) if s.authPath != "" { //TODO(jdef) should probably support non-local files, e.g. hdfs:///some/config/file