avoid updating nf_conntrack-related settings, by default, when running k8s on mesos

This commit is contained in:
James DeFelice 2016-01-05 22:45:34 +00:00
parent e663dbc302
commit 58407ca8a2
3 changed files with 63 additions and 39 deletions

View File

@ -22,6 +22,16 @@ It is **strongly** recommended that all of the Kubernetes-Mesos executors are de
Not following the above steps prior to upgrading the scheduler can result in a cluster wherein pods will never again be scheduled upon one or more nodes. Not following the above steps prior to upgrading the scheduler can result in a cluster wherein pods will never again be scheduled upon one or more nodes.
This issue is being tracked here: https://github.com/mesosphere/kubernetes-mesos/issues/572. This issue is being tracked here: https://github.com/mesosphere/kubernetes-mesos/issues/572.
### Netfilter Connection Tracking
The scheduler offers flags to tweak connection tracking for kube-proxy instances that are launched on slave nodes:
- conntrack-max (do **NOT** set this to a non-zero value if the Mesos slave process is running in a non-root network namespace)
- conntrack-tcp-timeout-established
By default, both of these are set to 0 when running Kubernetes-Mesos.
Setting either of these flags to a non-zero value may impact connection tracking for the entire slave.
### Port Specifications ### Port Specifications
In order for pods (replicated, or otherwise) to be scheduled on the cluster, it is strongly recommended that: In order for pods (replicated, or otherwise) to be scheduled on the cluster, it is strongly recommended that:

View File

@ -23,6 +23,7 @@ import (
"os" "os"
"os/signal" "os/signal"
"path" "path"
"strconv"
"strings" "strings"
"syscall" "syscall"
@ -66,10 +67,12 @@ type MinionServer struct {
logMaxAgeInDays int logMaxAgeInDays int
logVerbosity int32 // see glog.Level logVerbosity int32 // see glog.Level
runProxy bool runProxy bool
proxyLogV int proxyLogV int
proxyBindall bool proxyBindall bool
proxyMode string proxyMode string
conntrackMax int
conntrackTCPTimeoutEstablished int
} }
// NewMinionServer creates the MinionServer struct with default values to be used by hyperkube // NewMinionServer creates the MinionServer struct with default values to be used by hyperkube
@ -139,11 +142,8 @@ func (ms *MinionServer) launchProxyServer() {
"--logtostderr=true", "--logtostderr=true",
"--resource-container=" + path.Join("/", ms.mesosCgroup, "kube-proxy"), "--resource-container=" + path.Join("/", ms.mesosCgroup, "kube-proxy"),
"--proxy-mode=" + ms.proxyMode, "--proxy-mode=" + ms.proxyMode,
// TODO(jdef) this is a temporary hack to fix failing smoke tests. a following PR "--conntrack-max=" + strconv.Itoa(ms.conntrackMax),
// will more properly fix the smoke tests as well as make these flags configrable "--conntrack-tcp-timeout-established=" + strconv.Itoa(ms.conntrackTCPTimeoutEstablished),
// at the framework level (as opposed to hardcoded here)
"--conntrack-max=0",
"--conntrack-tcp-timeout-established=0",
} }
if ms.clientConfig.Host != "" { if ms.clientConfig.Host != "" {
@ -351,4 +351,6 @@ func (ms *MinionServer) AddMinionFlags(fs *pflag.FlagSet) {
fs.IntVar(&ms.proxyLogV, "proxy-logv", ms.proxyLogV, "Log verbosity of the child kube-proxy.") fs.IntVar(&ms.proxyLogV, "proxy-logv", ms.proxyLogV, "Log verbosity of the child kube-proxy.")
fs.BoolVar(&ms.proxyBindall, "proxy-bindall", ms.proxyBindall, "When true will cause kube-proxy to bind to 0.0.0.0.") fs.BoolVar(&ms.proxyBindall, "proxy-bindall", ms.proxyBindall, "When true will cause kube-proxy to bind to 0.0.0.0.")
fs.StringVar(&ms.proxyMode, "proxy-mode", ms.proxyMode, "Which proxy mode to use: 'userspace' (older) or 'iptables' (faster). If the iptables proxy is selected, regardless of how, but the system's kernel or iptables versions are insufficient, this always falls back to the userspace proxy.") fs.StringVar(&ms.proxyMode, "proxy-mode", ms.proxyMode, "Which proxy mode to use: 'userspace' (older) or 'iptables' (faster). If the iptables proxy is selected, regardless of how, but the system's kernel or iptables versions are insufficient, this always falls back to the userspace proxy.")
fs.IntVar(&ms.conntrackMax, "conntrack-max", ms.conntrackMax, "Maximum number of NAT connections to track on agent nodes (0 to leave as-is)")
fs.IntVar(&ms.conntrackTCPTimeoutEstablished, "conntrack-tcp-timeout-established", ms.conntrackTCPTimeoutEstablished, "Idle timeout for established TCP connections on agent nodes (0 to leave as-is)")
} }

View File

@ -133,36 +133,38 @@ type SchedulerServer struct {
minionLogMaxBackups int minionLogMaxBackups int
minionLogMaxAgeInDays int minionLogMaxAgeInDays int
mesosAuthProvider string mesosAuthProvider string
driverPort uint driverPort uint
hostnameOverride string hostnameOverride string
reconcileInterval int64 reconcileInterval int64
reconcileCooldown time.Duration reconcileCooldown time.Duration
defaultContainerCPULimit mresource.CPUShares defaultContainerCPULimit mresource.CPUShares
defaultContainerMemLimit mresource.MegaBytes defaultContainerMemLimit mresource.MegaBytes
schedulerConfigFileName string schedulerConfigFileName string
graceful bool graceful bool
frameworkName string frameworkName string
frameworkWebURI string frameworkWebURI string
ha bool ha bool
advertisedAddress string advertisedAddress string
serviceAddress net.IP serviceAddress net.IP
haDomain string haDomain string
kmPath string kmPath string
clusterDNS net.IP clusterDNS net.IP
clusterDomain string clusterDomain string
kubeletRootDirectory string kubeletRootDirectory string
kubeletDockerEndpoint string kubeletDockerEndpoint string
kubeletPodInfraContainerImage string kubeletPodInfraContainerImage string
kubeletCadvisorPort uint kubeletCadvisorPort uint
kubeletHostNetworkSources string kubeletHostNetworkSources string
kubeletSyncFrequency time.Duration kubeletSyncFrequency time.Duration
kubeletNetworkPluginName string kubeletNetworkPluginName string
staticPodsConfigPath string staticPodsConfigPath string
dockerCfgPath string dockerCfgPath string
containPodResources bool containPodResources bool
nodeRelistPeriod time.Duration nodeRelistPeriod time.Duration
sandboxOverlay string sandboxOverlay string
conntrackMax int
conntrackTCPTimeoutEstablished int
executable string // path to the binary running this service executable string // path to the binary running this service
client *client.Client client *client.Client
@ -216,6 +218,12 @@ func NewSchedulerServer() *SchedulerServer {
kubeletEnableDebuggingHandlers: true, kubeletEnableDebuggingHandlers: true,
containPodResources: true, containPodResources: true,
nodeRelistPeriod: defaultNodeRelistPeriod, nodeRelistPeriod: defaultNodeRelistPeriod,
conntrackTCPTimeoutEstablished: 0, // non-zero values may require hand-tuning other sysctl's on the host; do so with caution
// non-zero values can trigger failures when updating /sys/module/nf_conntrack/parameters/hashsize
// when kube-proxy is running in a non-root netns (i.e. not init_net); setting this to a non-zero value will
// impact connection tracking for the entire host on which kube-proxy is running. xref (k8s#19182)
conntrackMax: 0,
} }
// cache this for later use. also useful in case the original binary gets deleted, e.g. // cache this for later use. also useful in case the original binary gets deleted, e.g.
// during upgrades, development deployments, etc. // during upgrades, development deployments, etc.
@ -294,6 +302,8 @@ func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) {
fs.DurationVar(&s.kubeletSyncFrequency, "kubelet-sync-frequency", s.kubeletSyncFrequency, "Max period between synchronizing running containers and config") fs.DurationVar(&s.kubeletSyncFrequency, "kubelet-sync-frequency", s.kubeletSyncFrequency, "Max period between synchronizing running containers and config")
fs.StringVar(&s.kubeletNetworkPluginName, "kubelet-network-plugin", s.kubeletNetworkPluginName, "<Warning: Alpha feature> The name of the network plugin to be invoked for various events in kubelet/pod lifecycle") fs.StringVar(&s.kubeletNetworkPluginName, "kubelet-network-plugin", s.kubeletNetworkPluginName, "<Warning: Alpha feature> The name of the network plugin to be invoked for various events in kubelet/pod lifecycle")
fs.BoolVar(&s.kubeletEnableDebuggingHandlers, "kubelet-enable-debugging-handlers", s.kubeletEnableDebuggingHandlers, "Enables kubelet endpoints for log collection and local running of containers and commands") fs.BoolVar(&s.kubeletEnableDebuggingHandlers, "kubelet-enable-debugging-handlers", s.kubeletEnableDebuggingHandlers, "Enables kubelet endpoints for log collection and local running of containers and commands")
fs.IntVar(&s.conntrackMax, "conntrack-max", s.conntrackMax, "Maximum number of NAT connections to track on agent nodes (0 to leave as-is)")
fs.IntVar(&s.conntrackTCPTimeoutEstablished, "conntrack-tcp-timeout-established", s.conntrackTCPTimeoutEstablished, "Idle timeout for established TCP connections on agent nodes (0 to leave as-is)")
//TODO(jdef) support this flag once we have a better handle on mesos-dns and k8s DNS integration //TODO(jdef) support this flag once we have a better handle on mesos-dns and k8s DNS integration
//fs.StringVar(&s.HADomain, "ha-domain", s.HADomain, "Domain of the HA scheduler service, only used in HA mode. If specified may be used to construct artifact download URIs.") //fs.StringVar(&s.HADomain, "ha-domain", s.HADomain, "Domain of the HA scheduler service, only used in HA mode. If specified may be used to construct artifact download URIs.")
@ -413,6 +423,8 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--sync-frequency=%v", s.kubeletSyncFrequency)) ci.Arguments = append(ci.Arguments, fmt.Sprintf("--sync-frequency=%v", s.kubeletSyncFrequency))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--contain-pod-resources=%t", s.containPodResources)) ci.Arguments = append(ci.Arguments, fmt.Sprintf("--contain-pod-resources=%t", s.containPodResources))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--enable-debugging-handlers=%t", s.kubeletEnableDebuggingHandlers)) ci.Arguments = append(ci.Arguments, fmt.Sprintf("--enable-debugging-handlers=%t", s.kubeletEnableDebuggingHandlers))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--conntrack-max=%d", s.conntrackMax))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--conntrack-tcp-timeout-established=%d", s.conntrackTCPTimeoutEstablished))
if s.authPath != "" { if s.authPath != "" {
//TODO(jdef) should probably support non-local files, e.g. hdfs:///some/config/file //TODO(jdef) should probably support non-local files, e.g. hdfs:///some/config/file