Merge pull request #19303 from mesosphere/jdef-expose-conntrack-flags-to-framework

Auto commit by PR queue bot
This commit is contained in:
k8s-merge-robot 2016-01-06 16:21:18 -08:00
commit a11d1bdc90
3 changed files with 63 additions and 39 deletions

View File

@ -22,6 +22,16 @@ It is **strongly** recommended that all of the Kubernetes-Mesos executors are de
Not following the above steps prior to upgrading the scheduler can result in a cluster wherein pods will never again be scheduled upon one or more nodes.
This issue is being tracked here: https://github.com/mesosphere/kubernetes-mesos/issues/572.
### Netfilter Connection Tracking
The scheduler offers flags to tweak connection tracking for kube-proxy instances that are launched on slave nodes:
- conntrack-max (do **NOT** set this to a non-zero value if the Mesos slave process is running in a non-root network namespace)
- conntrack-tcp-timeout-established
By default, both of these are set to 0 (which leaves the host's existing settings as-is) when running Kubernetes-Mesos.
Setting either of these flags to a non-zero value may impact connection tracking for the entire slave.
### Port Specifications
In order for pods (replicated, or otherwise) to be scheduled on the cluster, it is strongly recommended that:

View File

@ -23,6 +23,7 @@ import (
"os"
"os/signal"
"path"
"strconv"
"strings"
"syscall"
@ -66,10 +67,12 @@ type MinionServer struct {
logMaxAgeInDays int
logVerbosity int32 // see glog.Level
runProxy bool
proxyLogV int
proxyBindall bool
proxyMode string
runProxy bool
proxyLogV int
proxyBindall bool
proxyMode string
conntrackMax int
conntrackTCPTimeoutEstablished int
}
// NewMinionServer creates the MinionServer struct with default values to be used by hyperkube
@ -139,11 +142,8 @@ func (ms *MinionServer) launchProxyServer() {
"--logtostderr=true",
"--resource-container=" + path.Join("/", ms.mesosCgroup, "kube-proxy"),
"--proxy-mode=" + ms.proxyMode,
// TODO(jdef) this is a temporary hack to fix failing smoke tests. a following PR
// will fix the smoke tests more properly and make these flags configurable
// at the framework level (as opposed to hardcoded here)
"--conntrack-max=0",
"--conntrack-tcp-timeout-established=0",
"--conntrack-max=" + strconv.Itoa(ms.conntrackMax),
"--conntrack-tcp-timeout-established=" + strconv.Itoa(ms.conntrackTCPTimeoutEstablished),
}
if ms.clientConfig.Host != "" {
@ -351,4 +351,6 @@ func (ms *MinionServer) AddMinionFlags(fs *pflag.FlagSet) {
fs.IntVar(&ms.proxyLogV, "proxy-logv", ms.proxyLogV, "Log verbosity of the child kube-proxy.")
fs.BoolVar(&ms.proxyBindall, "proxy-bindall", ms.proxyBindall, "When true will cause kube-proxy to bind to 0.0.0.0.")
fs.StringVar(&ms.proxyMode, "proxy-mode", ms.proxyMode, "Which proxy mode to use: 'userspace' (older) or 'iptables' (faster). If the iptables proxy is selected, regardless of how, but the system's kernel or iptables versions are insufficient, this always falls back to the userspace proxy.")
fs.IntVar(&ms.conntrackMax, "conntrack-max", ms.conntrackMax, "Maximum number of NAT connections to track on agent nodes (0 to leave as-is)")
fs.IntVar(&ms.conntrackTCPTimeoutEstablished, "conntrack-tcp-timeout-established", ms.conntrackTCPTimeoutEstablished, "Idle timeout for established TCP connections on agent nodes (0 to leave as-is)")
}

View File

@ -133,36 +133,38 @@ type SchedulerServer struct {
minionLogMaxBackups int
minionLogMaxAgeInDays int
mesosAuthProvider string
driverPort uint
hostnameOverride string
reconcileInterval int64
reconcileCooldown time.Duration
defaultContainerCPULimit mresource.CPUShares
defaultContainerMemLimit mresource.MegaBytes
schedulerConfigFileName string
graceful bool
frameworkName string
frameworkWebURI string
ha bool
advertisedAddress string
serviceAddress net.IP
haDomain string
kmPath string
clusterDNS net.IP
clusterDomain string
kubeletRootDirectory string
kubeletDockerEndpoint string
kubeletPodInfraContainerImage string
kubeletCadvisorPort uint
kubeletHostNetworkSources string
kubeletSyncFrequency time.Duration
kubeletNetworkPluginName string
staticPodsConfigPath string
dockerCfgPath string
containPodResources bool
nodeRelistPeriod time.Duration
sandboxOverlay string
mesosAuthProvider string
driverPort uint
hostnameOverride string
reconcileInterval int64
reconcileCooldown time.Duration
defaultContainerCPULimit mresource.CPUShares
defaultContainerMemLimit mresource.MegaBytes
schedulerConfigFileName string
graceful bool
frameworkName string
frameworkWebURI string
ha bool
advertisedAddress string
serviceAddress net.IP
haDomain string
kmPath string
clusterDNS net.IP
clusterDomain string
kubeletRootDirectory string
kubeletDockerEndpoint string
kubeletPodInfraContainerImage string
kubeletCadvisorPort uint
kubeletHostNetworkSources string
kubeletSyncFrequency time.Duration
kubeletNetworkPluginName string
staticPodsConfigPath string
dockerCfgPath string
containPodResources bool
nodeRelistPeriod time.Duration
sandboxOverlay string
conntrackMax int
conntrackTCPTimeoutEstablished int
executable string // path to the binary running this service
client *client.Client
@ -216,6 +218,12 @@ func NewSchedulerServer() *SchedulerServer {
kubeletEnableDebuggingHandlers: true,
containPodResources: true,
nodeRelistPeriod: defaultNodeRelistPeriod,
conntrackTCPTimeoutEstablished: 0, // non-zero values may require hand-tuning other sysctl's on the host; do so with caution
// non-zero values can trigger failures when updating /sys/module/nf_conntrack/parameters/hashsize
// if kube-proxy is running in a non-root netns (i.e. any netns other than init_net). setting this
// to a non-zero value will also impact connection tracking for the entire host on which kube-proxy
// is running. xref (k8s#19182)
conntrackMax: 0,
}
// cache this for later use. also useful in case the original binary gets deleted, e.g.
// during upgrades, development deployments, etc.
@ -294,6 +302,8 @@ func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) {
fs.DurationVar(&s.kubeletSyncFrequency, "kubelet-sync-frequency", s.kubeletSyncFrequency, "Max period between synchronizing running containers and config")
fs.StringVar(&s.kubeletNetworkPluginName, "kubelet-network-plugin", s.kubeletNetworkPluginName, "<Warning: Alpha feature> The name of the network plugin to be invoked for various events in kubelet/pod lifecycle")
fs.BoolVar(&s.kubeletEnableDebuggingHandlers, "kubelet-enable-debugging-handlers", s.kubeletEnableDebuggingHandlers, "Enables kubelet endpoints for log collection and local running of containers and commands")
fs.IntVar(&s.conntrackMax, "conntrack-max", s.conntrackMax, "Maximum number of NAT connections to track on agent nodes (0 to leave as-is)")
fs.IntVar(&s.conntrackTCPTimeoutEstablished, "conntrack-tcp-timeout-established", s.conntrackTCPTimeoutEstablished, "Idle timeout for established TCP connections on agent nodes (0 to leave as-is)")
//TODO(jdef) support this flag once we have a better handle on mesos-dns and k8s DNS integration
//fs.StringVar(&s.HADomain, "ha-domain", s.HADomain, "Domain of the HA scheduler service, only used in HA mode. If specified may be used to construct artifact download URIs.")
@ -413,6 +423,8 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--sync-frequency=%v", s.kubeletSyncFrequency))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--contain-pod-resources=%t", s.containPodResources))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--enable-debugging-handlers=%t", s.kubeletEnableDebuggingHandlers))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--conntrack-max=%d", s.conntrackMax))
ci.Arguments = append(ci.Arguments, fmt.Sprintf("--conntrack-tcp-timeout-established=%d", s.conntrackTCPTimeoutEstablished))
if s.authPath != "" {
//TODO(jdef) should probably support non-local files, e.g. hdfs:///some/config/file