From c4d6f75e35be7d84630df141e3efafab6f06ba69 Mon Sep 17 00:00:00 2001 From: "Dr. Stefan Schimanski" Date: Wed, 29 Jul 2015 18:00:26 +0200 Subject: [PATCH] Disable private mount ns for now in Mesos hyperkube minion server Until Docker learns parent mount namespace customization, the container will always have the root ns as a parent, not the one of the km minion. Hence, the kubelet (which lives in the km minion mount ns) will create mounts that cannot be seen by the Docker containers. This feature can be enabled again when Docker learns to explicitly set the parent mount ns, by analogy with the parent cgroup. --- contrib/mesos/pkg/minion/mountns_linux.go | 16 ++++++++++++++++ contrib/mesos/pkg/minion/server.go | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/contrib/mesos/pkg/minion/mountns_linux.go b/contrib/mesos/pkg/minion/mountns_linux.go index 17f09313113..cad4976d25f 100644 --- a/contrib/mesos/pkg/minion/mountns_linux.go +++ b/contrib/mesos/pkg/minion/mountns_linux.go @@ -22,7 +22,23 @@ import ( log "github.com/golang/glog" ) +// enterPrivateMountNamespace does just that: the current mount ns is unshared (isolated) +// and then made a slave to the root mount / of the parent mount ns (mount events from / +// or its children that happen in the parent NS propagate to us). +// +// this is not yet compatible with volume plugins as implemented by the kubelet, which +// depends on using host-volume args to 'docker run' to attach plugin volumes to containers +// at runtime. as such, docker needs to be able to see the volumes mounted by k8s plugins, +// which is impossible if k8s volume plugins are running in an isolated mount ns. +// +// an alternative approach would be to always run the kubelet in the host's mount-ns and +// rely upon mesos to forcibly umount bindings in the task sandbox before rmdir'ing it: +// https://issues.apache.org/jira/browse/MESOS-349. +// +// use at your own risk. 
func enterPrivateMountNamespace() { + log.Warningln("EXPERIMENTAL FEATURE: entering private mount ns") + // enter a new mount NS, useful for isolating changes to the mount table // that are made by the kubelet for storage volumes. err := syscall.Unshare(syscall.CLONE_NEWNS) diff --git a/contrib/mesos/pkg/minion/server.go b/contrib/mesos/pkg/minion/server.go index e93ba3113c5..5ace2a88746 100644 --- a/contrib/mesos/pkg/minion/server.go +++ b/contrib/mesos/pkg/minion/server.go @@ -62,7 +62,7 @@ type MinionServer struct { func NewMinionServer() *MinionServer { s := &MinionServer{ KubeletExecutorServer: exservice.NewKubeletExecutorServer(), - privateMountNS: true, + privateMountNS: false, // disabled until Docker supports customization of the parent mount namespace done: make(chan struct{}), exit: make(chan error), @@ -257,7 +257,7 @@ func (ms *MinionServer) AddExecutorFlags(fs *pflag.FlagSet) { func (ms *MinionServer) AddMinionFlags(fs *pflag.FlagSet) { // general minion flags - fs.BoolVar(&ms.privateMountNS, "private-mountns", ms.privateMountNS, "Enter a private mount NS before spawning procs (linux only).") + fs.BoolVar(&ms.privateMountNS, "private-mountns", ms.privateMountNS, "Enter a private mount NS before spawning procs (linux only). Experimental, not yet compatible with k8s volumes.") // log file flags fs.Var(resource.NewQuantityFlagValue(&ms.logMaxSize), "max-log-size", "Maximum log file size for the executor and proxy before rotation")