Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-07-27 21:47:07 +00:00)

Merge pull request #13421 from mesosphere/optout_pod_resource_constraints

MESOS: Add --contain-pod-resources flag to determine how k8s-mesos will contain pod resources

This commit is contained in: commit 47af1a0c9b
@@ -57,11 +57,14 @@ type MinionServer struct {
 
     pathOverride string // the PATH environment for the sub-processes
     cgroupPrefix string // e.g. mesos
-    cgroupRoot string // e.g. /mesos/{container-id}, determined at runtime
+    cgroupRoot string // the cgroupRoot that we pass to the kubelet-executor, depends on containPodResources
+    mesosCgroup string // discovered mesos cgroup root, e.g. /mesos/{container-id}
+    containPodResources bool
 
     logMaxSize resource.Quantity
     logMaxBackups int
     logMaxAgeInDays int
+    logVerbosity int32 // see glog.Level
 
     runProxy bool
     proxyLogV int
@@ -74,6 +77,7 @@ func NewMinionServer() *MinionServer {
     KubeletExecutorServer: exservice.NewKubeletExecutorServer(),
     privateMountNS: false, // disabled until Docker supports customization of the parent mount namespace
     cgroupPrefix: config.DefaultCgroupPrefix,
+    containPodResources: true,
     logMaxSize: config.DefaultLogMaxSize(),
     logMaxBackups: config.DefaultLogMaxBackups,
     logMaxAgeInDays: config.DefaultLogMaxAgeInDays,
@@ -131,7 +135,7 @@ func (ms *MinionServer) launchProxyServer() {
     fmt.Sprintf("--bind-address=%s", bindAddress),
     fmt.Sprintf("--v=%d", ms.proxyLogV),
     "--logtostderr=true",
-    "--resource-container=" + path.Join("/", ms.cgroupRoot, "kube-proxy"),
+    "--resource-container=" + path.Join("/", ms.mesosCgroup, "kube-proxy"),
 }
 
 if ms.clientConfig.Host != "" {
@@ -156,7 +160,7 @@ func (ms *MinionServer) launchExecutorServer() <-chan struct{} {
     ms.AddExecutorFlags(executorFlags)
     executorArgs, _ := filterArgsByFlagSet(allArgs, executorFlags)
 
-    executorArgs = append(executorArgs, "--resource-container="+path.Join("/", ms.cgroupRoot, "kubelet"))
+    executorArgs = append(executorArgs, "--resource-container="+path.Join("/", ms.mesosCgroup, "kubelet"))
     if ms.cgroupRoot != "" {
         executorArgs = append(executorArgs, "--cgroup-root="+ms.cgroupRoot)
     }
@@ -241,14 +245,23 @@ func (ms *MinionServer) Run(hks hyperkube.Interface, _ []string) error {
     ms.clientConfig = clientConfig
 
     // derive the executor cgroup and use it as:
-    // - pod container cgroup root (e.g. docker cgroup-parent)
+    // - pod container cgroup root (e.g. docker cgroup-parent, optionally; see comments below)
     // - parent of kubelet container
     // - parent of kube-proxy container
-    ms.cgroupRoot = findMesosCgroup(ms.cgroupPrefix)
+    ms.mesosCgroup = findMesosCgroup(ms.cgroupPrefix)
+    log.Infof("discovered mesos cgroup at %q", ms.mesosCgroup)
+
+    // hack alert, this helps to work around systemd+docker+mesos integration problems
+    // when docker's cgroup-parent flag is used (!containPodResources = don't use the docker flag)
+    if ms.containPodResources {
+        ms.cgroupRoot = ms.mesosCgroup
+    }
+
     cgroupLogger := log.Infof
     if ms.cgroupRoot == "" {
         cgroupLogger = log.Warningf
     }
+
     cgroupLogger("using cgroup-root %q", ms.cgroupRoot)
 
     // run subprocesses until ms.done is closed on return of this function
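A minimal illustrative sketch (not part of this commit; the helper name and example cgroup path are hypothetical) of how the executor's cgroup-related arguments differ between the two modes, following launchExecutorServer and Run above: the kubelet-executor is always parented under the discovered mesos cgroup, while pod containers are only reparented (via --cgroup-root) when contain-pod-resources is enabled.

    package main

    import (
        "fmt"
        "path"
    )

    // executorCgroupArgs is a hypothetical helper that condenses the argument
    // construction from launchExecutorServer/Run for illustration.
    func executorCgroupArgs(mesosCgroup string, containPodResources bool) []string {
        // the kubelet-executor itself always lives under the discovered mesos cgroup
        args := []string{"--resource-container=" + path.Join("/", mesosCgroup, "kubelet")}
        if containPodResources {
            // only then do pod containers get the mesos cgroup as their cgroup root
            // (skipping this avoids the systemd+docker+mesos issue noted above)
            args = append(args, "--cgroup-root="+mesosCgroup)
        }
        return args
    }

    func main() {
        fmt.Println(executorCgroupArgs("/mesos/abc-123", true))
        fmt.Println(executorCgroupArgs("/mesos/abc-123", false))
    }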
@@ -302,6 +315,9 @@ func termSignalListener(abort <-chan struct{}) <-chan struct{} {
 
 func (ms *MinionServer) AddExecutorFlags(fs *pflag.FlagSet) {
     ms.KubeletExecutorServer.AddFlags(fs)
+
+    // hack to forward log verbosity flag to the executor
+    fs.Int32Var(&ms.logVerbosity, "v", ms.logVerbosity, "log level for V logs")
 }
 
 func (ms *MinionServer) AddMinionFlags(fs *pflag.FlagSet) {
@@ -309,6 +325,7 @@ func (ms *MinionServer) AddMinionFlags(fs *pflag.FlagSet) {
     fs.StringVar(&ms.cgroupPrefix, "mesos-cgroup-prefix", ms.cgroupPrefix, "The cgroup prefix concatenated with MESOS_DIRECTORY must give the executor cgroup set by Mesos")
     fs.BoolVar(&ms.privateMountNS, "private-mountns", ms.privateMountNS, "Enter a private mount NS before spawning procs (linux only). Experimental, not yet compatible with k8s volumes.")
     fs.StringVar(&ms.pathOverride, "path-override", ms.pathOverride, "Override the PATH in the environment of the sub-processes.")
+    fs.BoolVar(&ms.containPodResources, "contain-pod-resources", ms.containPodResources, "Allocate pod CPU and memory resources from offers and reparent pod containers into mesos cgroups; disable if you're having strange mesos/docker/systemd interactions.")
 
     // log file flags
     fs.Var(resource.NewQuantityFlagValue(&ms.logMaxSize), "max-log-size", "Maximum log file size for the executor and proxy before rotation")
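A small, self-contained sketch of the new flag's behaviour, assuming nothing beyond the spf13/pflag package already used above: the flag defaults to true (as set in NewMinionServer) and an operator can switch it off on the command line.

    package main

    import (
        "fmt"

        "github.com/spf13/pflag"
    )

    func main() {
        containPodResources := true // mirrors the default set in NewMinionServer
        fs := pflag.NewFlagSet("minion", pflag.ContinueOnError)
        fs.BoolVar(&containPodResources, "contain-pod-resources", containPodResources,
            "Allocate pod CPU and memory resources from offers and reparent pod containers into mesos cgroups")
        _ = fs.Parse([]string{"--contain-pod-resources=false"})
        fmt.Println(containPodResources) // false
    }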
@@ -24,8 +24,42 @@ import (
     "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
 )
 
+type allocationStrategy struct {
+    fitPredicate podtask.FitPredicate
+    procurement podtask.Procurement
+}
+
+func (a *allocationStrategy) FitPredicate() podtask.FitPredicate {
+    return a.fitPredicate
+}
+
+func (a *allocationStrategy) Procurement() podtask.Procurement {
+    return a.procurement
+}
+
+func NewAllocationStrategy(fitPredicate podtask.FitPredicate, procurement podtask.Procurement) AllocationStrategy {
+    if fitPredicate == nil {
+        panic("fitPredicate is required")
+    }
+    if procurement == nil {
+        panic("procurement is required")
+    }
+    return &allocationStrategy{
+        fitPredicate: fitPredicate,
+        procurement: procurement,
+    }
+}
+
+type fcfsPodScheduler struct {
+    AllocationStrategy
+}
+
+func NewFCFSPodScheduler(as AllocationStrategy) PodScheduler {
+    return &fcfsPodScheduler{as}
+}
+
 // A first-come-first-serve scheduler: acquires the first offer that can support the task
-func FCFSScheduleFunc(r offers.Registry, unused SlaveIndex, task *podtask.T) (offers.Perishable, error) {
+func (fps *fcfsPodScheduler) SchedulePod(r offers.Registry, unused SlaveIndex, task *podtask.T) (offers.Perishable, error) {
     podName := fmt.Sprintf("%s/%s", task.Pod.Namespace, task.Pod.Name)
     var acceptedOffer offers.Perishable
     err := r.Walk(func(p offers.Perishable) (bool, error) {
@@ -33,7 +67,7 @@ func FCFSScheduleFunc(r offers.Registry, unused SlaveIndex, task *podtask.T) (offers.Perishable, error) {
         if offer == nil {
             return false, fmt.Errorf("nil offer while scheduling task %v", task.ID)
         }
-        if task.AcceptOffer(offer) {
+        if fps.FitPredicate()(task, offer) {
             if p.Acquire() {
                 acceptedOffer = p
                 log.V(3).Infof("Pod %s accepted offer %v", podName, offer.Id.GetValue())
@@ -42,11 +42,11 @@ func (m *MockScheduler) slaveFor(id string) (slave *Slave, ok bool) {
     ok = args.Bool(1)
     return
 }
-func (m *MockScheduler) algorithm() (f PodScheduleFunc) {
+func (m *MockScheduler) algorithm() (f PodScheduler) {
     args := m.Called()
     x := args.Get(0)
     if x != nil {
-        f = x.(PodScheduleFunc)
+        f = x.(PodScheduler)
     }
     return
 }
@@ -33,7 +33,6 @@ import (
     "k8s.io/kubernetes/contrib/mesos/pkg/runtime"
     annotation "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
     "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
-    mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
     "k8s.io/kubernetes/pkg/api"
     "k8s.io/kubernetes/pkg/api/errors"
     client "k8s.io/kubernetes/pkg/client/unversioned"
@@ -56,8 +55,9 @@ const (
 // scheduler abstraction to allow for easier unit testing
 type schedulerInterface interface {
     sync.Locker // synchronize scheduler plugin operations
+
     SlaveIndex
-    algorithm() PodScheduleFunc // see types.go
+    algorithm() PodScheduler
     offers() offers.Registry
     tasks() podtask.Registry
 
@@ -76,8 +76,8 @@ type k8smScheduler struct {
     internal *KubernetesScheduler
 }
 
-func (k *k8smScheduler) algorithm() PodScheduleFunc {
-    return k.internal.scheduleFunc
+func (k *k8smScheduler) algorithm() PodScheduler {
+    return k.internal
 }
 
 func (k *k8smScheduler) offers() offers.Registry {
@@ -233,8 +233,6 @@ func (b *binder) prepareTaskForLaunch(ctx api.Context, machine string, task *pod
 type kubeScheduler struct {
     api schedulerInterface
     podUpdates queue.FIFO
-    defaultContainerCPULimit mresource.CPUShares
-    defaultContainerMemLimit mresource.MegaBytes
 }
 
 // recoverAssignedSlave recovers the assigned Mesos slave from a pod by searching
@@ -318,7 +316,7 @@ func (k *kubeScheduler) doSchedule(task *podtask.T, err error) (string, error) {
         }
     }
     if err == nil && offer == nil {
-        offer, err = k.api.algorithm()(k.api.offers(), k.api, task)
+        offer, err = k.api.algorithm().SchedulePod(k.api.offers(), k.api, task)
     }
     if err != nil {
         return "", err
@@ -338,18 +336,8 @@ func (k *kubeScheduler) doSchedule(task *podtask.T, err error) (string, error) {
         return "", fmt.Errorf("task.offer assignment must be idempotent, task %+v: offer %+v", task, offer)
     }
 
-    // write resource limits into the pod spec which is transferred to the executor. From here
-    // on we can expect that the pod spec of a task has proper limits for CPU and memory.
-    // TODO(sttts): For a later separation of the kubelet and the executor also patch the pod on the apiserver
-    if unlimitedCPU := mresource.LimitPodCPU(&task.Pod, k.defaultContainerCPULimit); unlimitedCPU {
-        log.Warningf("Pod %s/%s without cpu limits is admitted %.2f cpu shares", task.Pod.Namespace, task.Pod.Name, mresource.PodCPULimit(&task.Pod))
-    }
-    if unlimitedMem := mresource.LimitPodMem(&task.Pod, k.defaultContainerMemLimit); unlimitedMem {
-        log.Warningf("Pod %s/%s without memory limits is admitted %.2f MB", task.Pod.Namespace, task.Pod.Name, mresource.PodMemLimit(&task.Pod))
-    }
-
     task.Offer = offer
-    task.FillFromDetails(details)
+    k.api.algorithm().Procurement()(task, details) // TODO(jdef) why is nothing checking the error returned here?
 
     if err := k.api.tasks().Update(task); err != nil {
         offer.Release()
@@ -556,7 +544,7 @@ func (k *errorHandler) handleSchedulingError(pod *api.Pod, schedulingErr error) {
             defer k.api.Unlock()
             switch task, state := k.api.tasks().Get(task.ID); state {
             case podtask.StatePending:
-                return !task.Has(podtask.Launched) && task.AcceptOffer(offer)
+                return !task.Has(podtask.Launched) && k.api.algorithm().FitPredicate()(task, offer)
             default:
                 // no point in continuing to check for matching offers
                 return true
@@ -700,8 +688,6 @@ func (k *KubernetesScheduler) NewPluginConfig(terminate <-chan struct{}, mux *ht
     Algorithm: &kubeScheduler{
         api: kapi,
         podUpdates: podUpdates,
-        defaultContainerCPULimit: k.defaultContainerCPULimit,
-        defaultContainerMemLimit: k.defaultContainerMemLimit,
     },
     Binder: &binder{api: kapi},
     NextPod: q.yield,
@@ -393,13 +393,14 @@ func TestPlugin_LifeCycle(t *testing.T) {
     executor.Data = []byte{0, 1, 2}
 
     // create scheduler
+    as := NewAllocationStrategy(
+        podtask.DefaultPredicate,
+        podtask.NewDefaultProcurement(mresource.DefaultDefaultContainerCPULimit, mresource.DefaultDefaultContainerMemLimit))
     testScheduler := New(Config{
         Executor: executor,
         Client: client.NewOrDie(&client.Config{Host: testApiServer.server.URL, Version: testapi.Version()}),
-        ScheduleFunc: FCFSScheduleFunc,
+        Scheduler: NewFCFSPodScheduler(as),
         Schedcfg: *schedcfg.CreateDefaultConfig(),
-        DefaultContainerCPULimit: mresource.DefaultDefaultContainerCPULimit,
-        DefaultContainerMemLimit: mresource.DefaultDefaultContainerMemLimit,
     })
 
     assert.NotNil(testScheduler.client, "client is nil")
contrib/mesos/pkg/scheduler/podtask/minimal.go (new file, 73 lines)
@@ -0,0 +1,73 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package podtask
+
+import (
+    log "github.com/golang/glog"
+    mesos "github.com/mesos/mesos-go/mesosproto"
+)
+
+// bogus numbers that we use to make sure that there's some set of minimal offered resources on the slave
+const (
+    minimalCpus = 0.01
+    minimalMem = 0.25
+)
+
+var (
+    DefaultMinimalPredicate = RequireAllPredicate([]FitPredicate{
+        ValidationPredicate,
+        NodeSelectorPredicate,
+        MinimalPodResourcesPredicate,
+        PortsPredicate,
+    }).Fit
+
+    DefaultMinimalProcurement = AllOrNothingProcurement([]Procurement{
+        ValidateProcurement,
+        NodeProcurement,
+        MinimalPodResourcesProcurement,
+        PortsProcurement,
+    }).Procure
+)
+
+func MinimalPodResourcesPredicate(t *T, offer *mesos.Offer) bool {
+    var (
+        offeredCpus float64
+        offeredMem float64
+    )
+    for _, resource := range offer.Resources {
+        if resource.GetName() == "cpus" {
+            offeredCpus = resource.GetScalar().GetValue()
+        }
+
+        if resource.GetName() == "mem" {
+            offeredMem = resource.GetScalar().GetValue()
+        }
+    }
+    log.V(4).Infof("trying to match offer with pod %v/%v: cpus: %.2f mem: %.2f MB", t.Pod.Namespace, t.Pod.Name, minimalCpus, minimalMem)
+    if (minimalCpus > offeredCpus) || (minimalMem > offeredMem) {
+        log.V(3).Infof("not enough resources for pod %v/%v: cpus: %.2f mem: %.2f MB", t.Pod.Namespace, t.Pod.Name, minimalCpus, minimalMem)
+        return false
+    }
+    return true
+}
+
+func MinimalPodResourcesProcurement(t *T, details *mesos.Offer) error {
+    log.V(3).Infof("Recording offer(s) %s/%s against pod %v: cpu: %.2f, mem: %.2f MB", details.Id, t.Pod.Namespace, t.Pod.Name, minimalCpus, minimalMem)
+    t.Spec.CPU = minimalCpus
+    t.Spec.Memory = minimalMem
+    return nil
+}
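An illustrative sketch (not part of this commit) of how MinimalPodResourcesPredicate evaluates an offer. The offer values are made up, the mesosutil import path is assumed from the mesos-go layout of that era, and a zero-value task is assumed to be sufficient because the predicate only reads the offer's cpus/mem scalars (plus the pod name for logging).

    package main

    import (
        "fmt"

        mesos "github.com/mesos/mesos-go/mesosproto"
        mutil "github.com/mesos/mesos-go/mesosutil"

        "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
    )

    func main() {
        offer := &mesos.Offer{
            Resources: []*mesos.Resource{
                mutil.NewScalarResource("cpus", 0.005), // below the minimalCpus floor of 0.01
                mutil.NewScalarResource("mem", 64),
            },
        }
        // rejected: the offer does not clear even the token cpu floor
        fmt.Println(podtask.MinimalPodResourcesPredicate(&podtask.T{}, offer)) // false
    }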
@@ -18,7 +18,6 @@ package podtask
 
 import (
     "fmt"
-    "strings"
     "time"
 
     "github.com/gogo/protobuf/proto"
@@ -28,7 +27,6 @@ import (
     "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
     mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
     "k8s.io/kubernetes/pkg/api"
-    "k8s.io/kubernetes/pkg/labels"
 
     log "github.com/golang/glog"
     mesos "github.com/mesos/mesos-go/mesosproto"
@@ -150,59 +148,6 @@ func (t *T) BuildTaskInfo() *mesos.TaskInfo {
     return info
 }
 
-// Fill the Spec in the T, should be called during k8s scheduling, before binding.
-func (t *T) FillFromDetails(details *mesos.Offer) error {
-    if details == nil {
-        //programming error
-        panic("offer details are nil")
-    }
-
-    // compute used resources
-    cpu := mresource.PodCPULimit(&t.Pod)
-    mem := mresource.PodMemLimit(&t.Pod)
-    log.V(3).Infof("Recording offer(s) %s/%s against pod %v: cpu: %.2f, mem: %.2f MB", details.Id, t.Pod.Namespace, t.Pod.Name, cpu, mem)
-
-    t.Spec = Spec{
-        SlaveID: details.GetSlaveId().GetValue(),
-        AssignedSlave: details.GetHostname(),
-        CPU: cpu,
-        Memory: mem,
-    }
-
-    // fill in port mapping
-    if mapping, err := t.mapper.Generate(t, details); err != nil {
-        t.Reset()
-        return err
-    } else {
-        ports := []uint64{}
-        for _, entry := range mapping {
-            ports = append(ports, entry.OfferPort)
-        }
-        t.Spec.PortMap = mapping
-        t.Spec.Ports = ports
-    }
-
-    // hostname needs of the executor needs to match that of the offer, otherwise
-    // the kubelet node status checker/updater is very unhappy
-    const HOSTNAME_OVERRIDE_FLAG = "--hostname-override="
-    hostname := details.GetHostname() // required field, non-empty
-    hostnameOverride := HOSTNAME_OVERRIDE_FLAG + hostname
-
-    argv := t.executor.Command.Arguments
-    overwrite := false
-    for i, arg := range argv {
-        if strings.HasPrefix(arg, HOSTNAME_OVERRIDE_FLAG) {
-            overwrite = true
-            argv[i] = hostnameOverride
-            break
-        }
-    }
-    if !overwrite {
-        t.executor.Command.Arguments = append(argv, hostnameOverride)
-    }
-    return nil
-}
-
 // Clear offer-related details from the task, should be called if/when an offer
 // has already been assigned to a task but for some reason is no longer valid.
 func (t *T) Reset() {
@@ -211,65 +156,6 @@ func (t *T) Reset() {
     t.Spec = Spec{}
 }
 
-func (t *T) AcceptOffer(offer *mesos.Offer) bool {
-    if offer == nil {
-        return false
-    }
-
-    // if the user has specified a target host, make sure this offer is for that host
-    if t.Pod.Spec.NodeName != "" && offer.GetHostname() != t.Pod.Spec.NodeName {
-        return false
-    }
-
-    // check the NodeSelector
-    if len(t.Pod.Spec.NodeSelector) > 0 {
-        slaveLabels := map[string]string{}
-        for _, a := range offer.Attributes {
-            if a.GetType() == mesos.Value_TEXT {
-                slaveLabels[a.GetName()] = a.GetText().GetValue()
-            }
-        }
-        selector := labels.SelectorFromSet(t.Pod.Spec.NodeSelector)
-        if !selector.Matches(labels.Set(slaveLabels)) {
-            return false
-        }
-    }
-
-    // check ports
-    if _, err := t.mapper.Generate(t, offer); err != nil {
-        log.V(3).Info(err)
-        return false
-    }
-
-    // find offered cpu and mem
-    var (
-        offeredCpus mresource.CPUShares
-        offeredMem mresource.MegaBytes
-    )
-    for _, resource := range offer.Resources {
-        if resource.GetName() == "cpus" {
-            offeredCpus = mresource.CPUShares(*resource.GetScalar().Value)
-        }
-
-        if resource.GetName() == "mem" {
-            offeredMem = mresource.MegaBytes(*resource.GetScalar().Value)
-        }
-    }
-
-    // calculate cpu and mem sum over all containers of the pod
-    // TODO (@sttts): also support pod.spec.resources.limit.request
-    // TODO (@sttts): take into account the executor resources
-    cpu := mresource.PodCPULimit(&t.Pod)
-    mem := mresource.PodMemLimit(&t.Pod)
-    log.V(4).Infof("trying to match offer with pod %v/%v: cpus: %.2f mem: %.2f MB", t.Pod.Namespace, t.Pod.Name, cpu, mem)
-    if (cpu > offeredCpus) || (mem > offeredMem) {
-        log.V(3).Infof("not enough resources for pod %v/%v: cpus: %.2f mem: %.2f MB", t.Pod.Namespace, t.Pod.Name, cpu, mem)
-        return false
-    }
-
-    return true
-}
-
 func (t *T) Set(f FlagType) {
     t.Flags[f] = struct{}{}
     if Launched == f {
@@ -146,10 +146,10 @@ func TestEmptyOffer(t *testing.T) {
     mresource.LimitPodCPU(&task.Pod, mresource.DefaultDefaultContainerCPULimit)
     mresource.LimitPodMem(&task.Pod, mresource.DefaultDefaultContainerMemLimit)
 
-    if ok := task.AcceptOffer(nil); ok {
+    if ok := DefaultPredicate(task, nil); ok {
         t.Fatalf("accepted nil offer")
     }
-    if ok := task.AcceptOffer(&mesos.Offer{}); ok {
+    if ok := DefaultPredicate(task, &mesos.Offer{}); ok {
         t.Fatalf("accepted empty offer")
     }
 }
@@ -176,7 +176,7 @@ func TestNoPortsInPodOrOffer(t *testing.T) {
         mutil.NewScalarResource("mem", 0.001),
         },
     }
-    if ok := task.AcceptOffer(offer); ok {
+    if ok := DefaultPredicate(task, offer); ok {
         t.Fatalf("accepted offer %v:", offer)
     }
 
@@ -186,7 +186,7 @@ func TestNoPortsInPodOrOffer(t *testing.T) {
         mutil.NewScalarResource("mem", t_min_mem),
         },
     }
-    if ok := task.AcceptOffer(offer); !ok {
+    if ok := DefaultPredicate(task, offer); !ok {
         t.Fatalf("did not accepted offer %v:", offer)
     }
 }
@@ -203,7 +203,7 @@ func TestAcceptOfferPorts(t *testing.T) {
         rangeResource("ports", []uint64{1, 1}),
         },
     }
-    if ok := task.AcceptOffer(offer); !ok {
+    if ok := DefaultPredicate(task, offer); !ok {
         t.Fatalf("did not accepted offer %v:", offer)
     }
 
@@ -218,17 +218,17 @@ func TestAcceptOfferPorts(t *testing.T) {
     mresource.LimitPodCPU(&task.Pod, mresource.DefaultDefaultContainerCPULimit)
     mresource.LimitPodMem(&task.Pod, mresource.DefaultDefaultContainerMemLimit)
 
-    if ok := task.AcceptOffer(offer); ok {
+    if ok := DefaultPredicate(task, offer); ok {
         t.Fatalf("accepted offer %v:", offer)
     }
 
     pod.Spec.Containers[0].Ports[0].HostPort = 1
-    if ok := task.AcceptOffer(offer); !ok {
+    if ok := DefaultPredicate(task, offer); !ok {
         t.Fatalf("did not accepted offer %v:", offer)
     }
 
     pod.Spec.Containers[0].Ports[0].HostPort = 0
-    if ok := task.AcceptOffer(offer); !ok {
+    if ok := DefaultPredicate(task, offer); !ok {
         t.Fatalf("did not accepted offer %v:", offer)
     }
 
@@ -236,12 +236,12 @@ func TestAcceptOfferPorts(t *testing.T) {
         mutil.NewScalarResource("cpus", t_min_cpu),
         mutil.NewScalarResource("mem", t_min_mem),
     }
-    if ok := task.AcceptOffer(offer); ok {
+    if ok := DefaultPredicate(task, offer); ok {
         t.Fatalf("accepted offer %v:", offer)
     }
 
     pod.Spec.Containers[0].Ports[0].HostPort = 1
-    if ok := task.AcceptOffer(offer); ok {
+    if ok := DefaultPredicate(task, offer); ok {
         t.Fatalf("accepted offer %v:", offer)
     }
 }
@@ -297,7 +297,7 @@ func TestNodeSelector(t *testing.T) {
         },
         Attributes: ts.attrs,
     }
-    if got, want := task.AcceptOffer(offer), ts.ok; got != want {
+    if got, want := DefaultPredicate(task, offer), ts.ok; got != want {
         t.Fatalf("expected acceptance of offer %v for selector %v to be %v, got %v:", want, got, ts.attrs, ts.selector)
     }
 }
contrib/mesos/pkg/scheduler/podtask/predicate.go (new file, 110 lines)
@@ -0,0 +1,110 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package podtask
+
+import (
+    log "github.com/golang/glog"
+    mesos "github.com/mesos/mesos-go/mesosproto"
+    mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
+    "k8s.io/kubernetes/pkg/labels"
+)
+
+var DefaultPredicate = RequireAllPredicate([]FitPredicate{
+    ValidationPredicate,
+    NodeSelectorPredicate,
+    PodFitsResourcesPredicate,
+    PortsPredicate,
+}).Fit
+
+// FitPredicate implementations determine if the given task "fits" into offered Mesos resources.
+// Neither the task or offer should be modified.
+type FitPredicate func(*T, *mesos.Offer) bool
+
+type RequireAllPredicate []FitPredicate
+
+func (f RequireAllPredicate) Fit(t *T, offer *mesos.Offer) bool {
+    for _, p := range f {
+        if !p(t, offer) {
+            return false
+        }
+    }
+    return true
+}
+
+func ValidationPredicate(t *T, offer *mesos.Offer) bool {
+    return t != nil && offer != nil
+}
+
+func NodeSelectorPredicate(t *T, offer *mesos.Offer) bool {
+    // if the user has specified a target host, make sure this offer is for that host
+    if t.Pod.Spec.NodeName != "" && offer.GetHostname() != t.Pod.Spec.NodeName {
+        return false
+    }
+
+    // check the NodeSelector
+    if len(t.Pod.Spec.NodeSelector) > 0 {
+        slaveLabels := map[string]string{}
+        for _, a := range offer.Attributes {
+            if a.GetType() == mesos.Value_TEXT {
+                slaveLabels[a.GetName()] = a.GetText().GetValue()
+            }
+        }
+        selector := labels.SelectorFromSet(t.Pod.Spec.NodeSelector)
+        if !selector.Matches(labels.Set(slaveLabels)) {
+            return false
+        }
+    }
+    return true
+}
+
+func PortsPredicate(t *T, offer *mesos.Offer) bool {
+    // check ports
+    if _, err := t.mapper.Generate(t, offer); err != nil {
+        log.V(3).Info(err)
+        return false
+    }
+    return true
+}
+
+func PodFitsResourcesPredicate(t *T, offer *mesos.Offer) bool {
+    // find offered cpu and mem
+    var (
+        offeredCpus mresource.CPUShares
+        offeredMem mresource.MegaBytes
+    )
+    for _, resource := range offer.Resources {
+        if resource.GetName() == "cpus" {
+            offeredCpus = mresource.CPUShares(*resource.GetScalar().Value)
+        }
+
+        if resource.GetName() == "mem" {
+            offeredMem = mresource.MegaBytes(*resource.GetScalar().Value)
+        }
+    }
+
+    // calculate cpu and mem sum over all containers of the pod
+    // TODO (@sttts): also support pod.spec.resources.limit.request
+    // TODO (@sttts): take into account the executor resources
+    cpu := mresource.PodCPULimit(&t.Pod)
+    mem := mresource.PodMemLimit(&t.Pod)
+    log.V(4).Infof("trying to match offer with pod %v/%v: cpus: %.2f mem: %.2f MB", t.Pod.Namespace, t.Pod.Name, cpu, mem)
+    if (cpu > offeredCpus) || (mem > offeredMem) {
+        log.V(3).Infof("not enough resources for pod %v/%v: cpus: %.2f mem: %.2f MB", t.Pod.Namespace, t.Pod.Name, cpu, mem)
+        return false
+    }
+    return true
+}
contrib/mesos/pkg/scheduler/podtask/procurement.go (new file, 152 lines)
@@ -0,0 +1,152 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package podtask
+
+import (
+    "strings"
+
+    log "github.com/golang/glog"
+    mesos "github.com/mesos/mesos-go/mesosproto"
+    mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
+)
+
+// NewDefaultProcurement returns the default procurement strategy that combines validation
+// and responsible Mesos resource procurement. c and m are resource quantities written into
+// k8s api.Pod.Spec's that don't declare resources (all containers in k8s-mesos require cpu
+// and memory limits).
+func NewDefaultProcurement(c mresource.CPUShares, m mresource.MegaBytes) Procurement {
+    requireSome := &RequireSomePodResources{
+        defaultContainerCPULimit: c,
+        defaultContainerMemLimit: m,
+    }
+    return AllOrNothingProcurement([]Procurement{
+        ValidateProcurement,
+        NodeProcurement,
+        requireSome.Procure,
+        PodResourcesProcurement,
+        PortsProcurement,
+    }).Procure
+}
+
+// Procurement funcs allocate resources for a task from an offer.
+// Both the task and/or offer may be modified.
+type Procurement func(*T, *mesos.Offer) error
+
+// AllOrNothingProcurement provides a convenient wrapper around multiple Procurement
+// objectives: the failure of any Procurement in the set results in Procure failing.
+// see AllOrNothingProcurement.Procure
+type AllOrNothingProcurement []Procurement
+
+// Procure runs each Procurement in the receiver list. The first Procurement func that
+// fails triggers T.Reset() and the error is returned, otherwise returns nil.
+func (a AllOrNothingProcurement) Procure(t *T, offer *mesos.Offer) error {
+    for _, p := range a {
+        if err := p(t, offer); err != nil {
+            t.Reset()
+            return err
+        }
+    }
+    return nil
+}
+
+// ValidateProcurement checks that the offered resources are kosher, and if not panics.
+// If things check out ok, t.Spec is cleared and nil is returned.
+func ValidateProcurement(t *T, offer *mesos.Offer) error {
+    if offer == nil {
+        //programming error
+        panic("offer details are nil")
+    }
+    t.Spec = Spec{}
+    return nil
+}
+
+// NodeProcurement updates t.Spec in preparation for the task to be launched on the
+// slave associated with the offer.
+func NodeProcurement(t *T, offer *mesos.Offer) error {
+    t.Spec.SlaveID = offer.GetSlaveId().GetValue()
+    t.Spec.AssignedSlave = offer.GetHostname()
+
+    // hostname needs of the executor needs to match that of the offer, otherwise
+    // the kubelet node status checker/updater is very unhappy
+    const HOSTNAME_OVERRIDE_FLAG = "--hostname-override="
+    hostname := offer.GetHostname() // required field, non-empty
+    hostnameOverride := HOSTNAME_OVERRIDE_FLAG + hostname
+
+    argv := t.executor.Command.Arguments
+    overwrite := false
+    for i, arg := range argv {
+        if strings.HasPrefix(arg, HOSTNAME_OVERRIDE_FLAG) {
+            overwrite = true
+            argv[i] = hostnameOverride
+            break
+        }
+    }
+    if !overwrite {
+        t.executor.Command.Arguments = append(argv, hostnameOverride)
+    }
+    return nil
+}
+
+type RequireSomePodResources struct {
+    defaultContainerCPULimit mresource.CPUShares
+    defaultContainerMemLimit mresource.MegaBytes
+}
+
+func (r *RequireSomePodResources) Procure(t *T, offer *mesos.Offer) error {
+    // write resource limits into the pod spec which is transferred to the executor. From here
+    // on we can expect that the pod spec of a task has proper limits for CPU and memory.
+    // TODO(sttts): For a later separation of the kubelet and the executor also patch the pod on the apiserver
+    // TODO(jdef): changing the state of t.Pod here feels dirty, especially since we don't use a kosher
+    // method to clone the api.Pod state in T.Clone(). This needs some love.
+    if unlimitedCPU := mresource.LimitPodCPU(&t.Pod, r.defaultContainerCPULimit); unlimitedCPU {
+        log.Warningf("Pod %s/%s without cpu limits is admitted %.2f cpu shares", t.Pod.Namespace, t.Pod.Name, mresource.PodCPULimit(&t.Pod))
+    }
+    if unlimitedMem := mresource.LimitPodMem(&t.Pod, r.defaultContainerMemLimit); unlimitedMem {
+        log.Warningf("Pod %s/%s without memory limits is admitted %.2f MB", t.Pod.Namespace, t.Pod.Name, mresource.PodMemLimit(&t.Pod))
+    }
+    return nil
+}
+
+// PodResourcesProcurement converts k8s pod cpu and memory resource requirements into
+// mesos resource allocations.
+func PodResourcesProcurement(t *T, offer *mesos.Offer) error {
+    // compute used resources
+    cpu := mresource.PodCPULimit(&t.Pod)
+    mem := mresource.PodMemLimit(&t.Pod)
+
+    log.V(3).Infof("Recording offer(s) %s/%s against pod %v: cpu: %.2f, mem: %.2f MB", offer.Id, t.Pod.Namespace, t.Pod.Name, cpu, mem)
+
+    t.Spec.CPU = cpu
+    t.Spec.Memory = mem
+    return nil
+}
+
+// PortsProcurement convert host port mappings into mesos port resource allocations.
+func PortsProcurement(t *T, offer *mesos.Offer) error {
+    // fill in port mapping
+    if mapping, err := t.mapper.Generate(t, offer); err != nil {
+        return err
+    } else {
+        ports := []uint64{}
+        for _, entry := range mapping {
+            ports = append(ports, entry.OfferPort)
+        }
+        t.Spec.PortMap = mapping
+        t.Spec.Ports = ports
+    }
+    return nil
+}
@@ -39,7 +39,6 @@ import (
     "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
     "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
     "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
-    mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
     "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/uid"
     "k8s.io/kubernetes/pkg/api"
     "k8s.io/kubernetes/pkg/api/errors"
@@ -118,19 +117,17 @@ type KubernetesScheduler struct {
     // and the invoking the pod registry interfaces.
     // In particular, changes to podtask.T objects are currently guarded by this lock.
     *sync.RWMutex
+    PodScheduler
 
     // Config related, write-once
 
     schedcfg *schedcfg.Config
     executor *mesos.ExecutorInfo
     executorGroup uint64
-    scheduleFunc PodScheduleFunc
     client *client.Client
     etcdClient tools.EtcdClient
     failoverTimeout float64 // in seconds
     reconcileInterval int64
-    defaultContainerCPULimit mresource.CPUShares
-    defaultContainerMemLimit mresource.MegaBytes
 
     // Mesos context.
 
@@ -159,14 +156,12 @@ type KubernetesScheduler struct {
 type Config struct {
     Schedcfg schedcfg.Config
     Executor *mesos.ExecutorInfo
-    ScheduleFunc PodScheduleFunc
+    Scheduler PodScheduler
     Client *client.Client
     EtcdClient tools.EtcdClient
     FailoverTimeout float64
     ReconcileInterval int64
     ReconcileCooldown time.Duration
-    DefaultContainerCPULimit mresource.CPUShares
-    DefaultContainerMemLimit mresource.MegaBytes
 }
 
 // New creates a new KubernetesScheduler
@@ -177,13 +172,11 @@ func New(config Config) *KubernetesScheduler {
     RWMutex: new(sync.RWMutex),
     executor: config.Executor,
     executorGroup: uid.Parse(config.Executor.ExecutorId.GetValue()).Group(),
-    scheduleFunc: config.ScheduleFunc,
+    PodScheduler: config.Scheduler,
     client: config.Client,
     etcdClient: config.EtcdClient,
     failoverTimeout: config.FailoverTimeout,
     reconcileInterval: config.ReconcileInterval,
-    defaultContainerCPULimit: config.DefaultContainerCPULimit,
-    defaultContainerMemLimit: config.DefaultContainerMemLimit,
     offers: offers.CreateRegistry(offers.RegistryConfig{
         Compat: func(o *mesos.Offer) bool {
             // filter the offers: the executor IDs must not identify a kubelet-
@@ -57,6 +57,7 @@ import (
     "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/ha"
     "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
     "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/metrics"
+    "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
     mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
     "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/uid"
     "k8s.io/kubernetes/pkg/api"
@@ -139,6 +140,8 @@ type SchedulerServer struct {
     KubeletNetworkPluginName string
     StaticPodsConfigPath string
     DockerCfgPath string
+    ContainPodResources bool
+    AccountForPodResources bool
 
     executable string // path to the binary running this service
     client *client.Client
@@ -182,6 +185,8 @@ func NewSchedulerServer() *SchedulerServer {
     mux: http.NewServeMux(),
     KubeletCadvisorPort: 4194, // copied from github.com/GoogleCloudPlatform/kubernetes/blob/release-0.14/cmd/kubelet/app/server.go
     KubeletSyncFrequency: 10 * time.Second,
+    ContainPodResources: true,
+    AccountForPodResources: true,
 }
 // cache this for later use. also useful in case the original binary gets deleted, e.g.
 // during upgrades, development deployments, etc.
@@ -231,6 +236,8 @@ func (s *SchedulerServer) addCoreFlags(fs *pflag.FlagSet) {
     fs.IPVar(&s.ServiceAddress, "service-address", s.ServiceAddress, "The service portal IP address that the scheduler should register with (if unset, chooses randomly)")
     fs.Var(&s.DefaultContainerCPULimit, "default-container-cpu-limit", "Containers without a CPU resource limit are admitted this much CPU shares")
     fs.Var(&s.DefaultContainerMemLimit, "default-container-mem-limit", "Containers without a memory resource limit are admitted this much amount of memory in MB")
+    fs.BoolVar(&s.ContainPodResources, "contain-pod-resources", s.ContainPodResources, "Reparent pod containers into mesos cgroups; disable if you're having strange mesos/docker/systemd interactions.")
+    fs.BoolVar(&s.AccountForPodResources, "account-for-pod-resources", s.AccountForPodResources, "Allocate pod CPU and memory resources from offers (Default: true)")
 
     fs.IntVar(&s.ExecutorLogV, "executor-logv", s.ExecutorLogV, "Logging verbosity of spawned minion and executor processes.")
     fs.BoolVar(&s.ExecutorBindall, "executor-bindall", s.ExecutorBindall, "When true will set -address of the executor to 0.0.0.0.")
@@ -367,6 +374,7 @@ func (s *SchedulerServer) prepareExecutorInfo(hks hyperkube.Interface) (*mesos.E
     ci.Arguments = append(ci.Arguments, fmt.Sprintf("--mesos-cgroup-prefix=%v", s.MesosCgroupPrefix))
     ci.Arguments = append(ci.Arguments, fmt.Sprintf("--cadvisor-port=%v", s.KubeletCadvisorPort))
     ci.Arguments = append(ci.Arguments, fmt.Sprintf("--sync-frequency=%v", s.KubeletSyncFrequency))
+    ci.Arguments = append(ci.Arguments, fmt.Sprintf("--contain-pod-resources=%t", s.ContainPodResources))
 
     if s.AuthPath != "" {
         //TODO(jdef) should probably support non-local files, e.g. hdfs:///some/config/file
@@ -651,17 +659,27 @@ func (s *SchedulerServer) bootstrap(hks hyperkube.Interface, sc *schedcfg.Config
     log.Fatalf("misconfigured etcd: %v", err)
 }
 
+as := scheduler.NewAllocationStrategy(
+    podtask.DefaultPredicate,
+    podtask.NewDefaultProcurement(s.DefaultContainerCPULimit, s.DefaultContainerMemLimit))
+
+// downgrade allocation strategy if user disables "account-for-pod-resources"
+if !s.AccountForPodResources {
+    as = scheduler.NewAllocationStrategy(
+        podtask.DefaultMinimalPredicate,
+        podtask.DefaultMinimalProcurement)
+}
+
+fcfs := scheduler.NewFCFSPodScheduler(as)
 mesosPodScheduler := scheduler.New(scheduler.Config{
     Schedcfg: *sc,
     Executor: executor,
-    ScheduleFunc: scheduler.FCFSScheduleFunc,
+    Scheduler: fcfs,
     Client: client,
     EtcdClient: etcdClient,
     FailoverTimeout: s.FailoverTimeout,
     ReconcileInterval: s.ReconcileInterval,
     ReconcileCooldown: s.ReconcileCooldown,
-    DefaultContainerCPULimit: s.DefaultContainerCPULimit,
-    DefaultContainerMemLimit: s.DefaultContainerMemLimit,
 })
 
 masterUri := s.MesosMaster
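A condensed, hedged restatement of the strategy selection performed in bootstrap() above; the helper and its name are hypothetical, and the scheduler import path is assumed to follow the contrib/mesos package layout used elsewhere in this diff.

    package main

    import (
        "k8s.io/kubernetes/contrib/mesos/pkg/scheduler"
        "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
        mresource "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/resource"
    )

    // chooseStrategy mirrors bootstrap(): full accounting by default, token-only
    // ("minimal") fit and procurement when --account-for-pod-resources=false.
    func chooseStrategy(accountForPodResources bool, cpu mresource.CPUShares, mem mresource.MegaBytes) scheduler.AllocationStrategy {
        if !accountForPodResources {
            return scheduler.NewAllocationStrategy(
                podtask.DefaultMinimalPredicate,
                podtask.DefaultMinimalProcurement)
        }
        return scheduler.NewAllocationStrategy(
            podtask.DefaultPredicate,
            podtask.NewDefaultProcurement(cpu, mem))
    }

    func main() {
        _ = chooseStrategy(false, mresource.DefaultDefaultContainerCPULimit, mresource.DefaultDefaultContainerMemLimit)
    }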
@@ -23,17 +23,29 @@ import (
     "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
 )
 
-// PodScheduleFunc implements how to schedule pods among slaves.
+type AllocationStrategy interface {
+    // FitPredicate returns the selector used to determine pod fitness w/ respect to a given offer
+    FitPredicate() podtask.FitPredicate
+
+    // Procurement returns a func that obtains resources for a task from resource offer
+    Procurement() podtask.Procurement
+}
+
+type PodScheduler interface {
+    AllocationStrategy
+
+    // SchedulePod implements how to schedule pods among slaves.
 // We can have different implementation for different scheduling policy.
 //
-// The Schedule function accepts a group of slaves (each contains offers from
+// The function accepts a group of slaves (each contains offers from
 // that slave) and a single pod, which aligns well with the k8s scheduling
 // algorithm. It returns an offerId that is acceptable for the pod, otherwise
 // nil. The caller is responsible for filling in task state w/ relevant offer
 // details.
 //
-// See the FCFSScheduleFunc for example.
-type PodScheduleFunc func(r offers.Registry, slaves SlaveIndex, task *podtask.T) (offers.Perishable, error)
+// See the FCFSPodScheduler for example.
+    SchedulePod(r offers.Registry, slaves SlaveIndex, task *podtask.T) (offers.Perishable, error)
+}
 
 // A minimal placeholder
 type empty struct{}
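For illustration, a hypothetical decorator that satisfies the new PodScheduler interface by embedding another PodScheduler and forwarding SchedulePod; the offers import path is an assumption, while every other identifier appears in the interfaces and implementations above.

    package scheduler

    import (
        log "github.com/golang/glog"

        "k8s.io/kubernetes/contrib/mesos/pkg/offers"
        "k8s.io/kubernetes/contrib/mesos/pkg/scheduler/podtask"
    )

    // loggingPodScheduler is a hypothetical wrapper: FitPredicate/Procurement come
    // from the embedded PodScheduler, so only SchedulePod needs to be implemented.
    type loggingPodScheduler struct {
        PodScheduler
    }

    func (s *loggingPodScheduler) SchedulePod(r offers.Registry, slaves SlaveIndex, task *podtask.T) (offers.Perishable, error) {
        log.V(2).Infof("scheduling pod %s/%s", task.Pod.Namespace, task.Pod.Name)
        return s.PodScheduler.SchedulePod(r, slaves, task)
    }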
@@ -1,5 +1,6 @@
 accept-hosts
 accept-paths
+account-for-pod-resources
 admission-control
 admission-control-config-file
 advertise-address
@@ -43,6 +44,7 @@ cluster-name
 cluster-tag
 concurrent-endpoint-syncs
 configure-cbr0
+contain-pod-resources
 container-port
 container-runtime
 cors-allowed-origins