Merge pull request #12535 from mesosphere/persist-assigned-slave
Make slave assignment before binding persistent
commit 4163d1a5d2
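The change, in short: when a task is matched to a Mesos offer, the offer's hostname is recorded in task.Spec.AssignedSlave; before launch the scheduler writes that value into the pod's BindingHostKey annotation, and the new recoverAssignedSlave helper reads it back later. The assignment therefore survives a scheduler failover even though the pod's NodeName is only set once the executor performs the final binding. Below is a minimal, self-contained sketch of that flow; the pod and taskSpec types and the annotation key value are simplified stand-ins, not the real contrib/mesos API.

package main

import "fmt"

// bindingHostKey stands in for annotation.BindingHostKey (value assumed for illustration).
const bindingHostKey = "k8s.mesosphere.io/bindingHost"

// pod and taskSpec are simplified stand-ins for api.Pod and podtask.Spec.
type pod struct {
	Annotations map[string]string
	NodeName    string
}

type taskSpec struct {
	SlaveID       string
	AssignedSlave string
}

// persistAssignedSlave mirrors what prepareTaskForLaunch now does: make sure
// the annotation map exists and record the assigned slave on the pod.
func persistAssignedSlave(p *pod, s taskSpec) {
	if p.Annotations == nil {
		p.Annotations = make(map[string]string)
	}
	p.Annotations[bindingHostKey] = s.AssignedSlave
}

// recoverAssignedSlave mirrors the new helper in the scheduler plugin: an
// empty result means the pod was never assigned to a slave.
func recoverAssignedSlave(p *pod) string {
	return p.Annotations[bindingHostKey]
}

func main() {
	p := &pod{}
	persistAssignedSlave(p, taskSpec{SlaveID: "slave-id-0", AssignedSlave: "slave-host-1"})

	// NodeName is still empty because binding has not happened yet,
	// but the assignment is already recoverable from the annotation.
	fmt.Printf("assigned slave: %q, bound: %v\n", recoverAssignedSlave(p), p.NodeName != "")
}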
@@ -198,8 +198,12 @@ func (b *binder) prepareTaskForLaunch(ctx api.Context, machine string, task *pod
 	oemCt := pod.Spec.Containers
 	pod.Spec.Containers = append([]api.Container{}, oemCt...) // (shallow) clone before mod
 
-	annotateForExecutorOnSlave(&pod, machine)
+	if pod.Annotations == nil {
+		pod.Annotations = make(map[string]string)
+	}
+
 	task.SaveRecoveryInfo(pod.Annotations)
+	pod.Annotations[annotation.BindingHostKey] = task.Spec.AssignedSlave
 
 	for _, entry := range task.Spec.PortMap {
 		oemPorts := pod.Spec.Containers[entry.ContainerIdx].Ports
@@ -233,29 +237,13 @@ type kubeScheduler struct {
 	defaultContainerMemLimit mresource.MegaBytes
 }
 
-// annotatedForExecutor checks whether a pod is assigned to a Mesos slave, and
-// possibly already launched. It can, but doesn't have to be scheduled already
-// in the sense of kubernetes, i.e. the NodeName field might still be empty.
-func annotatedForExecutor(pod *api.Pod) bool {
-	_, ok := pod.ObjectMeta.Annotations[annotation.BindingHostKey]
-	return ok
-}
-
-// annotateForExecutorOnSlave sets the BindingHostKey annotation which
-// marks the pod to be processed by the scheduler and launched as a Mesos
-// task. The executor on the slave will to the final binding to finish the
-// scheduling in the kubernetes sense.
-func annotateForExecutorOnSlave(pod *api.Pod, slave string) {
-	if pod.Annotations == nil {
-		pod.Annotations = make(map[string]string)
-	} else {
-		oemAnn := pod.Annotations
-		pod.Annotations = make(map[string]string)
-		for k, v := range oemAnn {
-			pod.Annotations[k] = v
-		}
-	}
-	pod.Annotations[annotation.BindingHostKey] = slave
-}
+// recoverAssignedSlave recovers the assigned Mesos slave from a pod by searching
+// the BindingHostKey. For tasks in the registry of the scheduler, the same
+// value is stored in T.Spec.AssignedSlave. Before launching, the BindingHostKey
+// annotation is added and the executor will eventually persist that to the
+// apiserver on binding.
+func recoverAssignedSlave(pod *api.Pod) string {
+	return pod.Annotations[annotation.BindingHostKey]
+}
 
 // Schedule implements the Scheduler interface of Kubernetes.
@@ -462,7 +450,7 @@ func (q *queuer) Run(done <-chan struct{}) {
 			}
 
 			pod := p.(*Pod)
-			if annotatedForExecutor(pod.Pod) {
+			if recoverAssignedSlave(pod.Pod) != "" {
 				log.V(3).Infof("dequeuing pod for scheduling: %v", pod.Pod.Name)
 				q.dequeue(pod.GetUID())
 			} else {
@@ -511,7 +499,7 @@ func (q *queuer) yield() *api.Pod {
 			log.Warningf("yield unable to understand pod object %+v, will skip: %v", pod, err)
 		} else if !q.podUpdates.Poll(podName, queue.POP_EVENT) {
 			log.V(1).Infof("yield popped a transitioning pod, skipping: %+v", pod)
-		} else if annotatedForExecutor(pod) {
+		} else if recoverAssignedSlave(pod) != "" {
 			// should never happen if enqueuePods is filtering properly
 			log.Warningf("yield popped an already-scheduled pod, skipping: %+v", pod)
 		} else {
@@ -801,25 +789,27 @@ func (s *schedulingPlugin) scheduleOne() {
 // host="..." | host="..." ; perhaps no updates to process?
 //
 // TODO(jdef) this needs an integration test
-func (s *schedulingPlugin) reconcilePod(oldPod api.Pod) {
-	log.V(1).Infof("reconcile pod %v", oldPod.Name)
-	ctx := api.WithNamespace(api.NewDefaultContext(), oldPod.Namespace)
-	pod, err := s.client.Pods(api.NamespaceValue(ctx)).Get(oldPod.Name)
+func (s *schedulingPlugin) reconcileTask(t *podtask.T) {
+	log.V(1).Infof("reconcile pod %v, assigned to slave %q", t.Pod.Name, t.Spec.AssignedSlave)
+	ctx := api.WithNamespace(api.NewDefaultContext(), t.Pod.Namespace)
+	pod, err := s.client.Pods(api.NamespaceValue(ctx)).Get(t.Pod.Name)
 	if err != nil {
 		if errors.IsNotFound(err) {
 			// attempt to delete
-			if err = s.deleter.deleteOne(&Pod{Pod: &oldPod}); err != nil && err != noSuchPodErr && err != noSuchTaskErr {
-				log.Errorf("failed to delete pod: %v: %v", oldPod.Name, err)
+			if err = s.deleter.deleteOne(&Pod{Pod: &t.Pod}); err != nil && err != noSuchPodErr && err != noSuchTaskErr {
+				log.Errorf("failed to delete pod: %v: %v", t.Pod.Name, err)
 			}
 		} else {
 			//TODO(jdef) other errors should probably trigger a retry (w/ backoff).
 			//For now, drop the pod on the floor
-			log.Warning("aborting reconciliation for pod %v: %v", oldPod.Name, err)
+			log.Warning("aborting reconciliation for pod %v: %v", t.Pod.Name, err)
 		}
 		return
 	}
-	if oldPod.Spec.NodeName != pod.Spec.NodeName {
-		if annotatedForExecutor(pod) {
+
+	log.Infof("pod %v scheduled on %q according to apiserver", pod.Name, pod.Spec.NodeName)
+	if t.Spec.AssignedSlave != pod.Spec.NodeName {
+		if pod.Spec.NodeName == "" {
 			// pod is unscheduled.
 			// it's possible that we dropped the pod in the scheduler error handler
 			// because of task misalignment with the pod (task.Has(podtask.Launched) == true)
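The reconciliation above now compares the slave recorded on the task (t.Spec.AssignedSlave) with the NodeName the apiserver reports for the pod, instead of comparing a stale pod copy with the live one. A hedged sketch of just that comparison; the helper name and the returned strings are illustrative, and only the conditions visible in this hunk are represented:

package main

import "fmt"

// reconcileCheck is an illustrative helper (not part of the plugin): it shows
// the comparison reconcileTask makes between the slave recorded on the task
// and the NodeName the apiserver currently reports for the pod.
func reconcileCheck(assignedSlave, apiserverNodeName string) string {
	switch {
	case assignedSlave == apiserverNodeName:
		return "task and apiserver agree"
	case apiserverNodeName == "":
		// corresponds to the "pod is unscheduled" branch in the hunk above
		return "pod is unscheduled according to the apiserver"
	default:
		// handling of this case lies beyond the lines shown in the hunk
		return "pod bound to a different host than the task recorded"
	}
}

func main() {
	fmt.Println(reconcileCheck("slave-host-1", ""))             // unscheduled
	fmt.Println(reconcileCheck("slave-host-1", "slave-host-1")) // in sync
}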
@@ -199,7 +199,7 @@ func NewTestPod() (*api.Pod, int) {
 		TypeMeta: api.TypeMeta{APIVersion: testapi.Version()},
 		ObjectMeta: api.ObjectMeta{
 			Name:      name,
-			Namespace: "default",
+			Namespace: api.NamespaceDefault,
 			SelfLink:  fmt.Sprintf("http://1.2.3.4/api/v1beta1/pods/%s", name),
 		},
 		Spec: api.PodSpec{
@@ -418,7 +418,7 @@ func TestPlugin_LifeCycle(t *testing.T) {
 	c.Recorder = eventObserver
 
 	// create plugin
-	p := NewPlugin(c)
+	p := NewPlugin(c).(*schedulingPlugin)
 	assert.NotNil(p)
 
 	// run plugin
@@ -514,11 +514,8 @@ func TestPlugin_LifeCycle(t *testing.T) {
 		t.Fatalf("timed out waiting for launchTasks call")
 	}
 
-	// define generic pod startup
-	startPodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
-		// notify watchers of new pod
-		podListWatch.Add(pod, true)
-
+	// Launch a pod and wait until the scheduler driver is called
+	schedulePodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
 		// wait for failedScheduling event because there is no offer
 		assert.EventWithReason(eventObserver, "failedScheduling", "failedScheduling event not received")
 
@@ -531,8 +528,6 @@ func TestPlugin_LifeCycle(t *testing.T) {
 		// wait for driver.launchTasks call
 		select {
 		case launchedTask := <-launchedTasks:
-			testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING))
-			testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING))
 			for _, offer := range offers {
 				if offer.Id.GetValue() == launchedTask.offerId.GetValue() {
 					return pod, &launchedTask, offer
@@ -540,12 +535,30 @@ func TestPlugin_LifeCycle(t *testing.T) {
 			}
 			t.Fatalf("unknown offer used to start a pod")
 			return nil, nil, nil
 
 		case <-time.After(5 * time.Second):
 			t.Fatal("timed out waiting for launchTasks")
 			return nil, nil, nil
 		}
 	}
+	// Launch a pod and wait until the scheduler driver is called
+	launchPodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
+		podListWatch.Add(pod, true)
+		return schedulePodWithOffers(pod, offers)
+	}
+
+	// Launch a pod, wait until the scheduler driver is called and report back that it is running
+	startPodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
+		// notify about pod, offer resources and wait for scheduling
+		pod, launchedTask, offer := launchPodWithOffers(pod, offers)
+		if pod != nil {
+			// report back status
+			testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING))
+			testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING))
+			return pod, launchedTask, offer
+		}
+
+		return nil, nil, nil
+	}
 
 	startTestPod := func() (*api.Pod, *LaunchedTask, *mesos.Offer) {
 		pod, i := NewTestPod()
@@ -610,31 +623,42 @@ func TestPlugin_LifeCycle(t *testing.T) {
 		// wait until pod is looked up at the apiserver
 		assertext.EventuallyTrue(t, time.Second, func() bool {
 			return testApiServer.Stats(pod.Name) == beforePodLookups+1
-		}, "expect that reconcilePod will access apiserver for pod %v", pod.Name)
+		}, "expect that reconcileTask will access apiserver for pod %v", pod.Name)
+	}
+
+	launchTestPod := func() (*api.Pod, *LaunchedTask, *mesos.Offer) {
+		pod, i := NewTestPod()
+		offers := []*mesos.Offer{NewTestOffer(fmt.Sprintf("offer%d", i))}
+		return launchPodWithOffers(pod, offers)
 	}
 
 	// 1. with pod deleted from the apiserver
-	pod, launchedTask, _ = startTestPod()
+	// expected: pod is removed from internal task registry
+	pod, launchedTask, _ = launchTestPod()
 	podListWatch.Delete(pod, false) // not notifying the watchers
 	failPodFromExecutor(launchedTask.taskInfo)
 
+	podKey, _ := podtask.MakePodKey(api.NewDefaultContext(), pod.Name)
+	assertext.EventuallyTrue(t, time.Second, func() bool {
+		t, _ := p.api.tasks().ForPod(podKey)
+		return t == nil
+	})
+
 	// 2. with pod still on the apiserver, not bound
-	pod, launchedTask, _ = startTestPod()
+	// expected: pod is rescheduled
+	pod, launchedTask, _ = launchTestPod()
 	failPodFromExecutor(launchedTask.taskInfo)
 
-	// 3. with pod still on the apiserver, bound i.e. host!=""
-	pod, launchedTask, usedOffer = startTestPod()
-	pod.Annotations = map[string]string{
-		meta.BindingHostKey: *usedOffer.Hostname,
-	}
-	podListWatch.Modify(pod, false) // not notifying the watchers
-	failPodFromExecutor(launchedTask.taskInfo)
+	retryOffers := []*mesos.Offer{NewTestOffer("retry-offer")}
+	schedulePodWithOffers(pod, retryOffers)
+	// 3. with pod still on the apiserver, bound, notified via ListWatch
+	// expected: nothing, pod updates not supported, compare ReconcileTask function
 
-	// 4. with pod still on the apiserver, bound i.e. host!="", notified via ListWatch
 	pod, launchedTask, usedOffer = startTestPod()
 	pod.Annotations = map[string]string{
 		meta.BindingHostKey: *usedOffer.Hostname,
 	}
+	pod.Spec.NodeName = *usedOffer.Hostname
 	podListWatch.Modify(pod, true) // notifying the watchers
 	time.Sleep(time.Second / 2)
 	failPodFromExecutor(launchedTask.taskInfo)
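The test refactoring above splits the old startPodWithOffers helper into three layers: schedulePodWithOffers waits for the failedScheduling event, offers resources, and waits for the driver's launchTasks call; launchPodWithOffers additionally announces the pod via the list watch first; startPodWithOffers additionally reports TASK_STAGING and TASK_RUNNING back. A toy sketch of that layering; the string-based signatures are stand-ins for the real (*api.Pod, []*mesos.Offer) helpers, not the test code itself:

package main

import "fmt"

// schedulePodWithOffers: offer resources and wait for driver.launchTasks.
func schedulePodWithOffers(pod string) string {
	fmt.Println("scheduled:", pod)
	return pod
}

// launchPodWithOffers: announce the pod via the list watch, then schedule it.
func launchPodWithOffers(pod string) string {
	fmt.Println("podListWatch.Add:", pod)
	return schedulePodWithOffers(pod)
}

// startPodWithOffers: launch the pod, then report TASK_STAGING/TASK_RUNNING.
func startPodWithOffers(pod string) string {
	p := launchPodWithOffers(pod)
	fmt.Println("status updates sent for:", p)
	return p
}

func main() {
	startPodWithOffers("pod-0")
}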
@@ -71,12 +71,13 @@ type T struct {
 }
 
 type Spec struct {
 	SlaveID string
+	AssignedSlave string
 	CPU mresource.CPUShares
 	Memory mresource.MegaBytes
 	PortMap []HostPortMapping
 	Ports []uint64
 	Data []byte
 }
 
 // mostly-clone this pod task. the clone will actually share the some fields:
@@ -161,9 +162,10 @@ func (t *T) FillFromDetails(details *mesos.Offer) error {
 	log.V(3).Infof("Recording offer(s) %s/%s against pod %v: cpu: %.2f, mem: %.2f MB", details.Id, t.Pod.Namespace, t.Pod.Name, cpu, mem)
 
 	t.Spec = Spec{
 		SlaveID: details.GetSlaveId().GetValue(),
+		AssignedSlave: details.GetHostname(),
 		CPU: cpu,
 		Memory: mem,
 	}
 
 	// fill in port mapping
@@ -346,8 +348,7 @@ func RecoverFrom(pod api.Pod) (*T, bool, error) {
 		bindTime: now,
 	}
 	var (
 		offerId string
-		hostname string
 	)
 	for _, k := range []string{
 		annotation.BindingHostKey,
@@ -362,7 +363,7 @@ func RecoverFrom(pod api.Pod) (*T, bool, error) {
 		}
 		switch k {
 		case annotation.BindingHostKey:
-			hostname = v
+			t.Spec.AssignedSlave = v
 		case annotation.SlaveIdKey:
 			t.Spec.SlaveID = v
 		case annotation.OfferIdKey:
@@ -375,7 +376,7 @@ func RecoverFrom(pod api.Pod) (*T, bool, error) {
 			t.executor = &mesos.ExecutorInfo{ExecutorId: mutil.NewExecutorID(v)}
 		}
 	}
-	t.Offer = offers.Expired(offerId, hostname, 0)
+	t.Offer = offers.Expired(offerId, t.Spec.AssignedSlave, 0)
 	t.Flags[Launched] = struct{}{}
 	t.Flags[Bound] = struct{}{}
 	return t, true, nil
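On failover, RecoverFrom rebuilds a task from the annotations saved on the pod before launch; with this change the binding host goes straight into t.Spec.AssignedSlave (the local hostname variable is gone), and the expired offer is registered under that value. A compact sketch of that annotation round trip; the keys and the spec type below are stand-ins, not the real podtask package:

package main

import "fmt"

// Stand-ins for annotation.BindingHostKey / annotation.SlaveIdKey.
const (
	bindingHostKey = "binding-host"
	slaveIdKey     = "slave-id"
)

// spec is a simplified stand-in for podtask.Spec.
type spec struct {
	SlaveID       string
	AssignedSlave string
}

// recoverSpec mimics the switch in RecoverFrom: it walks the recovery
// annotations written before launch and fills the task spec from them.
func recoverSpec(annotations map[string]string) spec {
	var s spec
	for k, v := range annotations {
		switch k {
		case bindingHostKey:
			s.AssignedSlave = v
		case slaveIdKey:
			s.SlaveID = v
		}
	}
	return s
}

func main() {
	ann := map[string]string{bindingHostKey: "slave-host-1", slaveIdKey: "slave-id-0"}
	s := recoverSpec(ann)
	// The recovered AssignedSlave is what the expired offer gets keyed by.
	fmt.Printf("%+v\n", s)
}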
@@ -102,7 +102,7 @@ func (self *slaveStorage) getSlave(slaveId string) (*Slave, bool) {
 type PluginInterface interface {
 	// the apiserver may have a different state for the pod than we do
 	// so reconcile our records, but only for this one pod
-	reconcilePod(api.Pod)
+	reconcileTask(*podtask.T)
 
 	// execute the Scheduling plugin, should start a go routine and return immediately
 	Run(<-chan struct{})
@@ -432,7 +432,7 @@ func (k *KubernetesScheduler) StatusUpdate(driver bindings.SchedulerDriver, task
 	case mesos.TaskState_TASK_FAILED:
 		if task, _ := k.taskRegistry.UpdateStatus(taskStatus); task != nil {
 			if task.Has(podtask.Launched) && !task.Has(podtask.Bound) {
-				go k.plugin.reconcilePod(task.Pod)
+				go k.plugin.reconcileTask(task)
 				return
 			}
 		} else {