Merge pull request #52 from mesosphere/sttts-task-lost-during-kubelet-pod-launch

MESOS: scheduler: handle lost task status updates during kubelet pod launch
Dr. Stefan Schimanski 2015-12-02 13:00:21 +01:00
parent e28404b23b
commit 271eeb008b
3 changed files with 5 additions and 2 deletions


@@ -700,7 +700,7 @@ waitForRunningPod:
 	k.lock.Lock()
 	defer k.lock.Unlock()
 reportLost:
-	k.reportLostTask(driver, taskId, messages.LaunchTaskFailed)
+	k.reportLostTask(driver, taskId, messages.KubeletPodLaunchFailed)
 }
 
 func (k *Executor) __launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {

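The hunk above only swaps the message string that the executor attaches when it gives up on getting a pod running. As a rough illustration of what such a report amounts to (a sketch against the 2015-era mesos-go bindings; the reportPodLaunchLost and strptr helpers and the error handling are assumptions, not the repository's actual reportLostTask):

package sketch

import (
	bindings "github.com/mesos/mesos-go/executor"
	mesos "github.com/mesos/mesos-go/mesosproto"
)

// Mirrors messages.KubeletPodLaunchFailed added in this PR.
const kubeletPodLaunchFailed = "kubelet-pod-launch-failed"

func strptr(s string) *string { return &s }

// reportPodLaunchLost (hypothetical) sends a TASK_LOST update whose Message
// field carries the reason string, so the scheduler can tell "the kubelet never
// launched the pod" apart from other ways a task can be lost.
func reportPodLaunchLost(driver bindings.ExecutorDriver, taskId string) error {
	status := &mesos.TaskStatus{
		TaskId:  &mesos.TaskID{Value: strptr(taskId)},
		State:   mesos.TaskState_TASK_LOST.Enum(),
		Message: strptr(kubeletPodLaunchFailed),
	}
	// SendStatusUpdate hands the update to the slave for delivery to the master and scheduler.
	_, err := driver.SendStatusUpdate(status)
	return err
}

On the scheduler side (third hunk), that same string is what makes the lost task count as terminal rather than something to keep reconciling.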

@@ -25,6 +25,7 @@ const (
 	ExecutorUnregistered = "executor-unregistered"
 	ExecutorShutdown = "executor-shutdown"
 	LaunchTaskFailed = "launch-task-failed"
+	KubeletPodLaunchFailed = "kubelet-pod-launch-failed"
 	TaskKilled = "task-killed"
 	TaskLost = "task-lost"
 	UnmarshalTaskDataFailure = "unmarshal-task-data-failure"


@@ -482,13 +482,15 @@ func (k *framework) reconcileTerminalTask(driver bindings.SchedulerDriver, taskS
 	(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
 	(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED) ||
 	(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.ContainersDisappeared) ||
+	(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.KubeletPodLaunchFailed) ||
 	(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.TaskKilled && !task.Has(podtask.Deleted))) {
 	//--
 	// pod-task has metadata that refers to:
 	// (1) a task that Mesos no longer knows about, or else
 	// (2) a pod that the Kubelet will never report as "failed"
 	// (3) a pod that the kubeletExecutor reported as lost (likely due to docker daemon crash/restart)
-	// (4) a pod that the kubeletExecutor killed, but the scheduler didn't ask for that (maybe killed by the master)
+	// (4) a pod that the kubeletExecutor reported as lost because the kubelet didn't manage to launch it (in time)
+	// (5) a pod that the kubeletExecutor killed, but the scheduler didn't ask for that (maybe killed by the master)
 	// For now, destroy the pod and hope that there's a replication controller backing it up.
 	// TODO(jdef) for case #2 don't delete the pod, just update its status to Failed
 	pod := &task.Pod
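
Restated as a predicate, the clauses visible in this hunk read as below. This is a sketch only: the real check is inlined in reconcileTerminalTask and has further clauses above the ones shown here, the shouldDestroyPod name and the schedulerAskedForKill parameter (standing in for task.Has(podtask.Deleted)) are invented for illustration, and the containers-disappeared value is assumed rather than shown in this diff.

package sketch

import mesos "github.com/mesos/mesos-go/mesosproto"

// Message strings mirrored from the messages package; only the last two appear
// verbatim in this diff, containersDisappeared's value is assumed.
const (
	containersDisappeared  = "containers-disappeared"
	kubeletPodLaunchFailed = "kubelet-pod-launch-failed"
	taskKilled             = "task-killed"
)

// shouldDestroyPod reports whether a terminal status update means the pod-task
// refers to something that will never recover, so the scheduler deletes the pod
// and relies on a replication controller (if any) to recreate it.
func shouldDestroyPod(ts *mesos.TaskStatus, schedulerAskedForKill bool) bool {
	fromSlave := ts.GetSource() == mesos.TaskStatus_SOURCE_SLAVE
	fromExecutor := ts.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR
	switch {
	case fromSlave && ts.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED:
		return true // the executor is gone; Mesos no longer knows the task
	case fromSlave && ts.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED:
		return true // the executor never registered with the slave
	case fromExecutor && ts.GetMessage() == containersDisappeared:
		return true // containers vanished, e.g. docker daemon crash/restart
	case fromExecutor && ts.GetMessage() == kubeletPodLaunchFailed:
		return true // new in this PR: the kubelet never managed to launch the pod (in time)
	case fromExecutor && ts.GetMessage() == taskKilled && !schedulerAskedForKill:
		return true // killed, but not at the scheduler's request (e.g. by the master)
	default:
		return false
	}
}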