Merge pull request #52 from mesosphere/sttts-task-lost-during-kubelet-pod-launch

MESOS: scheduler: handle lost task status updates during kubelet pod launch
Dr. Stefan Schimanski 2015-12-02 13:00:21 +01:00
parent e28404b23b
commit 271eeb008b
3 changed files with 5 additions and 2 deletions


@@ -700,7 +700,7 @@ waitForRunningPod:
 	k.lock.Lock()
 	defer k.lock.Unlock()
 reportLost:
-	k.reportLostTask(driver, taskId, messages.LaunchTaskFailed)
+	k.reportLostTask(driver, taskId, messages.KubeletPodLaunchFailed)
 }
 
 func (k *Executor) __launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {

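The hunk above only swaps the message string that the executor attaches when it gives up on getting a pod running. As a rough illustration of what such a report amounts to (a sketch against the 2015-era mesos-go bindings; the reportPodLaunchLost and strptr helpers and the error handling are assumptions, not the repository's actual reportLostTask):

package sketch

import (
	bindings "github.com/mesos/mesos-go/executor"
	mesos "github.com/mesos/mesos-go/mesosproto"
)

// Mirrors messages.KubeletPodLaunchFailed added in this PR.
const kubeletPodLaunchFailed = "kubelet-pod-launch-failed"

func strptr(s string) *string { return &s }

// reportPodLaunchLost (hypothetical) sends a TASK_LOST update whose Message
// field carries the reason string, so the scheduler can tell "the kubelet never
// launched the pod" apart from other ways a task can be lost.
func reportPodLaunchLost(driver bindings.ExecutorDriver, taskId string) error {
	status := &mesos.TaskStatus{
		TaskId:  &mesos.TaskID{Value: strptr(taskId)},
		State:   mesos.TaskState_TASK_LOST.Enum(),
		Message: strptr(kubeletPodLaunchFailed),
	}
	// SendStatusUpdate hands the update to the slave for delivery to the master and scheduler.
	_, err := driver.SendStatusUpdate(status)
	return err
}

On the scheduler side (third hunk), that same string is what makes the lost task count as terminal rather than something to keep reconciling.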

@@ -25,6 +25,7 @@ const (
 	ExecutorUnregistered = "executor-unregistered"
 	ExecutorShutdown = "executor-shutdown"
 	LaunchTaskFailed = "launch-task-failed"
+	KubeletPodLaunchFailed = "kubelet-pod-launch-failed"
 	TaskKilled = "task-killed"
 	TaskLost = "task-lost"
 	UnmarshalTaskDataFailure = "unmarshal-task-data-failure"


@@ -482,13 +482,15 @@ func (k *framework) reconcileTerminalTask(driver bindings.SchedulerDriver, taskS
 	(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
 	(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED) ||
 	(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.ContainersDisappeared) ||
+	(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.KubeletPodLaunchFailed) ||
 	(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.TaskKilled && !task.Has(podtask.Deleted))) {
 	//--
 	// pod-task has metadata that refers to:
 	// (1) a task that Mesos no longer knows about, or else
 	// (2) a pod that the Kubelet will never report as "failed"
 	// (3) a pod that the kubeletExecutor reported as lost (likely due to docker daemon crash/restart)
-	// (4) a pod that the kubeletExecutor killed, but the scheduler didn't ask for that (maybe killed by the master)
+	// (4) a pod that the kubeletExecutor reported as lost because the kubelet didn't manage to launch it (in time)
+	// (5) a pod that the kubeletExecutor killed, but the scheduler didn't ask for that (maybe killed by the master)
 	// For now, destroy the pod and hope that there's a replication controller backing it up.
 	// TODO(jdef) for case #2 don't delete the pod, just update its status to Failed
 	pod := &task.Pod
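
Restated as a predicate, the clauses visible in this hunk read as below. This is a sketch only: the real check is inlined in reconcileTerminalTask and has further clauses above the ones shown here, the shouldDestroyPod name and the schedulerAskedForKill parameter (standing in for task.Has(podtask.Deleted)) are invented for illustration, and the containers-disappeared value is assumed rather than shown in this diff.

package sketch

import mesos "github.com/mesos/mesos-go/mesosproto"

// Message strings mirrored from the messages package; only the last two appear
// verbatim in this diff, containersDisappeared's value is assumed.
const (
	containersDisappeared  = "containers-disappeared"
	kubeletPodLaunchFailed = "kubelet-pod-launch-failed"
	taskKilled             = "task-killed"
)

// shouldDestroyPod reports whether a terminal status update means the pod-task
// refers to something that will never recover, so the scheduler deletes the pod
// and relies on a replication controller (if any) to recreate it.
func shouldDestroyPod(ts *mesos.TaskStatus, schedulerAskedForKill bool) bool {
	fromSlave := ts.GetSource() == mesos.TaskStatus_SOURCE_SLAVE
	fromExecutor := ts.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR
	switch {
	case fromSlave && ts.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED:
		return true // the executor is gone; Mesos no longer knows the task
	case fromSlave && ts.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED:
		return true // the executor never registered with the slave
	case fromExecutor && ts.GetMessage() == containersDisappeared:
		return true // containers vanished, e.g. docker daemon crash/restart
	case fromExecutor && ts.GetMessage() == kubeletPodLaunchFailed:
		return true // new in this PR: the kubelet never managed to launch the pod (in time)
	case fromExecutor && ts.GetMessage() == taskKilled && !schedulerAskedForKill:
		return true // killed, but not at the scheduler's request (e.g. by the master)
	default:
		return false
	}
}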