Mirror of https://github.com/k3s-io/kubernetes.git
Merge pull request #52 from mesosphere/sttts-task-lost-during-kubelet-pod-launch
MESOS: scheduler: handle lost task status updates during kubelet pod launch
commit 271eeb008b
parent e28404b23b
@@ -700,7 +700,7 @@ waitForRunningPod:
 	k.lock.Lock()
 	defer k.lock.Unlock()
 reportLost:
-	k.reportLostTask(driver, taskId, messages.LaunchTaskFailed)
+	k.reportLostTask(driver, taskId, messages.KubeletPodLaunchFailed)
 }
 
 func (k *Executor) __launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
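A minimal, self-contained sketch of what this hunk changes on the executor side, assuming (rather than quoting) the surrounding code: when the kubelet never brings the pod up, the executor gives the task up as lost and attaches a reason string to the status update; this PR switches that reason from the generic LaunchTaskFailed to the more specific KubeletPodLaunchFailed so the scheduler can recognize the case. The types and helper below are illustrative, not the repository's actual API.

package main

import "fmt"

// Message strings mirroring the constants touched by this PR.
const (
	LaunchTaskFailed       = "launch-task-failed"
	KubeletPodLaunchFailed = "kubelet-pod-launch-failed"
)

// lostUpdate stands in for a Mesos TASK_LOST status update; the real executor
// sends a TaskStatus protobuf through the executor driver.
type lostUpdate struct {
	TaskID  string
	Message string
}

// reportLostTask models the call changed above: mark the task lost and carry
// the reason string in the status-update message.
func reportLostTask(taskID, reason string) lostUpdate {
	return lostUpdate{TaskID: taskID, Message: reason}
}

func main() {
	// After this PR, a pod the kubelet failed to launch (in time) is reported
	// with the more specific reason instead of the generic launch failure.
	fmt.Printf("%+v\n", reportLostTask("task-123", KubeletPodLaunchFailed))
}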
@@ -25,6 +25,7 @@ const (
 	ExecutorUnregistered     = "executor-unregistered"
 	ExecutorShutdown         = "executor-shutdown"
 	LaunchTaskFailed         = "launch-task-failed"
+	KubeletPodLaunchFailed   = "kubelet-pod-launch-failed"
 	TaskKilled               = "task-killed"
 	TaskLost                 = "task-lost"
 	UnmarshalTaskDataFailure = "unmarshal-task-data-failure"
@@ -482,13 +482,15 @@ func (k *framework) reconcileTerminalTask(driver bindings.SchedulerDriver, taskS
 		(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_TERMINATED) ||
 		(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_SLAVE && taskStatus.GetReason() == mesos.TaskStatus_REASON_EXECUTOR_UNREGISTERED) ||
 		(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.ContainersDisappeared) ||
+		(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.KubeletPodLaunchFailed) ||
 		(taskStatus.GetSource() == mesos.TaskStatus_SOURCE_EXECUTOR && taskStatus.GetMessage() == messages.TaskKilled && !task.Has(podtask.Deleted))) {
 		//--
 		// pod-task has metadata that refers to:
 		// (1) a task that Mesos no longer knows about, or else
 		// (2) a pod that the Kubelet will never report as "failed"
 		// (3) a pod that the kubeletExecutor reported as lost (likely due to docker daemon crash/restart)
-		// (4) a pod that the kubeletExecutor killed, but the scheduler didn't ask for that (maybe killed by the master)
+		// (4) a pod that the kubeletExecutor reported as lost because the kubelet didn't manage to launch it (in time)
+		// (5) a pod that the kubeletExecutor killed, but the scheduler didn't ask for that (maybe killed by the master)
 		// For now, destroy the pod and hope that there's a replication controller backing it up.
 		// TODO(jdef) for case #2 don't delete the pod, just update it's status to Failed
 		pod := &task.Pod
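Reading the executor and scheduler sides together, here is a hedged sketch (assumed names, not the repository's code) of the decision reconcileTerminalTask makes after this PR: an executor-sourced terminal update whose message is kubelet-pod-launch-failed now counts as "the pod is gone", so the pod is deleted and, if a replication controller backs it, recreated elsewhere.

package main

import "fmt"

// statusSource mirrors the mesos.TaskStatus_SOURCE_* values compared above.
type statusSource int

const (
	sourceSlave statusSource = iota
	sourceExecutor
)

// Executor message strings from the messages package, as used in the diff.
const (
	ContainersDisappeared  = "containers-disappeared"
	KubeletPodLaunchFailed = "kubelet-pod-launch-failed"
	TaskKilled             = "task-killed"
)

// shouldDeletePod is an illustrative reduction of the executor-sourced half of
// the condition in reconcileTerminalTask: any of these signals means the task
// is terminally gone and the pod should be destroyed so a replication
// controller can replace it.
func shouldDeletePod(src statusSource, message string, podDeleted bool) bool {
	return (src == sourceExecutor && message == ContainersDisappeared) ||
		(src == sourceExecutor && message == KubeletPodLaunchFailed) || // case added by this PR
		(src == sourceExecutor && message == TaskKilled && !podDeleted)
}

func main() {
	fmt.Println(shouldDeletePod(sourceExecutor, KubeletPodLaunchFailed, false)) // true
}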