diff --git a/pkg/kubelet/container/runtime.go b/pkg/kubelet/container/runtime.go index cb8af85f3e3..2f53e4faeb0 100644 --- a/pkg/kubelet/container/runtime.go +++ b/pkg/kubelet/container/runtime.go @@ -79,7 +79,10 @@ type Runtime interface { SyncPod(pod *api.Pod, apiPodStatus api.PodStatus, podStatus *PodStatus, pullSecrets []api.Secret, backOff *flowcontrol.Backoff) PodSyncResult // KillPod kills all the containers of a pod. Pod may be nil, running pod must not be. // TODO(random-liu): Return PodSyncResult in KillPod. - KillPod(pod *api.Pod, runningPod Pod) error + // gracePeriodOverride if specified allows the caller to override the pod default grace period. + // only hard kill paths are allowed to specify a gracePeriodOverride in the kubelet in order to not corrupt user data. + // it is useful when doing SIGKILL for hard eviction scenarios, or max grace period during soft eviction scenarios. + KillPod(pod *api.Pod, runningPod Pod, gracePeriodOverride *int64) error // GetPodStatus retrieves the status of the pod, including the // information of all containers in the pod that are visble in Runtime. GetPodStatus(uid types.UID, name, namespace string) (*PodStatus, error) diff --git a/pkg/kubelet/container/testing/fake_runtime.go b/pkg/kubelet/container/testing/fake_runtime.go index ce38cd2d65e..b73e195ce6f 100644 --- a/pkg/kubelet/container/testing/fake_runtime.go +++ b/pkg/kubelet/container/testing/fake_runtime.go @@ -205,7 +205,7 @@ func (f *FakeRuntime) SyncPod(pod *api.Pod, _ api.PodStatus, _ *PodStatus, _ []a return } -func (f *FakeRuntime) KillPod(pod *api.Pod, runningPod Pod) error { +func (f *FakeRuntime) KillPod(pod *api.Pod, runningPod Pod, gracePeriodOverride *int64) error { f.Lock() defer f.Unlock() diff --git a/pkg/kubelet/container/testing/runtime_mock.go b/pkg/kubelet/container/testing/runtime_mock.go index 496e3222ac4..f2fe6cba1fd 100644 --- a/pkg/kubelet/container/testing/runtime_mock.go +++ b/pkg/kubelet/container/testing/runtime_mock.go @@ -68,8 +68,8 @@ func (r *Mock) SyncPod(pod *api.Pod, apiStatus api.PodStatus, status *PodStatus, return args.Get(0).(PodSyncResult) } -func (r *Mock) KillPod(pod *api.Pod, runningPod Pod) error { - args := r.Called(pod, runningPod) +func (r *Mock) KillPod(pod *api.Pod, runningPod Pod, gracePeriodOverride *int64) error { + args := r.Called(pod, runningPod, gracePeriodOverride) return args.Error(0) } diff --git a/pkg/kubelet/dockertools/manager.go b/pkg/kubelet/dockertools/manager.go index 730f56c27f7..6f24ec679fa 100644 --- a/pkg/kubelet/dockertools/manager.go +++ b/pkg/kubelet/dockertools/manager.go @@ -1169,14 +1169,15 @@ func (dm *DockerManager) GetContainerIP(containerID, interfaceName string) (stri // We can only deprecate this after refactoring kubelet. // TODO(random-liu): After using pod status for KillPod(), we can also remove the kubernetesPodLabel, because all the needed information should have // been extract from new labels and stored in pod status. -func (dm *DockerManager) KillPod(pod *api.Pod, runningPod kubecontainer.Pod) error { - result := dm.killPodWithSyncResult(pod, runningPod) +// only hard eviction scenarios should provide a grace period override, all other code paths must pass nil. +func (dm *DockerManager) KillPod(pod *api.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) error { + result := dm.killPodWithSyncResult(pod, runningPod, gracePeriodOverride) return result.Error() } // TODO(random-liu): This is just a temporary function, will be removed when we acturally add PodSyncResult // NOTE(random-liu): The pod passed in could be *nil* when kubelet restarted. -func (dm *DockerManager) killPodWithSyncResult(pod *api.Pod, runningPod kubecontainer.Pod) (result kubecontainer.PodSyncResult) { +func (dm *DockerManager) killPodWithSyncResult(pod *api.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) (result kubecontainer.PodSyncResult) { // Send the kills in parallel since they may take a long time. // There may be len(runningPod.Containers) or len(runningPod.Containers)-1 of result in the channel containerResults := make(chan *kubecontainer.SyncResult, len(runningPod.Containers)) @@ -1212,7 +1213,7 @@ func (dm *DockerManager) killPodWithSyncResult(pod *api.Pod, runningPod kubecont } killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, container.Name) - err := dm.KillContainerInPod(container.ID, containerSpec, pod, "Need to kill pod.") + err := dm.KillContainerInPod(container.ID, containerSpec, pod, "Need to kill pod.", gracePeriodOverride) if err != nil { killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error()) glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, runningPod.ID) @@ -1245,7 +1246,7 @@ func (dm *DockerManager) killPodWithSyncResult(pod *api.Pod, runningPod kubecont } killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, networkContainer.Name) result.AddSyncResult(killContainerResult) - if err := dm.KillContainerInPod(networkContainer.ID, networkSpec, pod, "Need to kill pod."); err != nil { + if err := dm.KillContainerInPod(networkContainer.ID, networkSpec, pod, "Need to kill pod.", gracePeriodOverride); err != nil { killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error()) glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, runningPod.ID) } @@ -1255,7 +1256,7 @@ func (dm *DockerManager) killPodWithSyncResult(pod *api.Pod, runningPod kubecont // KillContainerInPod kills a container in the pod. It must be passed either a container ID or a container and pod, // and will attempt to lookup the other information if missing. -func (dm *DockerManager) KillContainerInPod(containerID kubecontainer.ContainerID, container *api.Container, pod *api.Pod, message string) error { +func (dm *DockerManager) KillContainerInPod(containerID kubecontainer.ContainerID, container *api.Container, pod *api.Pod, message string, gracePeriodOverride *int64) error { switch { case containerID.IsEmpty(): // Locate the container. @@ -1287,12 +1288,14 @@ func (dm *DockerManager) KillContainerInPod(containerID kubecontainer.ContainerI pod = storedPod } } - return dm.killContainer(containerID, container, pod, message) + return dm.killContainer(containerID, container, pod, message, gracePeriodOverride) } // killContainer accepts a containerID and an optional container or pod containing shutdown policies. Invoke -// KillContainerInPod if information must be retrieved first. -func (dm *DockerManager) killContainer(containerID kubecontainer.ContainerID, container *api.Container, pod *api.Pod, reason string) error { +// KillContainerInPod if information must be retrieved first. It is only valid to provide a grace period override +// during hard eviction scenarios. All other code paths in kubelet must never provide a grace period override otherwise +// data corruption could occur in the end-user application. +func (dm *DockerManager) killContainer(containerID kubecontainer.ContainerID, container *api.Container, pod *api.Pod, reason string, gracePeriodOverride *int64) error { ID := containerID.ID name := ID if container != nil { @@ -1333,10 +1336,20 @@ func (dm *DockerManager) killContainer(containerID kubecontainer.ContainerID, co gracePeriod -= int64(unversioned.Now().Sub(start.Time).Seconds()) } - // always give containers a minimal shutdown window to avoid unnecessary SIGKILLs - if gracePeriod < minimumGracePeriodInSeconds { - gracePeriod = minimumGracePeriodInSeconds + // if the caller did not specify a grace period override, we ensure that the grace period + // is not less than the minimal shutdown window to avoid unnecessary SIGKILLs. if a caller + // did provide an override, we always set the gracePeriod to that value. the only valid + // time to send an override is during eviction scenarios where we want to do a hard kill of + // a container because of resource exhaustion for incompressible resources (i.e. disk, memory). + if gracePeriodOverride == nil { + if gracePeriod < minimumGracePeriodInSeconds { + gracePeriod = minimumGracePeriodInSeconds + } + } else { + gracePeriod = *gracePeriodOverride + glog.V(2).Infof("Killing container %q, but using %d second grace period override", name, gracePeriod) } + err := dm.client.StopContainer(ID, int(gracePeriod)) if err == nil { glog.V(2).Infof("Container %q exited after %s", name, unversioned.Now().Sub(start.Time)) @@ -1460,7 +1473,7 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe handlerErr := dm.runner.Run(id, pod, container, container.Lifecycle.PostStart) if handlerErr != nil { err := fmt.Errorf("PostStart handler: %v", handlerErr) - dm.KillContainerInPod(id, container, pod, err.Error()) + dm.KillContainerInPod(id, container, pod, err.Error(), nil) return kubecontainer.ContainerID{}, err } } @@ -1795,7 +1808,7 @@ func (dm *DockerManager) SyncPod(pod *api.Pod, _ api.PodStatus, podStatus *kubec // Killing phase: if we want to start new infra container, or nothing is running kill everything (including infra container) // TODO(random-liu): We'll use pod status directly in the future - killResult := dm.killPodWithSyncResult(pod, kubecontainer.ConvertPodStatusToRunningPod(podStatus)) + killResult := dm.killPodWithSyncResult(pod, kubecontainer.ConvertPodStatusToRunningPod(podStatus), nil) result.AddPodSyncResult(killResult) if killResult.Error() != nil { return @@ -1819,7 +1832,7 @@ func (dm *DockerManager) SyncPod(pod *api.Pod, _ api.PodStatus, podStatus *kubec } killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, containerStatus.Name) result.AddSyncResult(killContainerResult) - if err := dm.KillContainerInPod(containerStatus.ID, podContainer, pod, killMessage); err != nil { + if err := dm.KillContainerInPod(containerStatus.ID, podContainer, pod, killMessage, nil); err != nil { killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error()) glog.Errorf("Error killing container %q(id=%q) for pod %q: %v", containerStatus.Name, containerStatus.ID, format.Pod(pod), err) return @@ -1871,7 +1884,7 @@ func (dm *DockerManager) SyncPod(pod *api.Pod, _ api.PodStatus, podStatus *kubec result.AddSyncResult(killContainerResult) if delErr := dm.KillContainerInPod(kubecontainer.ContainerID{ ID: string(podInfraContainerID), - Type: "docker"}, nil, pod, message); delErr != nil { + Type: "docker"}, nil, pod, message, nil); delErr != nil { killContainerResult.Fail(kubecontainer.ErrKillContainer, delErr.Error()) glog.Warningf("Clear infra container failed for pod %q: %v", format.Pod(pod), delErr) } diff --git a/pkg/kubelet/dockertools/manager_test.go b/pkg/kubelet/dockertools/manager_test.go index 842b7f0c760..885cd11ab0d 100644 --- a/pkg/kubelet/dockertools/manager_test.go +++ b/pkg/kubelet/dockertools/manager_test.go @@ -407,7 +407,7 @@ func TestKillContainerInPod(t *testing.T) { fakeDocker.SetFakeRunningContainers(containers) - if err := manager.KillContainerInPod(kubecontainer.ContainerID{}, &pod.Spec.Containers[0], pod, "test kill container in pod."); err != nil { + if err := manager.KillContainerInPod(kubecontainer.ContainerID{}, &pod.Spec.Containers[0], pod, "test kill container in pod.", nil); err != nil { t.Errorf("unexpected error: %v", err) } // Assert the container has been stopped. @@ -470,7 +470,7 @@ func TestKillContainerInPodWithPreStop(t *testing.T) { containerToKill := containers[0] fakeDocker.SetFakeRunningContainers(containers) - if err := manager.KillContainerInPod(kubecontainer.ContainerID{}, &pod.Spec.Containers[0], pod, "test kill container with preStop."); err != nil { + if err := manager.KillContainerInPod(kubecontainer.ContainerID{}, &pod.Spec.Containers[0], pod, "test kill container with preStop.", nil); err != nil { t.Errorf("unexpected error: %v", err) } // Assert the container has been stopped. @@ -507,7 +507,7 @@ func TestKillContainerInPodWithError(t *testing.T) { fakeDocker.SetFakeRunningContainers(containers) fakeDocker.InjectError("stop", fmt.Errorf("sample error")) - if err := manager.KillContainerInPod(kubecontainer.ContainerID{}, &pod.Spec.Containers[0], pod, "test kill container with error."); err == nil { + if err := manager.KillContainerInPod(kubecontainer.ContainerID{}, &pod.Spec.Containers[0], pod, "test kill container with error.", nil); err == nil { t.Errorf("expected error, found nil") } } @@ -1107,7 +1107,7 @@ func TestGetRestartCount(t *testing.T) { if cs == nil { t.Fatalf("Can't find status for container %q", containerName) } - dm.KillContainerInPod(cs.ID, &pod.Spec.Containers[0], pod, "test container restart count.") + dm.KillContainerInPod(cs.ID, &pod.Spec.Containers[0], pod, "test container restart count.", nil) } // Container "bar" starts the first time. // TODO: container lists are expected to be sorted reversely by time. diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 605f20ed361..fb822ee9a37 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1649,14 +1649,14 @@ func parseResolvConf(reader io.Reader, dnsScrubber dnsScrubber) (nameservers []s // One of the following aruguements must be non-nil: runningPod, status. // TODO: Modify containerRuntime.KillPod() to accept the right arguments. -func (kl *Kubelet) killPod(pod *api.Pod, runningPod *kubecontainer.Pod, status *kubecontainer.PodStatus) error { +func (kl *Kubelet) killPod(pod *api.Pod, runningPod *kubecontainer.Pod, status *kubecontainer.PodStatus, gracePeriodOverride *int64) error { var p kubecontainer.Pod if runningPod != nil { p = *runningPod } else if status != nil { p = kubecontainer.ConvertPodStatusToRunningPod(status) } - return kl.containerRuntime.KillPod(pod, p) + return kl.containerRuntime.KillPod(pod, p, gracePeriodOverride) } // makePodDataDirs creates the dirs for the pod datas. @@ -1734,7 +1734,7 @@ func (kl *Kubelet) syncPod(pod *api.Pod, mirrorPod *api.Pod, podStatus *kubecont // Kill pod if it should not be running if err := canRunPod(pod); err != nil || pod.DeletionTimestamp != nil || apiPodStatus.Phase == api.PodFailed { - if err := kl.killPod(pod, nil, podStatus); err != nil { + if err := kl.killPod(pod, nil, podStatus, nil); err != nil { utilruntime.HandleError(err) } return err @@ -2258,7 +2258,7 @@ func (kl *Kubelet) podKiller() { ch <- runningPod.ID }() glog.V(2).Infof("Killing unwanted pod %q", runningPod.Name) - err := kl.killPod(apiPod, runningPod, nil) + err := kl.killPod(apiPod, runningPod, nil, nil) if err != nil { glog.Errorf("Failed killing the pod %q: %v", runningPod.Name, err) } diff --git a/pkg/kubelet/rkt/rkt.go b/pkg/kubelet/rkt/rkt.go index 4652f47a222..5c8b236ffad 100644 --- a/pkg/kubelet/rkt/rkt.go +++ b/pkg/kubelet/rkt/rkt.go @@ -1041,7 +1041,7 @@ func (r *Runtime) RunPod(pod *api.Pod, pullSecrets []api.Secret) error { // This is a temporary solution until we have a clean design on how // kubelet handles events. See https://github.com/kubernetes/kubernetes/issues/23084. if err := r.runLifecycleHooks(pod, runtimePod, lifecyclePostStartHook); err != nil { - if errKill := r.KillPod(pod, *runtimePod); errKill != nil { + if errKill := r.KillPod(pod, *runtimePod, nil); errKill != nil { return errors.NewAggregate([]error{err, errKill}) } return err @@ -1285,7 +1285,8 @@ func (r *Runtime) waitPreStopHooks(pod *api.Pod, runningPod *kubecontainer.Pod) } // KillPod invokes 'systemctl kill' to kill the unit that runs the pod. -func (r *Runtime) KillPod(pod *api.Pod, runningPod kubecontainer.Pod) error { +// TODO: add support for gracePeriodOverride which is used in eviction scenarios +func (r *Runtime) KillPod(pod *api.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) error { glog.V(4).Infof("Rkt is killing pod: name %q.", runningPod.Name) if len(runningPod.Containers) == 0 { @@ -1416,7 +1417,7 @@ func (r *Runtime) SyncPod(pod *api.Pod, podStatus api.PodStatus, internalPodStat if restartPod { // Kill the pod only if the pod is actually running. if len(runningPod.Containers) > 0 { - if err = r.KillPod(pod, runningPod); err != nil { + if err = r.KillPod(pod, runningPod, nil); err != nil { return } }