Fix the deletion of rejected pods

Michal Wozniak 2023-06-06 11:30:10 +02:00
parent 3e28404008
commit e3ee9b9adc
3 changed files with 184 additions and 3 deletions


@@ -1173,6 +1173,16 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
metrics.RestartedPodTotal.WithLabelValues("true").Add(float64(restartCountStatic))
metrics.RestartedPodTotal.WithLabelValues("").Add(float64(restartCount))
// Complete termination of deleted pods that are not runtime pods (don't have
// running containers), are terminal, and are not known to the pod workers.
// An example is pods rejected during kubelet admission that have never
// started before (i.e. do not have an orphaned pod).
// Triggering TerminatePod allows the status manager to proceed with the
// force-deletion of such pods.
for _, pod := range kl.filterTerminalPodsToDelete(allPods, runningRuntimePods, workingPods) {
klog.V(3).InfoS("Completing termination of terminal pods scheduled for deletion", "pod", klog.KObj(pod), "podUID", pod.UID)
kl.statusManager.TerminatePod(pod)
}
// Finally, terminate any pods that are observed in the runtime but not present in the list of
// known running pods from config. If we do terminate running runtime pods that will happen
// asynchronously in the background and those will be processed in the next invocation of
@@ -1245,6 +1255,36 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
return nil
}
// filterTerminalPodsToDelete returns terminal pods with a deletion timestamp
// that are not runtime pods and are not known to the pod workers.
func (kl *Kubelet) filterTerminalPodsToDelete(allPods []*v1.Pod, runningRuntimePods []*kubecontainer.Pod, workingPods map[types.UID]PodWorkerSync) map[types.UID]*v1.Pod {
terminalPodsToDelete := make(map[types.UID]*v1.Pod)
for _, pod := range allPods {
if pod.DeletionTimestamp == nil {
// skip pods which don't have a deletion timestamp
continue
}
if !kl.isAdmittedPodTerminal(pod) {
// skip non-terminal pods
continue
}
if _, knownPod := workingPods[pod.UID]; knownPod {
// skip pods known to pod workers
continue
}
if _, knownPod := kl.statusManager.GetPodStatus(pod.UID); !knownPod {
// skip pods unknown to pod status manager
continue
}
terminalPodsToDelete[pod.UID] = pod
}
for _, runningRuntimePod := range runningRuntimePods {
// skip running runtime pods - they are handled by a dedicated routine
// which terminates the containers
delete(terminalPodsToDelete, runningRuntimePod.ID)
}
return terminalPodsToDelete
}
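
For readability, a minimal standalone sketch of the same ordering of checks follows. Everything in it (the package name, filterTerminalDeletedPodsSketch, and its parameters) is hypothetical and is not part of this change; it only mirrors the logic of filterTerminalPodsToDelete above, with the kubelet's collaborators replaced by plain inputs.

package podcleanupsketch

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
)

// filterTerminalDeletedPodsSketch keeps only pods that carry a deletion
// timestamp, are terminal, are not owned by a pod worker, are known to the
// status manager, and have no running runtime pod.
func filterTerminalDeletedPodsSketch(
	allPods []*v1.Pod,
	runningRuntimeUIDs map[types.UID]bool, // UIDs the runtime still reports as running
	workingUIDs map[types.UID]bool, // UIDs known to the pod workers
	isTerminal func(*v1.Pod) bool, // stand-in for kl.isAdmittedPodTerminal
	hasStatus func(types.UID) bool, // stand-in for kl.statusManager.GetPodStatus
) map[types.UID]*v1.Pod {
	out := make(map[types.UID]*v1.Pod)
	for _, pod := range allPods {
		if pod.DeletionTimestamp == nil {
			continue // not scheduled for deletion
		}
		if !isTerminal(pod) {
			continue // phase is not terminal yet
		}
		if workingUIDs[pod.UID] {
			continue // a pod worker already owns this pod's termination
		}
		if !hasStatus(pod.UID) {
			continue // unknown to the status manager, nothing to complete
		}
		if runningRuntimeUIDs[pod.UID] {
			continue // containers still running; handled by the runtime-pod path
		}
		out[pod.UID] = pod
	}
	return out
}
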
// splitPodsByStatic separates a list of desired pods from the pod manager into
// regular or static pods. Mirror pods are not valid config sources (a mirror pod
// being created cannot cause the Kubelet to start running a static pod) and are


@@ -897,14 +897,16 @@ func (m *manager) canBeDeleted(pod *v1.Pod, status v1.PodStatus, podIsFinished b
if pod.DeletionTimestamp == nil || kubetypes.IsMirrorPod(pod) {
return false
}
// Delay deletion of pods until the phase is terminal.
// Delay deletion of pods until the phase is terminal, based on pod.Status,
// which comes from the pod manager.
if !podutil.IsPodPhaseTerminal(pod.Status.Phase) {
klog.V(3).InfoS("Delaying pod deletion as the phase is non-terminal", "phase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
// For debugging purposes, we also log the kubelet's local phase when the deletion is delayed.
klog.V(3).InfoS("Delaying pod deletion as the phase is non-terminal", "phase", pod.Status.Phase, "localPhase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
return false
}
// If this is an update completing pod termination then we know the pod termination is finished.
if podIsFinished {
klog.V(3).InfoS("The pod termination is finished as SyncTerminatedPod completes its execution", "phase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
klog.V(3).InfoS("The pod termination is finished as SyncTerminatedPod completes its execution", "phase", pod.Status.Phase, "localPhase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
return true
}
return false
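
The effect of this hunk is easier to see as a standalone predicate. The sketch below is illustrative only (canBeDeletedSketch and isMirror are hypothetical names; isMirror stands in for kubetypes.IsMirrorPod); the terminal check inlines the phases that podutil.IsPodPhaseTerminal treats as terminal, and pod.Status.Phase is the phase coming from the pod manager rather than the kubelet's locally computed status.

package podcleanupsketch

import v1 "k8s.io/api/core/v1"

// canBeDeletedSketch reports whether the status manager may proceed with the
// force-deletion of the pod: it must already be marked for deletion, must not
// be a mirror pod, its API-server phase must be terminal, and its termination
// must be finished (podIsFinished is set once SyncTerminatedPod completes).
func canBeDeletedSketch(pod *v1.Pod, isMirror, podIsFinished bool) bool {
	if pod.DeletionTimestamp == nil || isMirror {
		return false
	}
	if pod.Status.Phase != v1.PodSucceeded && pod.Status.Phase != v1.PodFailed {
		return false // delay deletion until the API phase is terminal
	}
	return podIsFinished
}
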


@@ -381,6 +381,145 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
return checkMirrorPodDisappear(ctx, f.ClientSet, pod.Name, pod.Namespace)
}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.BeNil())
})
// Regression test for https://issues.k8s.io/118472
ginkgo.It("should force-delete non-admissible pods created and deleted during kubelet restart", func(ctx context.Context) {
podName := "rejected-deleted-pod" + string(uuid.NewUUID())
gracePeriod := int64(30)
nodeName := getNodeName(ctx, f)
podSpec := e2epod.MustMixinRestrictedPodSecurity(&v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
Namespace: f.Namespace.Name,
},
Spec: v1.PodSpec{
NodeName: nodeName,
NodeSelector: map[string]string{
"this-label": "does-not-exist-on-any-nodes",
},
TerminationGracePeriodSeconds: &gracePeriod,
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Name: podName,
Image: imageutils.GetPauseImageName(),
},
},
},
})
ginkgo.By("Stopping the kubelet")
startKubelet := stopKubelet()
// wait until the kubelet health check fails
gomega.Eventually(ctx, func() bool {
return kubeletHealthCheck(kubeletHealthCheckURL)
}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeFalse())
// Create the pod bound to the node. It will remain in the Pending
// phase as Kubelet is down.
ginkgo.By(fmt.Sprintf("Creating a pod (%v/%v)", f.Namespace.Name, podName))
pod := e2epod.NewPodClient(f).Create(ctx, podSpec)
ginkgo.By(fmt.Sprintf("Deleting the pod (%v/%v) to set a deletion timestamp", pod.Namespace, pod.Name))
err := e2epod.NewPodClient(f).Delete(ctx, pod.Name, metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod})
framework.ExpectNoError(err, "Failed to delete the pod: %q", pod.Name)
// Restart Kubelet so that it proceeds with deletion
ginkgo.By("Starting the kubelet")
startKubelet()
// wait until the kubelet health check succeeds
gomega.Eventually(ctx, func() bool {
return kubeletHealthCheck(kubeletHealthCheckURL)
}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeTrue())
// Wait for the Kubelet to be ready.
gomega.Eventually(ctx, func(ctx context.Context) bool {
nodes, err := e2enode.TotalReady(ctx, f.ClientSet)
framework.ExpectNoError(err)
return nodes == 1
}, time.Minute, f.Timeouts.Poll).Should(gomega.BeTrue())
ginkgo.By(fmt.Sprintf("After the kubelet is restarted, verify the pod (%v/%v) is deleted by kubelet", pod.Namespace, pod.Name))
gomega.Eventually(ctx, func(ctx context.Context) error {
return checkMirrorPodDisappear(ctx, f.ClientSet, pod.Name, pod.Namespace)
}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.BeNil())
})
// Regression test for an extended scenario of https://issues.k8s.io/118472
ginkgo.It("should force-delete non-admissible pods that were admitted and running before kubelet restart", func(ctx context.Context) {
nodeLabelKey := "custom-label-key-required"
nodeLabelValueRequired := "custom-label-value-required-for-admission"
podName := "rejected-deleted-run" + string(uuid.NewUUID())
gracePeriod := int64(30)
nodeName := getNodeName(ctx, f)
pod := e2epod.MustMixinRestrictedPodSecurity(&v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
Namespace: f.Namespace.Name,
},
Spec: v1.PodSpec{
NodeSelector: map[string]string{
nodeLabelKey: nodeLabelValueRequired,
},
NodeName: nodeName,
TerminationGracePeriodSeconds: &gracePeriod,
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Name: podName,
Image: imageutils.GetPauseImageName(),
},
},
},
})
ginkgo.By(fmt.Sprintf("Adding node label for node (%v) to allow admission of pod (%v/%v)", nodeName, f.Namespace.Name, podName))
e2enode.AddOrUpdateLabelOnNode(f.ClientSet, nodeName, nodeLabelKey, nodeLabelValueRequired)
ginkgo.DeferCleanup(func() { e2enode.RemoveLabelOffNode(f.ClientSet, nodeName, nodeLabelKey) })
// Create the pod bound to the node. It will start, but will be rejected after kubelet restart.
ginkgo.By(fmt.Sprintf("Creating a pod (%v/%v)", f.Namespace.Name, podName))
pod = e2epod.NewPodClient(f).Create(ctx, pod)
ginkgo.By(fmt.Sprintf("Waiting for the pod (%v/%v) to be running", f.Namespace.Name, pod.Name))
err := e2epod.WaitForPodNameRunningInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name)
framework.ExpectNoError(err, "Failed to await for the pod to be running: (%v/%v)", f.Namespace.Name, pod.Name)
ginkgo.By("Stopping the kubelet")
startKubelet := stopKubelet()
// wait until the kubelet health check fails
gomega.Eventually(ctx, func() bool {
return kubeletHealthCheck(kubeletHealthCheckURL)
}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeFalse())
ginkgo.By(fmt.Sprintf("Deleting the pod (%v/%v) to set a deletion timestamp", pod.Namespace, pod.Name))
err = e2epod.NewPodClient(f).Delete(ctx, pod.Name, metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod})
framework.ExpectNoError(err, "Failed to delete the pod: %q", pod.Name)
ginkgo.By(fmt.Sprintf("Removing node label for node (%v) to ensure the pod (%v/%v) is rejected after kubelet restart", nodeName, f.Namespace.Name, podName))
e2enode.RemoveLabelOffNode(f.ClientSet, nodeName, nodeLabelKey)
// Restart Kubelet so that it proceeds with deletion
ginkgo.By("Starting the kubelet")
startKubelet()
// wait until the kubelet health check succeeds
gomega.Eventually(ctx, func() bool {
return kubeletHealthCheck(kubeletHealthCheckURL)
}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeTrue())
// Wait for the Kubelet to be ready.
gomega.Eventually(ctx, func(ctx context.Context) bool {
nodes, err := e2enode.TotalReady(ctx, f.ClientSet)
framework.ExpectNoError(err)
return nodes == 1
}, time.Minute, f.Timeouts.Poll).Should(gomega.BeTrue())
ginkgo.By(fmt.Sprintf("Once Kubelet is restarted, verify the pod (%v/%v) is deleted by kubelet", pod.Namespace, pod.Name))
gomega.Eventually(ctx, func(ctx context.Context) error {
return checkMirrorPodDisappear(ctx, f.ClientSet, pod.Name, pod.Namespace)
}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.BeNil())
})
})
})