diff --git a/hack/verify-prometheus-imports.sh b/hack/verify-prometheus-imports.sh index 36ab12937ef..ba1321e1e41 100755 --- a/hack/verify-prometheus-imports.sh +++ b/hack/verify-prometheus-imports.sh @@ -70,6 +70,7 @@ allowed_prometheus_importers=( ./staging/src/k8s.io/component-base/metrics/value.go ./staging/src/k8s.io/component-base/metrics/wrappers.go ./test/e2e/apimachinery/flowcontrol.go + ./test/e2e_node/mirror_pod_grace_period_test.go ./test/e2e/node/pods.go ./test/e2e_node/resource_metrics_test.go ./test/instrumentation/main_test.go diff --git a/pkg/kubelet/container/runtime_cache.go b/pkg/kubelet/container/runtime_cache.go index 118e10728e0..2615d68f730 100644 --- a/pkg/kubelet/container/runtime_cache.go +++ b/pkg/kubelet/container/runtime_cache.go @@ -23,11 +23,6 @@ import ( "time" ) -var ( - // TODO(yifan): Maybe set the them as parameters for NewCache(). - defaultCachePeriod = time.Second * 2 -) - // RuntimeCache is in interface for obtaining cached Pods. type RuntimeCache interface { GetPods(context.Context) ([]*Pod, error) @@ -39,9 +34,10 @@ type podsGetter interface { } // NewRuntimeCache creates a container runtime cache. -func NewRuntimeCache(getter podsGetter) (RuntimeCache, error) { +func NewRuntimeCache(getter podsGetter, cachePeriod time.Duration) (RuntimeCache, error) { return &runtimeCache{ - getter: getter, + getter: getter, + cachePeriod: cachePeriod, }, nil } @@ -53,6 +49,8 @@ type runtimeCache struct { sync.Mutex // The underlying container runtime used to update the cache. getter podsGetter + // The interval after which the cache should be refreshed. + cachePeriod time.Duration // Last time when cache was updated. cacheTime time.Time // The content of the cache. @@ -64,7 +62,7 @@ type runtimeCache struct { func (r *runtimeCache) GetPods(ctx context.Context) ([]*Pod, error) { r.Lock() defer r.Unlock() - if time.Since(r.cacheTime) > defaultCachePeriod { + if time.Since(r.cacheTime) > r.cachePeriod { if err := r.updateCache(ctx); err != nil { return nil, err } diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index f6d6d2c37e7..3c6a3671351 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -148,7 +148,12 @@ const ( // Duration at which housekeeping failed to satisfy the invariant that // housekeeping should be fast to avoid blocking pod config (while // housekeeping is running no new pods are started or deleted). - housekeepingWarningDuration = time.Second * 15 + housekeepingWarningDuration = time.Second * 1 + + // Period after which the runtime cache expires - set to slightly longer than + // the expected length between housekeeping periods, which explicitly refreshes + // the cache. + runtimeCacheRefreshPeriod = housekeepingPeriod + housekeepingWarningDuration // Period for performing eviction monitoring. // ensure this is kept in sync with internal cadvisor housekeeping. 
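The runtime cache change above replaces the package-level defaultCachePeriod with a caller-supplied cachePeriod, and the kubelet now passes runtimeCacheRefreshPeriod (housekeepingPeriod + housekeepingWarningDuration) so the cache only expires on its own slightly after a housekeeping pass, which refreshes it explicitly. Below is a minimal, self-contained Go sketch of that staleness-check pattern; stringCache and fetch are illustrative names, not types from the Kubernetes codebase.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// stringCache caches the result of fetch for cachePeriod, mirroring the
// getter/cachePeriod/cacheTime fields the patch adds to runtimeCache.
type stringCache struct {
	sync.Mutex
	fetch       func(context.Context) ([]string, error)
	cachePeriod time.Duration // interval after which the cache is refreshed
	cacheTime   time.Time     // last time the cache was updated
	items       []string      // cached content
}

// Get returns the cached items, refreshing them only when they are older
// than cachePeriod.
func (c *stringCache) Get(ctx context.Context) ([]string, error) {
	c.Lock()
	defer c.Unlock()
	if time.Since(c.cacheTime) > c.cachePeriod {
		items, err := c.fetch(ctx)
		if err != nil {
			return nil, err
		}
		c.items, c.cacheTime = items, time.Now()
	}
	return c.items, nil
}

func main() {
	fetches := 0
	cache := &stringCache{
		fetch: func(context.Context) ([]string, error) {
			fetches++
			return []string{"pod-a", "pod-b"}, nil
		},
		// Analogous to runtimeCacheRefreshPeriod: slightly longer than the
		// period of the loop that refreshes the cache explicitly.
		cachePeriod: 3 * time.Second,
	}
	ctx := context.Background()
	cache.Get(ctx)                   // stale (zero cacheTime), so it fetches
	cache.Get(ctx)                   // served from the cache
	fmt.Println("fetches:", fetches) // prints: fetches: 1
}

With this shape, the periodic expiry acts as a backstop: the primary refresh path in the patch is the explicit ForceUpdateIfOlder call that HandlePodCleanups makes during housekeeping.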
@@ -636,10 +641,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, klet.reasonCache = NewReasonCache() klet.workQueue = queue.NewBasicWorkQueue(klet.clock) klet.podWorkers = newPodWorkers( - klet.syncPod, - klet.syncTerminatingPod, - klet.syncTerminatedPod, - + klet, kubeDeps.Recorder, klet.workQueue, klet.resyncInterval, @@ -685,7 +687,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, klet.streamingRuntime = runtime klet.runner = runtime - runtimeCache, err := kubecontainer.NewRuntimeCache(klet.containerRuntime) + runtimeCache, err := kubecontainer.NewRuntimeCache(klet.containerRuntime, runtimeCacheRefreshPeriod) if err != nil { return nil, err } @@ -1562,17 +1564,18 @@ func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) { kl.syncLoop(ctx, updates, kl) } -// syncPod is the transaction script for the sync of a single pod (setting up) +// SyncPod is the transaction script for the sync of a single pod (setting up) // a pod. This method is reentrant and expected to converge a pod towards the // desired state of the spec. The reverse (teardown) is handled in -// syncTerminatingPod and syncTerminatedPod. If syncPod exits without error, +// SyncTerminatingPod and SyncTerminatedPod. If SyncPod exits without error, // then the pod runtime state is in sync with the desired configuration state -// (pod is running). If syncPod exits with a transient error, the next -// invocation of syncPod is expected to make progress towards reaching the -// runtime state. syncPod exits with isTerminal when the pod was detected to +// (pod is running). If SyncPod exits with a transient error, the next +// invocation of SyncPod is expected to make progress towards reaching the +// desired state. SyncPod exits with isTerminal when the pod was detected to // have reached a terminal lifecycle phase due to container exits (for -// RestartNever or RestartOnFailure) and the next method invoked will by -// syncTerminatingPod. +// RestartNever or RestartOnFailure) and the next method invoked will be +// SyncTerminatingPod. If the pod terminates for any other reason, SyncPod +// will receive a context cancellation and should exit as soon as possible. // // Arguments: // @@ -1585,7 +1588,7 @@ func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) { // // podStatus - the most recent pod status observed for this pod which can // be used to determine the set of actions that should be taken during -// this loop of syncPod +// this loop of SyncPod // // The workflow is: // - If the pod is being created, record pod worker start latency @@ -1605,18 +1608,18 @@ func (kl *Kubelet) Run(updates <-chan kubetypes.PodUpdate) { // - Update the traffic shaping for the pod's ingress and egress limits // // If any step of this workflow errors, the error is returned, and is repeated -// on the next syncPod call. +// on the next SyncPod call. // // This operation writes all events that are dispatched in order to provide // the most accurate information possible about an error situation to aid debugging. // Callers should not write an event if this operation returns an error. 
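Passing klet to newPodWorkers in place of the three sync functions suggests the pod workers now drive the lifecycle through a single interface that the Kubelet satisfies with the exported SyncPod, SyncTerminatingPod, and SyncTerminatedPod methods documented above. The following is a rough, self-contained sketch of that refactoring pattern only; podSyncer, worker, and fakeKubelet are illustrative names, not the actual Kubernetes types.

package main

import (
	"context"
	"fmt"
)

// podSyncer groups the lifecycle callbacks behind one interface instead of
// three independent function fields.
type podSyncer interface {
	SyncPod(ctx context.Context, name string) (terminal bool, err error)
	SyncTerminatingPod(ctx context.Context, name string) error
	SyncTerminatedPod(ctx context.Context, name string) error
}

// worker drives a pod through setup, teardown, and cleanup via whatever
// implements podSyncer.
type worker struct {
	syncer podSyncer
}

func (w *worker) run(ctx context.Context, name string) error {
	terminal, err := w.syncer.SyncPod(ctx, name)
	if err != nil || !terminal {
		return err
	}
	if err := w.syncer.SyncTerminatingPod(ctx, name); err != nil {
		return err
	}
	return w.syncer.SyncTerminatedPod(ctx, name)
}

// fakeKubelet stands in for the component that owns the sync logic and is
// handed to the worker as a whole, the way klet is passed to newPodWorkers.
type fakeKubelet struct{}

func (k *fakeKubelet) SyncPod(_ context.Context, name string) (bool, error) {
	fmt.Println("sync", name)
	return true, nil // pretend the pod reached a terminal phase
}

func (k *fakeKubelet) SyncTerminatingPod(_ context.Context, name string) error {
	fmt.Println("terminating", name)
	return nil
}

func (k *fakeKubelet) SyncTerminatedPod(_ context.Context, name string) error {
	fmt.Println("terminated", name)
	return nil
}

func main() {
	w := &worker{syncer: &fakeKubelet{}}
	if err := w.run(context.Background(), "pod1"); err != nil {
		fmt.Println("error:", err)
	}
}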
-func (kl *Kubelet) syncPod(_ context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (isTerminal bool, err error) { +func (kl *Kubelet) SyncPod(_ context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (isTerminal bool, err error) { // TODO(#113606): connect this with the incoming context parameter, which comes from the pod worker. // Currently, using that context causes test failures. ctx := context.TODO() - klog.V(4).InfoS("syncPod enter", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.V(4).InfoS("SyncPod enter", "pod", klog.KObj(pod), "podUID", pod.UID) defer func() { - klog.V(4).InfoS("syncPod exit", "pod", klog.KObj(pod), "podUID", pod.UID, "isTerminal", isTerminal) + klog.V(4).InfoS("SyncPod exit", "pod", klog.KObj(pod), "podUID", pod.UID, "isTerminal", isTerminal) }() // Latency measurements for the main workflow are relative to the @@ -1871,35 +1874,21 @@ func (kl *Kubelet) syncPod(_ context.Context, updateType kubetypes.SyncPodType, return false, nil } -// syncTerminatingPod is expected to terminate all running containers in a pod. Once this method -// returns without error, the pod's local state can be safely cleaned up. If runningPod is passed, -// we perform no status updates. -func (kl *Kubelet) syncTerminatingPod(_ context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, runningPod *kubecontainer.Pod, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error { +// SyncTerminatingPod is expected to terminate all running containers in a pod. Once this method +// returns without error, the pod is considered to be terminated and it will be safe to clean up any +// pod state that is tied to the lifetime of running containers. The next method invoked will be +// SyncTerminatedPod. This method is expected to return within the grace period provided and the +// provided context may be cancelled if the duration is exceeded. The method may also be interrupted +// with a context cancellation if the grace period is shortened by the user or the kubelet (such as +// during eviction). This method is not guaranteed to be called if a pod is force deleted from the +// configuration and the kubelet is restarted - SyncTerminatingRuntimePod handles those orphaned +// pods. +func (kl *Kubelet) SyncTerminatingPod(_ context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error { // TODO(#113606): connect this with the incoming context parameter, which comes from the pod worker. // Currently, using that context causes test failures. 
ctx := context.Background() - klog.V(4).InfoS("syncTerminatingPod enter", "pod", klog.KObj(pod), "podUID", pod.UID) - defer klog.V(4).InfoS("syncTerminatingPod exit", "pod", klog.KObj(pod), "podUID", pod.UID) - - // when we receive a runtime only pod (runningPod != nil) we don't need to update the status - // manager or refresh the status of the cache, because a successful killPod will ensure we do - // not get invoked again - if runningPod != nil { - // we kill the pod with the specified grace period since this is a termination - if gracePeriod != nil { - klog.V(4).InfoS("Pod terminating with grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "gracePeriod", *gracePeriod) - } else { - klog.V(4).InfoS("Pod terminating with grace period", "pod", klog.KObj(pod), "podUID", pod.UID, "gracePeriod", nil) - } - if err := kl.killPod(ctx, pod, *runningPod, gracePeriod); err != nil { - kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) - // there was an error killing the pod, so we return that error directly - utilruntime.HandleError(err) - return err - } - klog.V(4).InfoS("Pod termination stopped all running orphan containers", "pod", klog.KObj(pod), "podUID", pod.UID) - return nil - } + klog.V(4).InfoS("SyncTerminatingPod enter", "pod", klog.KObj(pod), "podUID", pod.UID) + defer klog.V(4).InfoS("SyncTerminatingPod exit", "pod", klog.KObj(pod), "podUID", pod.UID) apiPodStatus := kl.generateAPIPodStatus(pod, podStatus) if podStatusFn != nil { @@ -1980,13 +1969,47 @@ func (kl *Kubelet) syncTerminatingPod(_ context.Context, pod *v1.Pod, podStatus return nil } -// syncTerminatedPod cleans up a pod that has terminated (has no running containers). +// SyncTerminatingRuntimePod is expected to terminate running containers in a pod that we have no +// configuration for. Once this method returns without error, any remaining local state can be safely +// cleaned up by background processes in each subsystem. Unlike syncTerminatingPod, we lack +// knowledge of the full pod spec and so cannot perform lifecycle related operations, only ensure +// that the remnant of the running pod is terminated and allow garbage collection to proceed. We do +// not update the status of the pod because with the source of configuration removed, we have no +// place to send that status. +func (kl *Kubelet) SyncTerminatingRuntimePod(_ context.Context, runningPod *kubecontainer.Pod) error { + // TODO(#113606): connect this with the incoming context parameter, which comes from the pod worker. + // Currently, using that context causes test failures. + ctx := context.Background() + pod := runningPod.ToAPIPod() + klog.V(4).InfoS("SyncTerminatingRuntimePod enter", "pod", klog.KObj(pod), "podUID", pod.UID) + defer klog.V(4).InfoS("SyncTerminatingRuntimePod exit", "pod", klog.KObj(pod), "podUID", pod.UID) + + // we kill the pod directly since we have lost all other information about the pod. 
+ klog.V(4).InfoS("Orphaned running pod terminating without grace period", "pod", klog.KObj(pod), "podUID", pod.UID) + // TODO: this should probably be zero, to bypass any waiting (needs fixes in container runtime) + gracePeriod := int64(1) + if err := kl.killPod(ctx, pod, *runningPod, &gracePeriod); err != nil { + kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) + // there was an error killing the pod, so we return that error directly + utilruntime.HandleError(err) + return err + } + klog.V(4).InfoS("Pod termination stopped all running orphaned containers", "pod", klog.KObj(pod), "podUID", pod.UID) + return nil +} + +// SyncTerminatedPod cleans up a pod that has terminated (has no running containers). // The invocations in this call are expected to tear down what PodResourcesAreReclaimed checks (which -// gates pod deletion). When this method exits the pod is expected to be ready for cleanup. -// TODO: make this method take a context and exit early -func (kl *Kubelet) syncTerminatedPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error { - klog.V(4).InfoS("syncTerminatedPod enter", "pod", klog.KObj(pod), "podUID", pod.UID) - defer klog.V(4).InfoS("syncTerminatedPod exit", "pod", klog.KObj(pod), "podUID", pod.UID) +// gates pod deletion). When this method exits the pod is expected to be ready for cleanup. This method +// reduces the latency of pod cleanup but is not guaranteed to get called in all scenarios. +// +// Because the kubelet has no local store of information, all actions in this method that modify +// on-disk state must be reentrant and be garbage collected by HandlePodCleanups or a separate loop. +// This typically occurs when a pod is force deleted from configuration (local disk or API) and the +// kubelet restarts in the middle of the action. +func (kl *Kubelet) SyncTerminatedPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error { + klog.V(4).InfoS("SyncTerminatedPod enter", "pod", klog.KObj(pod), "podUID", pod.UID) + defer klog.V(4).InfoS("SyncTerminatedPod exit", "pod", klog.KObj(pod), "podUID", pod.UID) // generate the final status of the pod // TODO: should we simply fold this into TerminatePod? 
that would give a single pod update @@ -2323,9 +2346,9 @@ func (kl *Kubelet) syncLoopIteration(ctx context.Context, configCh <-chan kubety } duration := time.Since(start) if duration > housekeepingWarningDuration { - klog.ErrorS(fmt.Errorf("housekeeping took too long"), "Housekeeping took longer than 15s", "seconds", duration.Seconds()) + klog.ErrorS(fmt.Errorf("housekeeping took too long"), "Housekeeping took longer than expected", "expected", housekeepingWarningDuration, "actual", duration.Round(time.Millisecond)) } - klog.V(4).InfoS("SyncLoop (housekeeping) end") + klog.V(4).InfoS("SyncLoop (housekeeping) end", "duration", duration.Round(time.Millisecond)) } } return true diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go index 335af666f5c..77b0c8110be 100644 --- a/pkg/kubelet/kubelet_pods.go +++ b/pkg/kubelet/kubelet_pods.go @@ -54,6 +54,7 @@ import ( remotecommandserver "k8s.io/kubernetes/pkg/kubelet/cri/streaming/remotecommand" "k8s.io/kubernetes/pkg/kubelet/envvars" "k8s.io/kubernetes/pkg/kubelet/images" + "k8s.io/kubernetes/pkg/kubelet/metrics" "k8s.io/kubernetes/pkg/kubelet/status" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/kubelet/util" @@ -1057,7 +1058,7 @@ func (kl *Kubelet) deleteOrphanedMirrorPods() { if err != nil { klog.ErrorS(err, "Encountered error when deleting mirror pod", "podName", podFullname) } else { - klog.V(3).InfoS("Deleted pod", "podName", podFullname) + klog.V(3).InfoS("Deleted mirror pod", "podName", podFullname) } } } @@ -1066,9 +1067,16 @@ func (kl *Kubelet) deleteOrphanedMirrorPods() { // HandlePodCleanups performs a series of cleanup work, including terminating // pod workers, killing unwanted pods, and removing orphaned volumes/pod // directories. No config changes are sent to pod workers while this method -// is executing which means no new pods can appear. -// NOTE: This function is executed by the main sync loop, so it -// should not contain any blocking calls. +// is executing which means no new pods can appear. After this method completes +// the desired state of the kubelet should be reconciled with the actual state +// in the pod worker and other pod-related components. +// +// This function is executed by the main sync loop, so it must execute quickly +// and all nested calls should be asynchronous. Any slow reconciliation actions +// should be performed by other components (like the volume manager). The duration +// of this call is the minimum latency for static pods to be restarted if they +// are updated with a fixed UID (most should use a dynamic UID), and no config +// updates are delivered to the pod workers while this method is running. func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error { // The kubelet lacks checkpointing, so we need to introspect the set of pods // in the cgroup tree prior to inspecting the set of pods in our pod manager. 
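The revised housekeeping logging keys the warning off housekeepingWarningDuration and reports both the expected and the actual duration rounded to milliseconds. A small, self-contained sketch of that timing-and-warning pattern, assuming a stand-in threshold and workload (warningDuration and timeHousekeeping are illustrative names, not from the Kubernetes codebase):

package main

import (
	"fmt"
	"time"
)

// warningDuration plays the role of housekeepingWarningDuration: the point at
// which a housekeeping pass is considered slow enough to warn about.
const warningDuration = 1 * time.Second

// timeHousekeeping runs fn and logs how long it took, warning with both the
// expected and the actual duration when the threshold is exceeded.
func timeHousekeeping(fn func()) {
	start := time.Now()
	fn()
	duration := time.Since(start)
	if duration > warningDuration {
		fmt.Printf("housekeeping took longer than expected: expected=%s actual=%s\n",
			warningDuration, duration.Round(time.Millisecond))
	}
	fmt.Printf("housekeeping end: duration=%s\n", duration.Round(time.Millisecond))
}

func main() {
	timeHousekeeping(func() { time.Sleep(10 * time.Millisecond) })
}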
@@ -1087,6 +1095,15 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error { } allPods, mirrorPods := kl.podManager.GetPodsAndMirrorPods() + activePods := kl.filterOutInactivePods(allPods) + allRegularPods, allStaticPods := splitPodsByStatic(allPods) + activeRegularPods, activeStaticPods := splitPodsByStatic(activePods) + metrics.DesiredPodCount.WithLabelValues("").Set(float64(len(allRegularPods))) + metrics.DesiredPodCount.WithLabelValues("true").Set(float64(len(allStaticPods))) + metrics.ActivePodCount.WithLabelValues("").Set(float64(len(activeRegularPods))) + metrics.ActivePodCount.WithLabelValues("true").Set(float64(len(activeStaticPods))) + metrics.MirrorPodCount.Set(float64(len(mirrorPods))) + // Pod phase progresses monotonically. Once a pod has reached a final state, // it should never leave regardless of the restart policy. The statuses // of such pods should not be changed, and there is no need to sync them. @@ -1102,6 +1119,10 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error { klog.V(3).InfoS("Clean up pod workers for terminated pods") workingPods := kl.podWorkers.SyncKnownPods(allPods) + // Reconcile: At this point the pod workers have been pruned to the set of + // desired pods. Pods that must be restarted due to UID reuse, or leftover + // pods from previous runs, are not known to the pod worker. + allPodsByUID := make(map[types.UID]*v1.Pod) for _, pod := range allPods { allPodsByUID[pod.UID] = pod @@ -1112,70 +1133,45 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error { // that have already been removed from config. Pods that are terminating // will be added to possiblyRunningPods, to prevent overly aggressive // cleanup of pod cgroups. + stringIfTrue := func(t bool) string { + if t { + return "true" + } + return "" + } runningPods := make(map[types.UID]sets.Empty) possiblyRunningPods := make(map[types.UID]sets.Empty) - restartablePods := make(map[types.UID]sets.Empty) for uid, sync := range workingPods { - switch sync { + switch sync.State { case SyncPod: runningPods[uid] = struct{}{} possiblyRunningPods[uid] = struct{}{} case TerminatingPod: possiblyRunningPods[uid] = struct{}{} - case TerminatedAndRecreatedPod: - restartablePods[uid] = struct{}{} + default: } } + // Retrieve the list of running containers from the runtime to perform cleanup. + // We need the latest state to avoid delaying restarts of static pods that reuse + // a UID. + if err := kl.runtimeCache.ForceUpdateIfOlder(ctx, kl.clock.Now()); err != nil { + klog.ErrorS(err, "Error listing containers") + return err + } + runningRuntimePods, err := kl.runtimeCache.GetPods(ctx) + if err != nil { + klog.ErrorS(err, "Error listing containers") + return err + } + // Stop probing pods that are not running klog.V(3).InfoS("Clean up probes for terminated pods") kl.probeManager.CleanupPods(possiblyRunningPods) - // Terminate any pods that are observed in the runtime but not - // present in the list of known running pods from config. - runningRuntimePods, err := kl.runtimeCache.GetPods(ctx) - if err != nil { - klog.ErrorS(err, "Error listing containers") - return err - } - for _, runningPod := range runningRuntimePods { - switch workerState, ok := workingPods[runningPod.ID]; { - case ok && workerState == SyncPod, ok && workerState == TerminatingPod: - // if the pod worker is already in charge of this pod, we don't need to do anything - continue - default: - // If the pod isn't in the set that should be running and isn't already terminating, terminate - // now. 
This termination is aggressive because all known pods should already be in a known state - // (i.e. a removed static pod should already be terminating), so these are pods that were - // orphaned due to kubelet restart or bugs. Since housekeeping blocks other config changes, we - // know that another pod wasn't started in the background so we are safe to terminate the - // unknown pods. - if _, ok := allPodsByUID[runningPod.ID]; !ok { - klog.V(3).InfoS("Clean up orphaned pod containers", "podUID", runningPod.ID) - one := int64(1) - kl.podWorkers.UpdatePod(UpdatePodOptions{ - UpdateType: kubetypes.SyncPodKill, - RunningPod: runningPod, - KillPodOptions: &KillPodOptions{ - PodTerminationGracePeriodSecondsOverride: &one, - }, - }) - } - } - } - // Remove orphaned pod statuses not in the total list of known config pods klog.V(3).InfoS("Clean up orphaned pod statuses") kl.removeOrphanedPodStatuses(allPods, mirrorPods) - // Note that we just killed the unwanted pods. This may not have reflected - // in the cache. We need to bypass the cache to get the latest set of - // running pods to clean up the volumes. - // TODO: Evaluate the performance impact of bypassing the runtime cache. - runningRuntimePods, err = kl.containerRuntime.GetPods(ctx, false) - if err != nil { - klog.ErrorS(err, "Error listing containers") - return err - } // Remove orphaned pod user namespace allocations (if any). klog.V(3).InfoS("Clean up orphaned pod user namespace allocations") @@ -1204,6 +1200,102 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error { klog.V(3).InfoS("Clean up orphaned mirror pods") kl.deleteOrphanedMirrorPods() + // At this point, the pod worker is aware of which pods are not desired (SyncKnownPods). + // We now look through the set of active pods for those that the pod worker is not aware of + // and deliver an update. The most common reason a pod is not known is because the pod was + // deleted and recreated with the same UID while the pod worker was driving its lifecycle (very + // very rare for API pods, common for static pods with fixed UIDs). Containers that may still + // be running from a previous execution must be reconciled by the pod worker's sync method. + // We must use active pods because that is the set of admitted pods (podManager includes pods + // that will never be run, and statusManager tracks already rejected pods). 
+ var restartCount, restartCountStatic int + for _, desiredPod := range activePods { + if _, knownPod := workingPods[desiredPod.UID]; knownPod { + continue + } + + klog.V(3).InfoS("Pod will be restarted because it is in the desired set and not known to the pod workers (likely due to UID reuse)", "podUID", desiredPod.UID) + isStatic := kubetypes.IsStaticPod(desiredPod) + mirrorPod, _ := kl.podManager.GetMirrorPodByPod(desiredPod) + kl.podWorkers.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + Pod: desiredPod, + MirrorPod: mirrorPod, + }) + + // the desired pod is now known as well + workingPods[desiredPod.UID] = PodWorkerSync{State: SyncPod, HasConfig: true, Static: isStatic} + if isStatic { + // restartable static pods are the normal case + restartCountStatic++ + } else { + // almost certainly means shenanigans, as API pods should never have the same UID after being deleted and recreated + // unless there is a major API violation + restartCount++ + } + } + metrics.RestartedPodTotal.WithLabelValues("true").Add(float64(restartCountStatic)) + metrics.RestartedPodTotal.WithLabelValues("").Add(float64(restartCount)) + + // Finally, terminate any pods that are observed in the runtime but not present in the list of + // known running pods from config. If we do terminate running runtime pods that will happen + // asynchronously in the background and those will be processed in the next invocation of + // HandlePodCleanups. + var orphanCount int + for _, runningPod := range runningRuntimePods { + // If there are orphaned pod resources in CRI that are unknown to the pod worker, terminate them + // now. Since housekeeping is exclusive to other pod worker updates, we know that no pods have + // been added to the pod worker in the meantime. Note that pods that are not visible in the runtime + // but which were previously known are terminated by SyncKnownPods(). + _, knownPod := workingPods[runningPod.ID] + if !knownPod { + one := int64(1) + killPodOptions := &KillPodOptions{ + PodTerminationGracePeriodSecondsOverride: &one, + } + klog.V(2).InfoS("Clean up containers for orphaned pod we had not seen before", "podUID", runningPod.ID, "killPodOptions", killPodOptions) + kl.podWorkers.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodKill, + RunningPod: runningPod, + KillPodOptions: killPodOptions, + }) + + // the running pod is now known as well + workingPods[runningPod.ID] = PodWorkerSync{State: TerminatingPod, Orphan: true} + orphanCount++ + } + } + metrics.OrphanedRuntimePodTotal.Add(float64(orphanCount)) + + // Now that we have recorded any terminating pods, and added new pods that should be running, + // record a summary here. Not all possible combinations of PodWorkerSync values are valid. 
+ counts := make(map[PodWorkerSync]int) + for _, sync := range workingPods { + counts[sync]++ + } + for validSync, configState := range map[PodWorkerSync]string{ + {HasConfig: true, Static: true}: "desired", + {HasConfig: true, Static: false}: "desired", + {Orphan: true, HasConfig: true, Static: true}: "orphan", + {Orphan: true, HasConfig: true, Static: false}: "orphan", + {Orphan: true, HasConfig: false}: "runtime_only", + } { + for _, state := range []PodWorkerState{SyncPod, TerminatingPod, TerminatedPod} { + validSync.State = state + count := counts[validSync] + delete(counts, validSync) + staticString := stringIfTrue(validSync.Static) + if !validSync.HasConfig { + staticString = "unknown" + } + metrics.WorkingPodCount.WithLabelValues(state.String(), configState, staticString).Set(float64(count)) + } + } + if len(counts) > 0 { + // in case a combination is lost + klog.V(3).InfoS("Programmer error, did not report a kubelet_working_pods metric for a value returned by SyncKnownPods", "counts", counts) + } + // Remove any cgroups in the hierarchy for pods that are definitely no longer // running (not in the container runtime). if kl.cgroupsPerQOS { @@ -1212,33 +1304,31 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error { kl.cleanupOrphanedPodCgroups(pcm, cgroupPods, possiblyRunningPods) } + // Cleanup any backoff entries. kl.backOff.GC() - // If two pods with the same UID are observed in rapid succession, we need to - // resynchronize the pod worker after the first pod completes and decide whether - // to restart the pod. This happens last to avoid confusing the desired state - // in other components and to increase the likelihood transient OS failures during - // container start are mitigated. In general only static pods will ever reuse UIDs - // since the apiserver uses randomly generated UUIDv4 UIDs with a very low - // probability of collision. - for uid := range restartablePods { - pod, ok := allPodsByUID[uid] - if !ok { - continue - } - if kl.isAdmittedPodTerminal(pod) { - klog.V(3).InfoS("Pod is restartable after termination due to UID reuse, but pod phase is terminal", "pod", klog.KObj(pod), "podUID", pod.UID) - continue - } - start := kl.clock.Now() - mirrorPod, _ := kl.podManager.GetMirrorPodByPod(pod) - klog.V(3).InfoS("Pod is restartable after termination due to UID reuse", "pod", klog.KObj(pod), "podUID", pod.UID) - kl.dispatchWork(pod, kubetypes.SyncPodCreate, mirrorPod, start) - } - return nil } +// splitPodsByStatic separates a list of desired pods from the pod manager into +// regular or static pods. Mirror pods are not valid config sources (a mirror pod +// being created cannot cause the Kubelet to start running a static pod) and are +// excluded. +func splitPodsByStatic(pods []*v1.Pod) (regular, static []*v1.Pod) { + regular, static = make([]*v1.Pod, 0, len(pods)), make([]*v1.Pod, 0, len(pods)) + for _, pod := range pods { + if kubetypes.IsMirrorPod(pod) { + continue + } + if kubetypes.IsStaticPod(pod) { + static = append(static, pod) + } else { + regular = append(regular, pod) + } + } + return regular, static +} + // validateContainerLogStatus returns the container ID for the desired container to retrieve logs for, based on the state // of the container. The previous flag will only return the logs for the last terminated container, otherwise, the current // running container is preferred over a previous termination. 
If info about the container is not available then a specific diff --git a/pkg/kubelet/kubelet_pods_test.go b/pkg/kubelet/kubelet_pods_test.go index 5752737fc94..525584aee11 100644 --- a/pkg/kubelet/kubelet_pods_test.go +++ b/pkg/kubelet/kubelet_pods_test.go @@ -25,9 +25,11 @@ import ( "path/filepath" "reflect" "sort" + "strings" "testing" "time" + "github.com/google/go-cmp/cmp" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" v1 "k8s.io/api/core/v1" @@ -39,16 +41,11 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/diff" + utilfeature "k8s.io/apiserver/pkg/util/feature" core "k8s.io/client-go/testing" "k8s.io/client-go/tools/record" - netutils "k8s.io/utils/net" - - // TODO: remove this import if - // api.Registry.GroupOrDie(v1.GroupName).GroupVersions[0].String() is changed - // to "v1"? - - utilfeature "k8s.io/apiserver/pkg/util/feature" featuregatetesting "k8s.io/component-base/featuregate/testing" + "k8s.io/component-base/metrics/testutil" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" _ "k8s.io/kubernetes/pkg/apis/core/install" "k8s.io/kubernetes/pkg/features" @@ -56,9 +53,11 @@ import ( containertest "k8s.io/kubernetes/pkg/kubelet/container/testing" "k8s.io/kubernetes/pkg/kubelet/cri/streaming/portforward" "k8s.io/kubernetes/pkg/kubelet/cri/streaming/remotecommand" + "k8s.io/kubernetes/pkg/kubelet/metrics" "k8s.io/kubernetes/pkg/kubelet/prober/results" "k8s.io/kubernetes/pkg/kubelet/status" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" + netutils "k8s.io/utils/net" ) func TestNodeHostsFileContent(t *testing.T) { @@ -3834,6 +3833,11 @@ func TestSortPodIPs(t *testing.T) { } } +// func init() { +// klog.InitFlags(flag.CommandLine) +// flag.CommandLine.Lookup("v").Value.Set("5") +// } + func TestConvertToAPIContainerStatusesDataRace(t *testing.T) { pod := podWithUIDNameNs("12345", "test-pod", "test-namespace") @@ -4079,3 +4083,1055 @@ func TestConvertToAPIContainerStatusesForResources(t *testing.T) { assert.Equal(t, tc.Expected, cStatuses) } } + +func TestKubelet_HandlePodCleanups(t *testing.T) { + one := int64(1) + two := int64(2) + deleted := metav1.NewTime(time.Unix(2, 0).UTC()) + type rejectedPod struct { + uid types.UID + reason string + message string + } + simplePod := func() *v1.Pod { + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "pod1", Namespace: "ns1", UID: types.UID("1")}, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container-1"}, + }, + }, + } + } + withPhase := func(pod *v1.Pod, phase v1.PodPhase) *v1.Pod { + pod.Status.Phase = phase + return pod + } + staticPod := func() *v1.Pod { + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Namespace: "ns1", + UID: types.UID("1"), + Annotations: map[string]string{ + kubetypes.ConfigSourceAnnotationKey: kubetypes.FileSource, + }, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container-1"}, + }, + }, + } + } + runtimePod := func(pod *v1.Pod) *kubecontainer.Pod { + runningPod := &kubecontainer.Pod{ + ID: types.UID(pod.UID), + Name: pod.Name, + Namespace: pod.Namespace, + Containers: []*kubecontainer.Container{ + {Name: "container-1", ID: kubecontainer.ContainerID{Type: "test", ID: "c1"}}, + }, + } + for i, container := range pod.Spec.Containers { + runningPod.Containers = append(runningPod.Containers, &kubecontainer.Container{ + Name: container.Name, + ID: kubecontainer.ContainerID{Type: "test", ID: fmt.Sprintf("c%d", i)}, + }) + } + return runningPod + } + mirrorPod := func(pod *v1.Pod, 
nodeName string, nodeUID types.UID) *v1.Pod { + copied := pod.DeepCopy() + if copied.Annotations == nil { + copied.Annotations = make(map[string]string) + } + copied.Annotations[kubetypes.ConfigMirrorAnnotationKey] = pod.Annotations[kubetypes.ConfigHashAnnotationKey] + isTrue := true + copied.OwnerReferences = append(copied.OwnerReferences, metav1.OwnerReference{ + APIVersion: v1.SchemeGroupVersion.String(), + Kind: "Node", + Name: nodeName, + UID: nodeUID, + Controller: &isTrue, + }) + return copied + } + + tests := []struct { + name string + pods []*v1.Pod + runtimePods []*containertest.FakePod + rejectedPods []rejectedPod + terminatingErr error + prepareWorker func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) + wantWorker func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) + wantWorkerAfterRetry func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) + wantErr bool + expectMetrics map[string]string + expectMetricsAfterRetry map[string]string + }{ + { + name: "missing pod is requested for termination with short grace period", + wantErr: false, + runtimePods: []*containertest.FakePod{ + { + Pod: runtimePod(staticPod()), + }, + }, + wantWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + drainAllWorkers(w) + uid := types.UID("1") + // we expect runtime pods to be cleared from the status history as soon as they + // reach completion + if len(w.podSyncStatuses) != 0 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + r, ok := records[uid] + if !ok || len(r) != 1 || r[0].updateType != kubetypes.SyncPodKill || r[0].terminated || r[0].runningPod == nil || r[0].gracePeriod != nil { + t.Fatalf("unexpected pod sync records: %#v", r) + } + }, + expectMetrics: map[string]string{ + metrics.OrphanedRuntimePodTotal.FQName(): `# HELP kubelet_orphaned_runtime_pods_total [ALPHA] Number of pods that have been detected in the container runtime without being already known to the pod worker. This typically indicates the kubelet was restarted while a pod was force deleted in the API or in the local configuration, which is unusual. + # TYPE kubelet_orphaned_runtime_pods_total counter + kubelet_orphaned_runtime_pods_total 1 + `, + metrics.WorkingPodCount.FQName(): `# HELP kubelet_working_pods [ALPHA] Number of pods the kubelet is actually running, broken down by lifecycle phase, whether the pod is desired, orphaned, or runtime only (also orphaned), and whether the pod is static. An orphaned pod has been removed from local configuration or force deleted in the API and consumes resources that are not otherwise visible. 
+ # TYPE kubelet_working_pods gauge + kubelet_working_pods{config="desired",lifecycle="sync",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="sync",static="true"} 0 + kubelet_working_pods{config="desired",lifecycle="terminated",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="terminated",static="true"} 0 + kubelet_working_pods{config="desired",lifecycle="terminating",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="terminating",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="sync",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="sync",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="terminated",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="terminated",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="terminating",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="terminating",static="true"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="sync",static="unknown"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="terminated",static="unknown"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="terminating",static="unknown"} 1 + `, + }, + }, + { + name: "terminating pod that errored and is not in config is notified by the cleanup", + wantErr: false, + runtimePods: []*containertest.FakePod{ + { + Pod: runtimePod(simplePod()), + }, + }, + terminatingErr: errors.New("unable to terminate"), + prepareWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + // send a create + pod := simplePod() + w.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + StartTime: time.Unix(1, 0).UTC(), + Pod: pod, + }) + drainAllWorkers(w) + + // send a delete update + two := int64(2) + deleted := metav1.NewTime(time.Unix(2, 0).UTC()) + updatedPod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Namespace: "ns1", + UID: types.UID("1"), + DeletionGracePeriodSeconds: &two, + DeletionTimestamp: &deleted, + }, + Spec: v1.PodSpec{ + TerminationGracePeriodSeconds: &two, + Containers: []v1.Container{ + {Name: "container-1"}, + }, + }, + } + w.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodKill, + StartTime: time.Unix(3, 0).UTC(), + Pod: updatedPod, + }) + drainAllWorkers(w) + r, ok := records[updatedPod.UID] + if !ok || len(r) != 2 || r[1].gracePeriod == nil || *r[1].gracePeriod != 2 { + t.Fatalf("unexpected records: %#v", records) + } + // pod worker thinks pod1 exists, but the kubelet will not have it in the pod manager + }, + wantWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + uid := types.UID("1") + if len(w.podSyncStatuses) != 1 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + s, ok := w.podSyncStatuses[uid] + if !ok || !s.IsTerminationRequested() || !s.IsTerminationStarted() || s.IsFinished() || s.IsWorking() || !s.IsDeleted() { + t.Fatalf("unexpected requested pod termination: %#v", s) + } + // expect we get a pod sync record for kill that should have the same grace period as before (2), but no + // running pod because the SyncKnownPods method killed it + if actual, expected := records[uid], []syncPodRecord{ + {name: "pod1", updateType: kubetypes.SyncPodCreate}, + {name: "pod1", updateType: kubetypes.SyncPodKill, gracePeriod: &two}, + {name: "pod1", updateType: kubetypes.SyncPodKill, gracePeriod: &two}, + }; !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, 
cmp.AllowUnexported(syncPodRecord{}))) + } + }, + expectMetrics: map[string]string{ + metrics.DesiredPodCount.FQName(): `# HELP kubelet_desired_pods [ALPHA] The number of pods the kubelet is being instructed to run. static is true if the pod is not from the apiserver. + # TYPE kubelet_desired_pods gauge + kubelet_desired_pods{static=""} 0 + kubelet_desired_pods{static="true"} 0 + `, + metrics.ActivePodCount.FQName(): `# HELP kubelet_active_pods [ALPHA] The number of pods the kubelet considers active and which are being considered when admitting new pods. static is true if the pod is not from the apiserver. + # TYPE kubelet_active_pods gauge + kubelet_active_pods{static=""} 0 + kubelet_active_pods{static="true"} 0 + `, + metrics.OrphanedRuntimePodTotal.FQName(): `# HELP kubelet_orphaned_runtime_pods_total [ALPHA] Number of pods that have been detected in the container runtime without being already known to the pod worker. This typically indicates the kubelet was restarted while a pod was force deleted in the API or in the local configuration, which is unusual. + # TYPE kubelet_orphaned_runtime_pods_total counter + kubelet_orphaned_runtime_pods_total 0 + `, + metrics.WorkingPodCount.FQName(): `# HELP kubelet_working_pods [ALPHA] Number of pods the kubelet is actually running, broken down by lifecycle phase, whether the pod is desired, orphaned, or runtime only (also orphaned), and whether the pod is static. An orphaned pod has been removed from local configuration or force deleted in the API and consumes resources that are not otherwise visible. + # TYPE kubelet_working_pods gauge + kubelet_working_pods{config="desired",lifecycle="sync",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="sync",static="true"} 0 + kubelet_working_pods{config="desired",lifecycle="terminated",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="terminated",static="true"} 0 + kubelet_working_pods{config="desired",lifecycle="terminating",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="terminating",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="sync",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="sync",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="terminated",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="terminated",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="terminating",static=""} 1 + kubelet_working_pods{config="orphan",lifecycle="terminating",static="true"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="sync",static="unknown"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="terminated",static="unknown"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="terminating",static="unknown"} 0 + `, + }, + wantWorkerAfterRetry: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + uid := types.UID("1") + if len(w.podSyncStatuses) != 1 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + s, ok := w.podSyncStatuses[uid] + if !ok || !s.IsTerminationRequested() || !s.IsTerminationStarted() || !s.IsFinished() || s.IsWorking() || !s.IsDeleted() { + t.Fatalf("unexpected requested pod termination: %#v", s) + } + // expect we get a pod sync record for kill that should have the same grace period as before (2), but no + // running pod because the SyncKnownPods method killed it + if actual, expected := records[uid], []syncPodRecord{ + {name: "pod1", updateType: kubetypes.SyncPodCreate}, + {name: "pod1", updateType: 
kubetypes.SyncPodKill, gracePeriod: &two}, + {name: "pod1", updateType: kubetypes.SyncPodKill, gracePeriod: &two}, + // after the second attempt + {name: "pod1", updateType: kubetypes.SyncPodKill, gracePeriod: &two}, + // from termination + {name: "pod1", terminated: true}, + }; !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + }, + }, + { + name: "terminating pod that errored and is not in config or worker is force killed by the cleanup", + wantErr: false, + runtimePods: []*containertest.FakePod{ + { + Pod: runtimePod(simplePod()), + }, + }, + terminatingErr: errors.New("unable to terminate"), + wantWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + uid := types.UID("1") + if len(w.podSyncStatuses) != 1 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + s, ok := w.podSyncStatuses[uid] + if !ok || !s.IsTerminationRequested() || !s.IsTerminationStarted() || s.IsFinished() || s.IsWorking() || !s.IsDeleted() { + t.Fatalf("unexpected requested pod termination: %#v", s) + } + + // ensure that we recorded the appropriate state for replays + expectedRunningPod := runtimePod(simplePod()) + if actual, expected := s.activeUpdate, (&UpdatePodOptions{ + RunningPod: expectedRunningPod, + KillPodOptions: &KillPodOptions{PodTerminationGracePeriodSecondsOverride: &one}, + }); !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod activeUpdate: %s", cmp.Diff(expected, actual)) + } + + // expect that a pod the pod worker does not recognize is force killed with grace period 1 + if actual, expected := records[uid], []syncPodRecord{ + {name: "pod1", updateType: kubetypes.SyncPodKill, runningPod: expectedRunningPod}, + }; !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + }, + wantWorkerAfterRetry: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + uid := types.UID("1") + if len(w.podSyncStatuses) != 0 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + + // expect that a pod the pod worker does not recognize is force killed with grace period 1 + expectedRunningPod := runtimePod(simplePod()) + if actual, expected := records[uid], []syncPodRecord{ + // first attempt, did not succeed + {name: "pod1", updateType: kubetypes.SyncPodKill, runningPod: expectedRunningPod}, + // second attempt, should succeed + {name: "pod1", updateType: kubetypes.SyncPodKill, runningPod: expectedRunningPod}, + // because this is a runtime pod, we don't have enough info to invoke syncTerminatedPod and so + // we exit after the retry succeeds + }; !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + }, + }, + { + name: "pod is added to worker by sync method", + wantErr: false, + pods: []*v1.Pod{ + simplePod(), + }, + wantWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + uid := types.UID("1") + if len(w.podSyncStatuses) != 1 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + s, ok := w.podSyncStatuses[uid] + if !ok || s.IsTerminationRequested() || s.IsTerminationStarted() || s.IsFinished() || s.IsWorking() || s.IsDeleted() { + t.Fatalf("unexpected requested pod termination: %#v", s) + } + + // pod was synced once + if actual, expected := records[uid], 
[]syncPodRecord{ + {name: "pod1", updateType: kubetypes.SyncPodCreate}, + }; !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + }, + expectMetrics: map[string]string{ + metrics.DesiredPodCount.FQName(): `# HELP kubelet_desired_pods [ALPHA] The number of pods the kubelet is being instructed to run. static is true if the pod is not from the apiserver. + # TYPE kubelet_desired_pods gauge + kubelet_desired_pods{static=""} 1 + kubelet_desired_pods{static="true"} 0 + `, + metrics.ActivePodCount.FQName(): `# HELP kubelet_active_pods [ALPHA] The number of pods the kubelet considers active and which are being considered when admitting new pods. static is true if the pod is not from the apiserver. + # TYPE kubelet_active_pods gauge + kubelet_active_pods{static=""} 1 + kubelet_active_pods{static="true"} 0 + `, + metrics.OrphanedRuntimePodTotal.FQName(): `# HELP kubelet_orphaned_runtime_pods_total [ALPHA] Number of pods that have been detected in the container runtime without being already known to the pod worker. This typically indicates the kubelet was restarted while a pod was force deleted in the API or in the local configuration, which is unusual. + # TYPE kubelet_orphaned_runtime_pods_total counter + kubelet_orphaned_runtime_pods_total 0 + `, + // Note that this test simulates a net-new pod being discovered during HandlePodCleanups that was not + // delivered to the pod worker via HandlePodAdditions - there is no *known* scenario that can happen, but + // we want to capture it in the metric. The more likely scenario is that a static pod with a predefined + // UID is updated, which causes pod config to deliver DELETE -> ADD while the old pod is still shutting + // down and the pod worker to ignore the ADD. The HandlePodCleanups method then is responsible for syncing + // that pod to the pod worker so that it restarts. + metrics.RestartedPodTotal.FQName(): `# HELP kubelet_restarted_pods_total [ALPHA] Number of pods that have been restarted because they were deleted and recreated with the same UID while the kubelet was watching them (common for static pods, extremely uncommon for API pods) + # TYPE kubelet_restarted_pods_total counter + kubelet_restarted_pods_total{static=""} 1 + kubelet_restarted_pods_total{static="true"} 0 + `, + metrics.WorkingPodCount.FQName(): `# HELP kubelet_working_pods [ALPHA] Number of pods the kubelet is actually running, broken down by lifecycle phase, whether the pod is desired, orphaned, or runtime only (also orphaned), and whether the pod is static. An orphaned pod has been removed from local configuration or force deleted in the API and consumes resources that are not otherwise visible. 
+ # TYPE kubelet_working_pods gauge + kubelet_working_pods{config="desired",lifecycle="sync",static=""} 1 + kubelet_working_pods{config="desired",lifecycle="sync",static="true"} 0 + kubelet_working_pods{config="desired",lifecycle="terminated",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="terminated",static="true"} 0 + kubelet_working_pods{config="desired",lifecycle="terminating",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="terminating",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="sync",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="sync",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="terminated",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="terminated",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="terminating",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="terminating",static="true"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="sync",static="unknown"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="terminated",static="unknown"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="terminating",static="unknown"} 0 + `, + }, + }, + { + name: "pod is not added to worker by sync method because it is in a terminal phase", + wantErr: false, + pods: []*v1.Pod{ + withPhase(simplePod(), v1.PodFailed), + }, + wantWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + uid := types.UID("1") + if len(w.podSyncStatuses) != 0 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + // no pod sync record was delivered + if actual, expected := records[uid], []syncPodRecord(nil); !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + }, + expectMetrics: map[string]string{ + metrics.DesiredPodCount.FQName(): `# HELP kubelet_desired_pods [ALPHA] The number of pods the kubelet is being instructed to run. static is true if the pod is not from the apiserver. + # TYPE kubelet_desired_pods gauge + kubelet_desired_pods{static=""} 1 + kubelet_desired_pods{static="true"} 0 + `, + metrics.ActivePodCount.FQName(): `# HELP kubelet_active_pods [ALPHA] The number of pods the kubelet considers active and which are being considered when admitting new pods. static is true if the pod is not from the apiserver. + # TYPE kubelet_active_pods gauge + kubelet_active_pods{static=""} 0 + kubelet_active_pods{static="true"} 0 + `, + metrics.OrphanedRuntimePodTotal.FQName(): `# HELP kubelet_orphaned_runtime_pods_total [ALPHA] Number of pods that have been detected in the container runtime without being already known to the pod worker. This typically indicates the kubelet was restarted while a pod was force deleted in the API or in the local configuration, which is unusual. + # TYPE kubelet_orphaned_runtime_pods_total counter + kubelet_orphaned_runtime_pods_total 0 + `, + // Note that this test simulates a net-new pod being discovered during HandlePodCleanups that was not + // delivered to the pod worker via HandlePodAdditions - there is no *known* scenario that can happen, but + // we want to capture it in the metric. The more likely scenario is that a static pod with a predefined + // UID is updated, which causes pod config to deliver DELETE -> ADD while the old pod is still shutting + // down and the pod worker to ignore the ADD. 
The HandlePodCleanups method then is responsible for syncing + // that pod to the pod worker so that it restarts. + metrics.RestartedPodTotal.FQName(): `# HELP kubelet_restarted_pods_total [ALPHA] Number of pods that have been restarted because they were deleted and recreated with the same UID while the kubelet was watching them (common for static pods, extremely uncommon for API pods) + # TYPE kubelet_restarted_pods_total counter + kubelet_restarted_pods_total{static=""} 0 + kubelet_restarted_pods_total{static="true"} 0 + `, + metrics.WorkingPodCount.FQName(): `# HELP kubelet_working_pods [ALPHA] Number of pods the kubelet is actually running, broken down by lifecycle phase, whether the pod is desired, orphaned, or runtime only (also orphaned), and whether the pod is static. An orphaned pod has been removed from local configuration or force deleted in the API and consumes resources that are not otherwise visible. + # TYPE kubelet_working_pods gauge + kubelet_working_pods{config="desired",lifecycle="sync",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="sync",static="true"} 0 + kubelet_working_pods{config="desired",lifecycle="terminated",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="terminated",static="true"} 0 + kubelet_working_pods{config="desired",lifecycle="terminating",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="terminating",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="sync",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="sync",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="terminated",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="terminated",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="terminating",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="terminating",static="true"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="sync",static="unknown"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="terminated",static="unknown"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="terminating",static="unknown"} 0 + `, + }, + }, + { + name: "pod is not added to worker by sync method because it has been rejected", + wantErr: false, + pods: []*v1.Pod{ + simplePod(), + }, + rejectedPods: []rejectedPod{ + {uid: "1", reason: "Test", message: "rejected"}, + }, + wantWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + uid := types.UID("1") + if len(w.podSyncStatuses) != 0 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + // no pod sync record was delivered + if actual, expected := records[uid], []syncPodRecord(nil); !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + }, + expectMetrics: map[string]string{ + metrics.DesiredPodCount.FQName(): `# HELP kubelet_desired_pods [ALPHA] The number of pods the kubelet is being instructed to run. static is true if the pod is not from the apiserver. + # TYPE kubelet_desired_pods gauge + kubelet_desired_pods{static=""} 1 + kubelet_desired_pods{static="true"} 0 + `, + metrics.ActivePodCount.FQName(): `# HELP kubelet_active_pods [ALPHA] The number of pods the kubelet considers active and which are being considered when admitting new pods. static is true if the pod is not from the apiserver. 
+ # TYPE kubelet_active_pods gauge + kubelet_active_pods{static=""} 0 + kubelet_active_pods{static="true"} 0 + `, + metrics.OrphanedRuntimePodTotal.FQName(): `# HELP kubelet_orphaned_runtime_pods_total [ALPHA] Number of pods that have been detected in the container runtime without being already known to the pod worker. This typically indicates the kubelet was restarted while a pod was force deleted in the API or in the local configuration, which is unusual. + # TYPE kubelet_orphaned_runtime_pods_total counter + kubelet_orphaned_runtime_pods_total 0 + `, + // Note that this test simulates a net-new pod being discovered during HandlePodCleanups that was not + // delivered to the pod worker via HandlePodAdditions - there is no *known* scenario that can happen, but + // we want to capture it in the metric. The more likely scenario is that a static pod with a predefined + // UID is updated, which causes pod config to deliver DELETE -> ADD while the old pod is still shutting + // down and the pod worker to ignore the ADD. The HandlePodCleanups method then is responsible for syncing + // that pod to the pod worker so that it restarts. + metrics.RestartedPodTotal.FQName(): `# HELP kubelet_restarted_pods_total [ALPHA] Number of pods that have been restarted because they were deleted and recreated with the same UID while the kubelet was watching them (common for static pods, extremely uncommon for API pods) + # TYPE kubelet_restarted_pods_total counter + kubelet_restarted_pods_total{static=""} 0 + kubelet_restarted_pods_total{static="true"} 0 + `, + metrics.WorkingPodCount.FQName(): `# HELP kubelet_working_pods [ALPHA] Number of pods the kubelet is actually running, broken down by lifecycle phase, whether the pod is desired, orphaned, or runtime only (also orphaned), and whether the pod is static. An orphaned pod has been removed from local configuration or force deleted in the API and consumes resources that are not otherwise visible. 
+ # TYPE kubelet_working_pods gauge + kubelet_working_pods{config="desired",lifecycle="sync",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="sync",static="true"} 0 + kubelet_working_pods{config="desired",lifecycle="terminated",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="terminated",static="true"} 0 + kubelet_working_pods{config="desired",lifecycle="terminating",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="terminating",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="sync",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="sync",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="terminated",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="terminated",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="terminating",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="terminating",static="true"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="sync",static="unknown"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="terminated",static="unknown"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="terminating",static="unknown"} 0 + `, + }, + }, + { + name: "terminating pod that is known to the config gets no update during pod cleanup", + wantErr: false, + pods: []*v1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Namespace: "ns1", + UID: types.UID("1"), + DeletionGracePeriodSeconds: &two, + DeletionTimestamp: &deleted, + }, + Spec: v1.PodSpec{ + TerminationGracePeriodSeconds: &two, + Containers: []v1.Container{ + {Name: "container-1"}, + }, + }, + }, + }, + runtimePods: []*containertest.FakePod{ + { + Pod: runtimePod(simplePod()), + }, + }, + terminatingErr: errors.New("unable to terminate"), + prepareWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + // send a create + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "pod1", Namespace: "ns1", UID: types.UID("1")}, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container-1"}, + }, + }, + } + w.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + StartTime: time.Unix(1, 0).UTC(), + Pod: pod, + }) + drainAllWorkers(w) + + // send a delete update + updatedPod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Namespace: "ns1", + UID: types.UID("1"), + DeletionGracePeriodSeconds: &two, + DeletionTimestamp: &deleted, + }, + Spec: v1.PodSpec{ + TerminationGracePeriodSeconds: &two, + Containers: []v1.Container{ + {Name: "container-1"}, + }, + }, + } + w.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodKill, + StartTime: time.Unix(3, 0).UTC(), + Pod: updatedPod, + }) + drainAllWorkers(w) + + // pod worker thinks pod1 is terminated and pod1 visible to config + if actual, expected := records[updatedPod.UID], []syncPodRecord{ + {name: "pod1", updateType: kubetypes.SyncPodCreate}, + {name: "pod1", updateType: kubetypes.SyncPodKill, gracePeriod: &two}, + }; !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + }, + wantWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + uid := types.UID("1") + if len(w.podSyncStatuses) != 1 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + s, ok := w.podSyncStatuses[uid] + if !ok || !s.IsTerminationRequested() || !s.IsTerminationStarted() || s.IsFinished() || s.IsWorking() || !s.IsDeleted() { + 
t.Fatalf("unexpected requested pod termination: %#v", s) + } + + // no pod sync record was delivered + if actual, expected := records[uid], []syncPodRecord{ + {name: "pod1", updateType: kubetypes.SyncPodCreate}, + {name: "pod1", updateType: kubetypes.SyncPodKill, gracePeriod: &two}, + }; !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + }, + }, + { + name: "pod that could not start and is not in config is force terminated during pod cleanup", + wantErr: false, + runtimePods: []*containertest.FakePod{ + { + Pod: runtimePod(simplePod()), + }, + }, + terminatingErr: errors.New("unable to terminate"), + prepareWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + // send a create of a static pod + pod := staticPod() + // block startup of the static pod due to full name collision + w.startedStaticPodsByFullname[kubecontainer.GetPodFullName(pod)] = types.UID("2") + + w.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + StartTime: time.Unix(1, 0).UTC(), + Pod: pod, + }) + drainAllWorkers(w) + + if _, ok := records[pod.UID]; ok { + t.Fatalf("unexpected records: %#v", records) + } + // pod worker is unaware of pod1 yet, and the kubelet will not have it in the pod manager + }, + wantWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + // the pod is not started and is cleaned, but the runtime state causes us to reenter + // and perform a direct termination (we never observed the pod as being started by + // us, and so it is safe to completely tear down) + uid := types.UID("1") + if len(w.podSyncStatuses) != 1 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + + s, ok := w.podSyncStatuses[uid] + if !ok || !s.IsTerminationRequested() || !s.IsTerminationStarted() || s.IsFinished() || s.IsWorking() || !s.IsDeleted() { + t.Errorf("unexpected requested pod termination: %#v", s) + } + + // ensure that we recorded the appropriate state for replays + expectedRunningPod := runtimePod(simplePod()) + if actual, expected := s.activeUpdate, (&UpdatePodOptions{ + RunningPod: expectedRunningPod, + KillPodOptions: &KillPodOptions{PodTerminationGracePeriodSecondsOverride: &one}, + }); !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod activeUpdate: %s", cmp.Diff(expected, actual)) + } + + // sync is never invoked + if actual, expected := records[uid], []syncPodRecord{ + {name: "pod1", updateType: kubetypes.SyncPodKill, runningPod: expectedRunningPod}, + // this pod is detected as an orphaned running pod and will exit + }; !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + }, + wantWorkerAfterRetry: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + uid := types.UID("1") + if len(w.podSyncStatuses) != 0 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + + // expect we get a pod sync record for kill that should have the default grace period + expectedRunningPod := runtimePod(simplePod()) + if actual, expected := records[uid], []syncPodRecord{ + // first attempt, syncTerminatingPod failed with an error + {name: "pod1", updateType: kubetypes.SyncPodKill, runningPod: expectedRunningPod}, + // second attempt + {name: "pod1", updateType: kubetypes.SyncPodKill, runningPod: expectedRunningPod}, + }; !reflect.DeepEqual(expected, actual) { + 
t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + }, + }, + { + name: "pod that could not start and is not in config is force terminated without runtime during pod cleanup", + wantErr: false, + terminatingErr: errors.New("unable to terminate"), + prepareWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + // send a create of a static pod + pod := staticPod() + // block startup of the static pod due to full name collision + w.startedStaticPodsByFullname[kubecontainer.GetPodFullName(pod)] = types.UID("2") + + w.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + StartTime: time.Unix(1, 0).UTC(), + Pod: pod, + }) + drainAllWorkers(w) + + if _, ok := records[pod.UID]; ok { + t.Fatalf("unexpected records: %#v", records) + } + // pod worker is unaware of pod1 yet, and the kubelet will not have it in the pod manager + }, + wantWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + uid := types.UID("1") + if len(w.podSyncStatuses) != 0 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + + // expect that no sync calls are made, since the pod doesn't ever start + if actual, expected := records[uid], []syncPodRecord(nil); !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + }, + }, + { + name: "pod that is terminating is recreated by config with the same UID", + wantErr: false, + pods: []*v1.Pod{ + func() *v1.Pod { + pod := staticPod() + pod.Annotations["version"] = "2" + return pod + }(), + }, + + runtimePods: []*containertest.FakePod{ + { + Pod: runtimePod(staticPod()), + }, + }, + terminatingErr: errors.New("unable to terminate"), + prepareWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + // send a create of a static pod + pod := staticPod() + + w.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + StartTime: time.Unix(1, 0).UTC(), + Pod: pod, + }) + drainAllWorkers(w) + + // terminate the pod (which won't complete) and then deliver a recreate by that same UID + w.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodKill, + StartTime: time.Unix(2, 0).UTC(), + Pod: pod, + }) + pod = staticPod() + pod.Annotations["version"] = "2" + w.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + StartTime: time.Unix(3, 0).UTC(), + Pod: pod, + }) + drainAllWorkers(w) + + // expect we get a pod sync record for kill that should have the default grace period + if actual, expected := records[pod.UID], []syncPodRecord{ + {name: "pod1", updateType: kubetypes.SyncPodCreate}, + {name: "pod1", updateType: kubetypes.SyncPodKill, gracePeriod: &one}, + }; !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + // pod worker is aware of pod1, but the kubelet will not have it in the pod manager + }, + wantWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + uid := types.UID("1") + if len(w.podSyncStatuses) != 1 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + s, ok := w.podSyncStatuses[uid] + if !ok || !s.IsTerminationRequested() || !s.IsTerminationStarted() || s.IsFinished() || s.IsWorking() || s.IsDeleted() || !s.restartRequested { + t.Errorf("unexpected requested pod termination: %#v", s) + } + + // expect we get a pod sync record 
for kill that should have the default grace period + if actual, expected := records[uid], []syncPodRecord{ + {name: "pod1", updateType: kubetypes.SyncPodCreate}, + {name: "pod1", updateType: kubetypes.SyncPodKill, gracePeriod: &one}, + }; !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + }, + expectMetrics: map[string]string{ + metrics.DesiredPodCount.FQName(): `# HELP kubelet_desired_pods [ALPHA] The number of pods the kubelet is being instructed to run. static is true if the pod is not from the apiserver. + # TYPE kubelet_desired_pods gauge + kubelet_desired_pods{static=""} 0 + kubelet_desired_pods{static="true"} 1 + `, + metrics.ActivePodCount.FQName(): `# HELP kubelet_active_pods [ALPHA] The number of pods the kubelet considers active and which are being considered when admitting new pods. static is true if the pod is not from the apiserver. + # TYPE kubelet_active_pods gauge + kubelet_active_pods{static=""} 0 + kubelet_active_pods{static="true"} 1 + `, + metrics.OrphanedRuntimePodTotal.FQName(): `# HELP kubelet_orphaned_runtime_pods_total [ALPHA] Number of pods that have been detected in the container runtime without being already known to the pod worker. This typically indicates the kubelet was restarted while a pod was force deleted in the API or in the local configuration, which is unusual. + # TYPE kubelet_orphaned_runtime_pods_total counter + kubelet_orphaned_runtime_pods_total 0 + `, + metrics.RestartedPodTotal.FQName(): `# HELP kubelet_restarted_pods_total [ALPHA] Number of pods that have been restarted because they were deleted and recreated with the same UID while the kubelet was watching them (common for static pods, extremely uncommon for API pods) + # TYPE kubelet_restarted_pods_total counter + kubelet_restarted_pods_total{static=""} 0 + kubelet_restarted_pods_total{static="true"} 0 + `, + metrics.WorkingPodCount.FQName(): `# HELP kubelet_working_pods [ALPHA] Number of pods the kubelet is actually running, broken down by lifecycle phase, whether the pod is desired, orphaned, or runtime only (also orphaned), and whether the pod is static. An orphaned pod has been removed from local configuration or force deleted in the API and consumes resources that are not otherwise visible. 
+ # TYPE kubelet_working_pods gauge + kubelet_working_pods{config="desired",lifecycle="sync",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="sync",static="true"} 0 + kubelet_working_pods{config="desired",lifecycle="terminated",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="terminated",static="true"} 0 + kubelet_working_pods{config="desired",lifecycle="terminating",static=""} 0 + kubelet_working_pods{config="desired",lifecycle="terminating",static="true"} 1 + kubelet_working_pods{config="orphan",lifecycle="sync",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="sync",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="terminated",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="terminated",static="true"} 0 + kubelet_working_pods{config="orphan",lifecycle="terminating",static=""} 0 + kubelet_working_pods{config="orphan",lifecycle="terminating",static="true"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="sync",static="unknown"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="terminated",static="unknown"} 0 + kubelet_working_pods{config="runtime_only",lifecycle="terminating",static="unknown"} 0 + `, + }, + expectMetricsAfterRetry: map[string]string{ + metrics.RestartedPodTotal.FQName(): `# HELP kubelet_restarted_pods_total [ALPHA] Number of pods that have been restarted because they were deleted and recreated with the same UID while the kubelet was watching them (common for static pods, extremely uncommon for API pods) + # TYPE kubelet_restarted_pods_total counter + kubelet_restarted_pods_total{static=""} 0 + kubelet_restarted_pods_total{static="true"} 1 + `, + }, + }, + { + name: "started pod that is not in config is force terminated during pod cleanup", + wantErr: false, + runtimePods: []*containertest.FakePod{ + { + Pod: runtimePod(simplePod()), + }, + }, + terminatingErr: errors.New("unable to terminate"), + prepareWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + // send a create of a static pod + pod := staticPod() + + w.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + StartTime: time.Unix(1, 0).UTC(), + Pod: pod, + }) + drainAllWorkers(w) + + // expect we get a pod sync record for kill that should have the default grace period + if actual, expected := records[pod.UID], []syncPodRecord{ + {name: "pod1", updateType: kubetypes.SyncPodCreate}, + }; !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + // pod worker is aware of pod1, but the kubelet will not have it in the pod manager + }, + wantWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + uid := types.UID("1") + if len(w.podSyncStatuses) != 1 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + s, ok := w.podSyncStatuses[uid] + if !ok || !s.IsTerminationRequested() || !s.IsTerminationStarted() || s.IsFinished() || s.IsWorking() || !s.IsDeleted() { + t.Errorf("unexpected requested pod termination: %#v", s) + } + + // expect we get a pod sync record for kill that should have the default grace period + if actual, expected := records[uid], []syncPodRecord{ + {name: "pod1", updateType: kubetypes.SyncPodCreate}, + {name: "pod1", updateType: kubetypes.SyncPodKill}, + }; !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + }, + }, + { + 
name: "started pod that is not in config or runtime is force terminated during pod cleanup", + wantErr: false, + runtimePods: []*containertest.FakePod{}, + terminatingErr: errors.New("unable to terminate"), + prepareWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + // send a create of a static pod + pod := staticPod() + + w.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + StartTime: time.Unix(1, 0).UTC(), + Pod: pod, + MirrorPod: mirrorPod(pod, "node-1", "node-uid-1"), + }) + drainAllWorkers(w) + + // expect we get a pod sync record for kill that should have the default grace period + if actual, expected := records[pod.UID], []syncPodRecord{ + {name: "pod1", updateType: kubetypes.SyncPodCreate}, + }; !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + // pod worker is aware of pod1, but the kubelet will not have it in the pod manager + }, + wantWorker: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + uid := types.UID("1") + if len(w.podSyncStatuses) != 1 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + s, ok := w.podSyncStatuses[uid] + if !ok || !s.IsTerminationRequested() || !s.IsTerminationStarted() || s.IsFinished() || s.IsWorking() || !s.IsDeleted() { + t.Errorf("unexpected requested pod termination: %#v", s) + } + + // ensure that we recorded the appropriate state for replays + expectedPod := staticPod() + if actual, expected := s.activeUpdate, (&UpdatePodOptions{ + Pod: expectedPod, + MirrorPod: mirrorPod(expectedPod, "node-1", "node-uid-1"), + }); !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod activeUpdate: %s", cmp.Diff(expected, actual)) + } + + // expect we get a pod sync record for kill that should have the default grace period + if actual, expected := records[uid], []syncPodRecord{ + {name: "pod1", updateType: kubetypes.SyncPodCreate}, + {name: "pod1", updateType: kubetypes.SyncPodKill}, + }; !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, cmp.AllowUnexported(syncPodRecord{}))) + } + }, + wantWorkerAfterRetry: func(t *testing.T, w *podWorkers, records map[types.UID][]syncPodRecord) { + uid := types.UID("1") + if len(w.podSyncStatuses) != 1 { + t.Fatalf("unexpected sync statuses: %#v", w.podSyncStatuses) + } + s, ok := w.podSyncStatuses[uid] + if !ok || !s.IsTerminationRequested() || !s.IsTerminationStarted() || !s.IsFinished() || s.IsWorking() || !s.IsDeleted() { + t.Errorf("unexpected requested pod termination: %#v", s) + } + + // ensure that we recorded the appropriate state for replays + expectedPod := staticPod() + if actual, expected := s.activeUpdate, (&UpdatePodOptions{ + Pod: expectedPod, + MirrorPod: mirrorPod(expectedPod, "node-1", "node-uid-1"), + }); !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod activeUpdate: %s", cmp.Diff(expected, actual)) + } + + // expect we get a pod sync record for kill that should have the default grace period + if actual, expected := records[uid], []syncPodRecord{ + {name: "pod1", updateType: kubetypes.SyncPodCreate}, + {name: "pod1", updateType: kubetypes.SyncPodKill}, + // second attempt at kill + {name: "pod1", updateType: kubetypes.SyncPodKill}, + {name: "pod1", terminated: true}, + }; !reflect.DeepEqual(expected, actual) { + t.Fatalf("unexpected pod sync records: %s", cmp.Diff(expected, actual, 
cmp.AllowUnexported(syncPodRecord{}))) + } + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // clear the metrics for testing + metrics.Register() + for _, metric := range []interface{ Reset() }{ + metrics.DesiredPodCount, + metrics.ActivePodCount, + metrics.RestartedPodTotal, + metrics.OrphanedRuntimePodTotal, + metrics.WorkingPodCount, + } { + metric.Reset() + } + metrics.MirrorPodCount.Set(0) + + testKubelet := newTestKubelet(t, false) + defer testKubelet.Cleanup() + kl := testKubelet.kubelet + + podWorkers, _, processed := createPodWorkers() + kl.podWorkers = podWorkers + originalPodSyncer := podWorkers.podSyncer + syncFuncs := newPodSyncerFuncs(originalPodSyncer) + podWorkers.podSyncer = &syncFuncs + if tt.terminatingErr != nil { + syncFuncs.syncTerminatingPod = func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error { + t.Logf("called syncTerminatingPod") + if err := originalPodSyncer.SyncTerminatingPod(ctx, pod, podStatus, gracePeriod, podStatusFn); err != nil { + t.Fatalf("unexpected error in syncTerminatingPodFn: %v", err) + } + return tt.terminatingErr + } + syncFuncs.syncTerminatingRuntimePod = func(ctx context.Context, runningPod *kubecontainer.Pod) error { + if err := originalPodSyncer.SyncTerminatingRuntimePod(ctx, runningPod); err != nil { + t.Fatalf("unexpected error in syncTerminatingRuntimePodFn: %v", err) + } + return tt.terminatingErr + } + } + if tt.prepareWorker != nil { + tt.prepareWorker(t, podWorkers, processed) + } + + testKubelet.fakeRuntime.PodList = tt.runtimePods + kl.podManager.SetPods(tt.pods) + + for _, reject := range tt.rejectedPods { + pod, ok := kl.podManager.GetPodByUID(reject.uid) + if !ok { + t.Fatalf("unable to reject pod by UID %v", reject.uid) + } + kl.rejectPod(pod, reject.reason, reject.message) + } + + if err := kl.HandlePodCleanups(context.Background()); (err != nil) != tt.wantErr { + t.Errorf("Kubelet.HandlePodCleanups() error = %v, wantErr %v", err, tt.wantErr) + } + drainAllWorkers(podWorkers) + if tt.wantWorker != nil { + tt.wantWorker(t, podWorkers, processed) + } + + for k, v := range tt.expectMetrics { + testMetric(t, k, v) + } + + // check after the terminating error clears + if tt.wantWorkerAfterRetry != nil { + podWorkers.podSyncer = originalPodSyncer + if err := kl.HandlePodCleanups(context.Background()); (err != nil) != tt.wantErr { + t.Errorf("Kubelet.HandlePodCleanups() second error = %v, wantErr %v", err, tt.wantErr) + } + drainAllWorkers(podWorkers) + tt.wantWorkerAfterRetry(t, podWorkers, processed) + + for k, v := range tt.expectMetricsAfterRetry { + testMetric(t, k, v) + } + } + }) + } +} + +func testMetric(t *testing.T, metricName string, expectedMetric string) { + t.Helper() + err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(expectedMetric), metricName) + if err != nil { + t.Error(err) + } +} diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index 23f6ec74739..971632a92c1 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -269,7 +269,7 @@ func newTestKubeletWithImageList( kubelet.reasonCache = NewReasonCache() kubelet.podCache = containertest.NewFakeCache(kubelet.containerRuntime) kubelet.podWorkers = &fakePodWorkers{ - syncPodFn: kubelet.syncPod, + syncPodFn: kubelet.SyncPod, cache: kubelet.podCache, t: t, } @@ -1348,7 +1348,7 @@ func TestCreateMirrorPod(t *testing.T) { pod.Annotations[kubetypes.ConfigSourceAnnotationKey] = "file" pods := 
[]*v1.Pod{pod} kl.podManager.SetPods(pods) - isTerminal, err := kl.syncPod(context.Background(), tt.updateType, pod, nil, &kubecontainer.PodStatus{}) + isTerminal, err := kl.SyncPod(context.Background(), tt.updateType, pod, nil, &kubecontainer.PodStatus{}) assert.NoError(t, err) if isTerminal { t.Fatalf("pod should not be terminal: %#v", pod) @@ -1384,7 +1384,7 @@ func TestDeleteOutdatedMirrorPod(t *testing.T) { pods := []*v1.Pod{pod, mirrorPod} kl.podManager.SetPods(pods) - isTerminal, err := kl.syncPod(context.Background(), kubetypes.SyncPodUpdate, pod, mirrorPod, &kubecontainer.PodStatus{}) + isTerminal, err := kl.SyncPod(context.Background(), kubetypes.SyncPodUpdate, pod, mirrorPod, &kubecontainer.PodStatus{}) assert.NoError(t, err) if isTerminal { t.Fatalf("pod should not be terminal: %#v", pod) @@ -1546,7 +1546,7 @@ func TestNetworkErrorsWithoutHostNetwork(t *testing.T) { }) kubelet.podManager.SetPods([]*v1.Pod{pod}) - isTerminal, err := kubelet.syncPod(context.Background(), kubetypes.SyncPodUpdate, pod, nil, &kubecontainer.PodStatus{}) + isTerminal, err := kubelet.SyncPod(context.Background(), kubetypes.SyncPodUpdate, pod, nil, &kubecontainer.PodStatus{}) assert.Error(t, err, "expected pod with hostNetwork=false to fail when network in error") if isTerminal { t.Fatalf("pod should not be terminal: %#v", pod) @@ -1554,7 +1554,7 @@ func TestNetworkErrorsWithoutHostNetwork(t *testing.T) { pod.Annotations[kubetypes.ConfigSourceAnnotationKey] = kubetypes.FileSource pod.Spec.HostNetwork = true - isTerminal, err = kubelet.syncPod(context.Background(), kubetypes.SyncPodUpdate, pod, nil, &kubecontainer.PodStatus{}) + isTerminal, err = kubelet.SyncPod(context.Background(), kubetypes.SyncPodUpdate, pod, nil, &kubecontainer.PodStatus{}) assert.NoError(t, err, "expected pod with hostNetwork=true to succeed when network in error") if isTerminal { t.Fatalf("pod should not be terminal: %#v", pod) @@ -2679,7 +2679,7 @@ func TestSyncTerminatingPodKillPod(t *testing.T) { kl.podManager.SetPods(pods) podStatus := &kubecontainer.PodStatus{ID: pod.UID} gracePeriodOverride := int64(0) - err := kl.syncTerminatingPod(context.Background(), pod, podStatus, nil, &gracePeriodOverride, func(podStatus *v1.PodStatus) { + err := kl.SyncTerminatingPod(context.Background(), pod, podStatus, &gracePeriodOverride, func(podStatus *v1.PodStatus) { podStatus.Phase = v1.PodFailed podStatus.Reason = "reason" podStatus.Message = "message" diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go index 389e1b151d3..ca3dc026784 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go @@ -428,7 +428,7 @@ func (m *kubeGenericRuntimeManager) GetPods(ctx context.Context, all bool) ([]*k sort.SliceStable(result, func(i, j int) bool { return result[i].CreatedAt > result[j].CreatedAt }) - + klog.V(4).InfoS("Retrieved pods from runtime", "all", all) return result, nil } diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go index 04e37c6d7d1..5db31d5d213 100644 --- a/pkg/kubelet/metrics/metrics.go +++ b/pkg/kubelet/metrics/metrics.go @@ -55,6 +55,12 @@ const ( VolumeStatsHealthStatusAbnormalKey = "volume_stats_health_status_abnormal" RunningPodsKey = "running_pods" RunningContainersKey = "running_containers" + DesiredPodCountKey = "desired_pods" + ActivePodCountKey = "active_pods" + MirrorPodCountKey = "mirror_pods" + WorkingPodCountKey = "working_pods" + OrphanedRuntimePodTotalKey = 
"orphaned_runtime_pods_total" + RestartedPodTotalKey = "restarted_pods_total" // Metrics keys of remote runtime operations RuntimeOperationsKey = "runtime_operations_total" @@ -438,6 +444,64 @@ var ( }, []string{"container_state"}, ) + // DesiredPodCount tracks the count of pods the Kubelet thinks it should be running + DesiredPodCount = metrics.NewGaugeVec( + &metrics.GaugeOpts{ + Subsystem: KubeletSubsystem, + Name: DesiredPodCountKey, + Help: "The number of pods the kubelet is being instructed to run. static is true if the pod is not from the apiserver.", + StabilityLevel: metrics.ALPHA, + }, + []string{"static"}, + ) + // ActivePodCount tracks the count of pods the Kubelet considers as active when deciding to admit a new pod + ActivePodCount = metrics.NewGaugeVec( + &metrics.GaugeOpts{ + Subsystem: KubeletSubsystem, + Name: ActivePodCountKey, + Help: "The number of pods the kubelet considers active and which are being considered when admitting new pods. static is true if the pod is not from the apiserver.", + StabilityLevel: metrics.ALPHA, + }, + []string{"static"}, + ) + // MirrorPodCount tracks the number of mirror pods the Kubelet should have created for static pods + MirrorPodCount = metrics.NewGauge( + &metrics.GaugeOpts{ + Subsystem: KubeletSubsystem, + Name: MirrorPodCountKey, + Help: "The number of mirror pods the kubelet will try to create (one per admitted static pod)", + StabilityLevel: metrics.ALPHA, + }, + ) + // WorkingPodCount tracks the count of pods in each lifecycle phase, whether they are static pods, and whether they are desired, orphaned, or runtime_only + WorkingPodCount = metrics.NewGaugeVec( + &metrics.GaugeOpts{ + Subsystem: KubeletSubsystem, + Name: WorkingPodCountKey, + Help: "Number of pods the kubelet is actually running, broken down by lifecycle phase, whether the pod is desired, orphaned, or runtime only (also orphaned), and whether the pod is static. An orphaned pod has been removed from local configuration or force deleted in the API and consumes resources that are not otherwise visible.", + StabilityLevel: metrics.ALPHA, + }, + []string{"lifecycle", "config", "static"}, + ) + // OrphanedRuntimePodTotal is incremented every time a pod is detected in the runtime without being known to the pod worker first + OrphanedRuntimePodTotal = metrics.NewCounter( + &metrics.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: OrphanedRuntimePodTotalKey, + Help: "Number of pods that have been detected in the container runtime without being already known to the pod worker. 
This typically indicates the kubelet was restarted while a pod was force deleted in the API or in the local configuration, which is unusual.", + StabilityLevel: metrics.ALPHA, + }, + ) + // RestartedPodTotal is incremented every time a pod with the same UID is deleted and recreated + RestartedPodTotal = metrics.NewCounterVec( + &metrics.CounterOpts{ + Subsystem: KubeletSubsystem, + Name: RestartedPodTotalKey, + Help: "Number of pods that have been restarted because they were deleted and recreated with the same UID while the kubelet was watching them (common for static pods, extremely uncommon for API pods)", + StabilityLevel: metrics.ALPHA, + }, + []string{"static"}, + ) // StartedPodsTotal is a counter that tracks pod sandbox creation operations StartedPodsTotal = metrics.NewCounter( &metrics.CounterOpts{ @@ -615,6 +679,12 @@ func Register(collectors ...metrics.StableCollector) { legacyregistry.MustRegister(DevicePluginAllocationDuration) legacyregistry.MustRegister(RunningContainerCount) legacyregistry.MustRegister(RunningPodCount) + legacyregistry.MustRegister(DesiredPodCount) + legacyregistry.MustRegister(ActivePodCount) + legacyregistry.MustRegister(MirrorPodCount) + legacyregistry.MustRegister(WorkingPodCount) + legacyregistry.MustRegister(OrphanedRuntimePodTotal) + legacyregistry.MustRegister(RestartedPodTotal) legacyregistry.MustRegister(ManagedEphemeralContainers) if utilfeature.DefaultFeatureGate.Enabled(features.KubeletPodResources) { legacyregistry.MustRegister(PodResourcesEndpointRequestsTotalCount) diff --git a/pkg/kubelet/pod_workers.go b/pkg/kubelet/pod_workers.go index c43cbeb6f14..5c4042a4234 100644 --- a/pkg/kubelet/pod_workers.go +++ b/pkg/kubelet/pod_workers.go @@ -36,6 +36,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/metrics" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/kubelet/util/queue" + "k8s.io/utils/clock" ) // OnCompleteFunc is a function that is invoked when an operation completes. @@ -86,24 +87,8 @@ type UpdatePodOptions struct { KillPodOptions *KillPodOptions } -// PodWorkType classifies the three phases of pod lifecycle - setup (sync), -// teardown of containers (terminating), cleanup (terminated). -type PodWorkType int - -const ( - // SyncPodWork is when the pod is expected to be started and running. - SyncPodWork PodWorkType = iota - // TerminatingPodWork is when the pod is no longer being set up, but some - // containers may be running and are being torn down. - TerminatingPodWork - // TerminatedPodWork indicates the pod is stopped, can have no more running - // containers, and any foreground cleanup can be executed. - TerminatedPodWork -) - // PodWorkType classifies the status of pod as seen by the pod worker - setup (sync), -// teardown of containers (terminating), cleanup (terminated), or recreated with the -// same UID (kill -> create while terminating) +// teardown of containers (terminating), or cleanup (terminated). type PodWorkerState int const ( @@ -115,17 +100,39 @@ const ( // TerminatedPod indicates the pod is stopped, can have no more running // containers, and any foreground cleanup can be executed. TerminatedPod - // TerminatedAndRecreatedPod indicates that after the pod was terminating a - // request to recreate the pod was received. The pod is terminated and can - // now be restarted by sending a create event to the pod worker. 
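The expectMetrics strings in the tests above are Prometheus exposition text compared against the gathered registry, and the new gauges and counters defined here follow the usual labeled-vector shape. As a minimal, self-contained sketch of that round trip, assuming only the upstream client_golang library (which the component-base wrappers build on) and an illustrative kubelet_working_pods gauge rather than the kubelet's real registry:

package main

import (
	"fmt"
	"strings"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	// A registry and a gauge vector shaped like kubelet_working_pods
	// (illustrative only; the kubelet uses the component-base wrappers).
	reg := prometheus.NewRegistry()
	working := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "kubelet_working_pods",
		Help: "Number of pods the kubelet is actually running.",
	}, []string{"config", "lifecycle", "static"})
	reg.MustRegister(working)

	// Record one desired, syncing, non-static pod.
	working.WithLabelValues("desired", "sync", "").Set(1)

	// Compare against exposition-format text, much like the testMetric
	// helper above does with component-base's testutil.GatherAndCompare.
	expected := `
# HELP kubelet_working_pods Number of pods the kubelet is actually running.
# TYPE kubelet_working_pods gauge
kubelet_working_pods{config="desired",lifecycle="sync",static=""} 1
`
	if err := testutil.GatherAndCompare(reg, strings.NewReader(expected), "kubelet_working_pods"); err != nil {
		fmt.Println("mismatch:", err)
		return
	}
	fmt.Println("metrics match")
}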
- TerminatedAndRecreatedPod ) +func (state PodWorkerState) String() string { + switch state { + case SyncPod: + return "sync" + case TerminatingPod: + return "terminating" + case TerminatedPod: + return "terminated" + default: + panic(fmt.Sprintf("the state %d is not defined", state)) + } +} + +// PodWorkerSync is the summarization of a single pod worker for sync. Values +// besides state are used to provide metric counts for operators. +type PodWorkerSync struct { + // State of the pod. + State PodWorkerState + // Orphan is true if the pod is no longer in the desired set passed to SyncKnownPods. + Orphan bool + // HasConfig is true if we have a historical pod spec for this pod. + HasConfig bool + // Static is true if we have config and the pod came from a static source. + Static bool +} + // podWork is the internal changes type podWork struct { // WorkType is the type of sync to perform - sync (create), terminating (stop // containers), terminated (clean up and write status). - WorkType PodWorkType + WorkType PodWorkerState // Options contains the data to sync. Options UpdatePodOptions @@ -147,8 +154,10 @@ type PodWorkers interface { // and have been terminated for a significant period of time. Once this method // has been called once, the workers are assumed to be fully initialized and // subsequent calls to ShouldPodContentBeRemoved on unknown pods will return - // true. It returns a map describing the state of each known pod worker. - SyncKnownPods(desiredPods []*v1.Pod) map[types.UID]PodWorkerState + // true. It returns a map describing the state of each known pod worker. It + // is the responsibility of the caller to re-add any desired pods that are not + // returned as knownPods. + SyncKnownPods(desiredPods []*v1.Pod) (knownPods map[types.UID]PodWorkerSync) // IsPodKnownTerminated returns true if the provided pod UID is known by the pod // worker to be terminated. If the pod has been force deleted and the pod worker @@ -217,15 +226,72 @@ type PodWorkers interface { IsPodForMirrorPodTerminatingByFullName(podFullname string) bool } -// the function to invoke to perform a sync (reconcile the kubelet state to the desired shape of the pod) +// podSyncer describes the core lifecyle operations of the pod state machine. A pod is first +// synced until it naturally reaches termination (true is returned) or an external agent decides +// the pod should be terminated. Once a pod should be terminating, SyncTerminatingPod is invoked +// until it returns no error. Then the SyncTerminatedPod method is invoked until it exits without +// error, and the pod is considered terminal. Implementations of this interface must be threadsafe +// for simultaneous invocation of these methods for multiple pods. +type podSyncer interface { + // SyncPod configures the pod and starts and restarts all containers. If it returns true, the + // pod has reached a terminal state and the presence of the error indicates succeeded or failed. + // If an error is returned, the sync was not successful and should be rerun in the future. This + // is a long running method and should exit early with context.Canceled if the context is canceled. + SyncPod(ctx context.Context, updateType kubetypes.SyncPodType, pod *v1.Pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, error) + // SyncTerminatingPod attempts to ensure the pod's containers are no longer running and to collect + // any final status. This method is repeatedly invoked with diminishing grace periods until it exits + // without error. 
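SyncKnownPods now returns a PodWorkerSync per known pod, which is what lets the caller derive the working-pod breakdown reported in the metrics above. The sketch below mirrors those types locally and shows one plausible way to fold the returned map into per-label counts; the bucketing into desired/orphan/runtime_only follows the label values visible in the metric expectations, but the aggregation itself is an assumption rather than the kubelet's exact code.

package main

import "fmt"

// Local mirrors of the types introduced above, for illustration only.
type PodWorkerState int

const (
	SyncPod PodWorkerState = iota
	TerminatingPod
	TerminatedPod
)

func (s PodWorkerState) String() string {
	switch s {
	case SyncPod:
		return "sync"
	case TerminatingPod:
		return "terminating"
	default:
		return "terminated"
	}
}

type PodWorkerSync struct {
	State     PodWorkerState
	Orphan    bool
	HasConfig bool
	Static    bool
}

// countKey matches the label set of kubelet_working_pods.
type countKey struct {
	lifecycle, config, static string
}

func main() {
	// Hypothetical result of SyncKnownPods, keyed here by a plain string UID.
	known := map[string]PodWorkerSync{
		"uid-1": {State: SyncPod, HasConfig: true},
		"uid-2": {State: TerminatingPod, Orphan: true, HasConfig: true, Static: true},
		"uid-3": {State: TerminatingPod, Orphan: true}, // runtime only: no config
	}

	counts := map[countKey]int{}
	for _, ps := range known {
		key := countKey{lifecycle: ps.State.String()}
		switch {
		case !ps.HasConfig:
			key.config, key.static = "runtime_only", "unknown"
		case ps.Orphan:
			key.config = "orphan"
		default:
			key.config = "desired"
		}
		if ps.HasConfig && ps.Static {
			key.static = "true"
		}
		counts[key]++
	}
	for key, n := range counts {
		fmt.Printf("working_pods{config=%q,lifecycle=%q,static=%q} %d\n",
			key.config, key.lifecycle, key.static, n)
	}
}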
Once this method exits with no error other components are allowed to tear down + // supporting resources like volumes and devices. If the context is canceled, the method should + // return context.Canceled unless it has successfully finished, which may occur when a shorter + // grace period is detected. + SyncTerminatingPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error + // SyncTerminatingRuntimePod is invoked when running containers are found that correspond to + // a pod that is no longer known to the kubelet to terminate those containers. It should not + // exit without error unless all containers are known to be stopped. + SyncTerminatingRuntimePod(ctx context.Context, runningPod *kubecontainer.Pod) error + // SyncTerminatedPod is invoked after all running containers are stopped and is responsible + // for releasing resources that should be executed right away rather than in the background. + // Once it exits without error the pod is considered finished on the node. + SyncTerminatedPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error +} + type syncPodFnType func(ctx context.Context, updateType kubetypes.SyncPodType, pod *v1.Pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, error) - -// the function to invoke to terminate a pod (ensure no running processes are present) -type syncTerminatingPodFnType func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, runningPod *kubecontainer.Pod, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error - -// the function to invoke to cleanup a pod that is terminated +type syncTerminatingPodFnType func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error +type syncTerminatingRuntimePodFnType func(ctx context.Context, runningPod *kubecontainer.Pod) error type syncTerminatedPodFnType func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error +// podSyncerFuncs implements podSyncer and accepts functions for each method. 
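The podSyncer contract is a small state machine: SyncPod runs until the pod is terminal or termination is requested, SyncTerminatingPod is retried until it succeeds, and SyncTerminatedPod performs the final cleanup. A minimal sketch of that ordering follows, with deliberately simplified signatures (the real methods also take the pod spec, cached status, and grace period) and a fake syncer that fails termination once:

package main

import (
	"context"
	"errors"
	"fmt"
)

// simplifiedSyncer is a pared-down stand-in for the podSyncer interface above.
type simplifiedSyncer interface {
	SyncPod(ctx context.Context) (isTerminal bool, err error)
	SyncTerminatingPod(ctx context.Context) error
	SyncTerminatedPod(ctx context.Context) error
}

// drive walks one pod through the ordering the interface implies: sync until
// terminal, terminate until success, then final cleanup. The real pod worker
// does this across goroutine wake-ups with backoff rather than a tight loop;
// this only illustrates the contract.
func drive(ctx context.Context, s simplifiedSyncer) error {
	for {
		terminal, err := s.SyncPod(ctx)
		if err != nil {
			continue // a real worker would requeue with backoff
		}
		if terminal {
			break
		}
	}
	for s.SyncTerminatingPod(ctx) != nil {
		// retried with diminishing grace periods until it succeeds
	}
	return s.SyncTerminatedPod(ctx)
}

// fakeSyncer becomes terminal after two sync calls and needs two attempts to
// finish termination, imitating a transient runtime error.
type fakeSyncer struct{ syncs, kills int }

func (f *fakeSyncer) SyncPod(ctx context.Context) (bool, error) {
	f.syncs++
	fmt.Println("SyncPod attempt", f.syncs)
	return f.syncs >= 2, nil
}

func (f *fakeSyncer) SyncTerminatingPod(ctx context.Context) error {
	f.kills++
	fmt.Println("SyncTerminatingPod attempt", f.kills)
	if f.kills < 2 {
		return errors.New("unable to terminate")
	}
	return nil
}

func (f *fakeSyncer) SyncTerminatedPod(ctx context.Context) error {
	fmt.Println("SyncTerminatedPod: releasing resources")
	return nil
}

func main() {
	if err := drive(context.Background(), &fakeSyncer{}); err != nil {
		fmt.Println("cleanup failed:", err)
		return
	}
	fmt.Println("pod is terminal")
}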
+type podSyncerFuncs struct { + syncPod syncPodFnType + syncTerminatingPod syncTerminatingPodFnType + syncTerminatingRuntimePod syncTerminatingRuntimePodFnType + syncTerminatedPod syncTerminatedPodFnType +} + +func newPodSyncerFuncs(s podSyncer) podSyncerFuncs { + return podSyncerFuncs{ + syncPod: s.SyncPod, + syncTerminatingPod: s.SyncTerminatingPod, + syncTerminatingRuntimePod: s.SyncTerminatingRuntimePod, + syncTerminatedPod: s.SyncTerminatedPod, + } +} + +var _ podSyncer = podSyncerFuncs{} + +func (f podSyncerFuncs) SyncPod(ctx context.Context, updateType kubetypes.SyncPodType, pod *v1.Pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, error) { + return f.syncPod(ctx, updateType, pod, mirrorPod, podStatus) +} +func (f podSyncerFuncs) SyncTerminatingPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error { + return f.syncTerminatingPod(ctx, pod, podStatus, gracePeriod, podStatusFn) +} +func (f podSyncerFuncs) SyncTerminatingRuntimePod(ctx context.Context, runningPod *kubecontainer.Pod) error { + return f.syncTerminatingRuntimePod(ctx, runningPod) +} +func (f podSyncerFuncs) SyncTerminatedPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error { + return f.syncTerminatedPod(ctx, pod, podStatus) +} + const ( // jitter factor for resyncInterval workerResyncIntervalJitterFactor = 0.5 @@ -241,36 +307,69 @@ const ( // worker sync (setup, terminating, terminated). type podSyncStatus struct { // ctx is the context that is associated with the current pod sync. + // TODO: remove this from the struct by having the context initialized + // in startPodSync, the cancelFn used by UpdatePod, and cancellation of + // a parent context for tearing down workers (if needed) on shutdown ctx context.Context - // cancelFn if set is expected to cancel the current sync*Pod operation. + // cancelFn if set is expected to cancel the current podSyncer operation. cancelFn context.CancelFunc - // working is true if a pod worker is currently in a sync method. - working bool + // fullname of the pod fullname string + // working is true if an update is pending or being worked by a pod worker + // goroutine. + working bool + // pendingUpdate is the updated state the pod worker should observe. It is + // cleared and moved to activeUpdate when a pod worker reads it. A new update + // may always replace a pending update as the pod worker does not guarantee + // that all intermediate states are synced to a worker, only the most recent. + // This state will not be visible to downstream components until a pod worker + // has begun processing it. + pendingUpdate *UpdatePodOptions + // activeUpdate is the most recent version of the pod's state that will be + // passed to a sync*Pod function. A pod becomes visible to downstream components + // once a worker decides to start a pod (startedAt is set). The pod and mirror + // pod fields are accumulated if they are missing on a particular call (the last + // known version), and the value of KillPodOptions is accumulated as pods cannot + // have their grace period shortened. This is the source of truth for the pod spec + // the kubelet is reconciling towards for all components that act on running pods. + activeUpdate *UpdatePodOptions + // syncedAt is the time at which the pod worker first observed this pod. syncedAt time.Time + // startedAt is the time at which the pod worker allowed the pod to start. 
+ startedAt time.Time // terminatingAt is set once the pod is requested to be killed - note that // this can be set before the pod worker starts terminating the pod, see // terminating. terminatingAt time.Time + // terminatedAt is set once the pod worker has completed a successful + // syncTerminatingPod call and means all running containers are stopped. + terminatedAt time.Time + // gracePeriod is the requested gracePeriod once terminatingAt is nonzero. + gracePeriod int64 + // notifyPostTerminating will be closed once the pod transitions to + // terminated. After the pod is in terminated state, nothing should be + // added to this list. + notifyPostTerminating []chan<- struct{} + // statusPostTerminating is a list of the status changes associated + // with kill pod requests. After the pod is in terminated state, nothing + // should be added to this list. The worker will execute the last function + // in this list on each termination attempt. + statusPostTerminating []PodStatusFunc + // startedTerminating is true once the pod worker has observed the request to // stop a pod (exited syncPod and observed a podWork with WorkType - // TerminatingPodWork). Once this is set, it is safe for other components + // TerminatingPod). Once this is set, it is safe for other components // of the kubelet to assume that no other containers may be started. startedTerminating bool // deleted is true if the pod has been marked for deletion on the apiserver // or has no configuration represented (was deleted before). deleted bool - // gracePeriod is the requested gracePeriod once terminatingAt is nonzero. - gracePeriod int64 // evicted is true if the kill indicated this was an eviction (an evicted // pod can be more aggressively cleaned up). evicted bool - // terminatedAt is set once the pod worker has completed a successful - // syncTerminatingPod call and means all running containers are stopped. - terminatedAt time.Time // finished is true once the pod worker completes for a pod // (syncTerminatedPod exited with no errors) until SyncKnownPods is invoked // to remove the pod. A terminal pod (Succeeded/Failed) will have @@ -281,15 +380,12 @@ type podSyncStatus struct { // it has been killed. When known pods are synced, any pod that is // terminated and has restartRequested will have its history cleared. restartRequested bool - // notifyPostTerminating will be closed once the pod transitions to - // terminated. After the pod is in terminated state, nothing should be - // added to this list. - notifyPostTerminating []chan<- struct{} - // statusPostTerminating is a list of the status changes associated - // with kill pod requests. After the pod is in terminated state, nothing - // should be added to this list. The worker will execute the last function - // in this list on each termination attempt. - statusPostTerminating []PodStatusFunc + // observedRuntime is true if the pod has been observed to be present in the + // runtime. A pod that has been observed at runtime must go through either + // SyncTerminatingRuntimePod or SyncTerminatingPod. Otherwise, we can avoid + // invoking the terminating methods if the pod is deleted or orphaned before + // it has been started. 
+ observedRuntime bool } func (s *podSyncStatus) IsWorking() bool { return s.working } @@ -299,6 +395,58 @@ func (s *podSyncStatus) IsTerminated() bool { return !s.terminatedAt.I func (s *podSyncStatus) IsFinished() bool { return s.finished } func (s *podSyncStatus) IsEvicted() bool { return s.evicted } func (s *podSyncStatus) IsDeleted() bool { return s.deleted } +func (s *podSyncStatus) IsStarted() bool { return !s.startedAt.IsZero() } + +// WorkType returns this pods' current state of the pod in pod lifecycle state machine. +func (s *podSyncStatus) WorkType() PodWorkerState { + if s.IsTerminated() { + return TerminatedPod + } + if s.IsTerminationRequested() { + return TerminatingPod + } + return SyncPod +} + +// mergeLastUpdate records the most recent state from a new update. Pod and MirrorPod are +// incremented. KillPodOptions is accumulated. If RunningPod is set, Pod is synthetic and +// will *not* be used as the last pod state unless no previous pod state exists (because +// the pod worker may be responsible for terminating a pod from a previous run of the +// kubelet where no config state is visible). The contents of activeUpdate are used as the +// source of truth for components downstream of the pod workers. +func (s *podSyncStatus) mergeLastUpdate(other UpdatePodOptions) { + opts := s.activeUpdate + if opts == nil { + opts = &UpdatePodOptions{} + s.activeUpdate = opts + } + + // UpdatePodOptions states (and UpdatePod enforces) that either Pod or RunningPod + // is set, and we wish to preserve the most recent Pod we have observed, so only + // overwrite our Pod when we have no Pod or when RunningPod is nil. + if opts.Pod == nil || other.RunningPod == nil { + opts.Pod = other.Pod + } + // running pods will not persist but will be remembered for replay + opts.RunningPod = other.RunningPod + // if mirrorPod was not provided, remember the last one for replay + if other.MirrorPod != nil { + opts.MirrorPod = other.MirrorPod + } + // accumulate kill pod options + if other.KillPodOptions != nil { + opts.KillPodOptions = &KillPodOptions{} + if other.KillPodOptions.Evict { + opts.KillPodOptions.Evict = true + } + if override := other.KillPodOptions.PodTerminationGracePeriodSecondsOverride; override != nil { + value := *override + opts.KillPodOptions.PodTerminationGracePeriodSecondsOverride = &value + } + } + // StartTime is not copied - that is purely for tracking latency of config propagation + // from kubelet to pod worker. +} // podWorkers keeps track of operations on pods and ensures each pod is // reconciled with the container runtime and other subsystems. The worker @@ -306,6 +454,16 @@ func (s *podSyncStatus) IsDeleted() bool { return s.deleted } // shutting down but still have running containers, and which pods have // terminated recently and are guaranteed to have no running containers. // +// podWorkers is the source of truth for what pods should be active on a +// node at any time, and is kept up to date with the desired state of the +// node (tracked by the kubelet pod config loops and the state in the +// kubelet's podManager) via the UpdatePod method. Components that act +// upon running pods should look to the pod worker for state instead of the +// kubelet podManager. The pod worker is periodically reconciled with the +// state of the podManager via SyncKnownPods() and is responsible for +// ensuring the completion of all observed pods no longer present in +// the podManager (no longer part of the node's desired config). 
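mergeLastUpdate is what makes activeUpdate a safe source of truth: a runtime-only kill must not erase the last real pod spec, and the most recent mirror pod is remembered for replay. A small self-contained demonstration of those rules, using stand-in types rather than the real UpdatePodOptions:

package main

import "fmt"

// Minimal stand-ins for the option types above (illustrative only).
type pod struct{ Name, Version string }
type runningPod struct{ ID string }

type updateOpts struct {
	Pod        *pod
	RunningPod *runningPod
	MirrorPod  *pod
}

// mergeLastUpdate mirrors the rules described above: keep the most recent real
// Pod (a runtime-only update must not overwrite it), always remember the
// RunningPod for replay, and keep the last MirrorPod seen.
func mergeLastUpdate(active *updateOpts, other updateOpts) {
	if active.Pod == nil || other.RunningPod == nil {
		active.Pod = other.Pod
	}
	active.RunningPod = other.RunningPod
	if other.MirrorPod != nil {
		active.MirrorPod = other.MirrorPod
	}
}

func main() {
	active := &updateOpts{}

	// A real config update records the pod spec and mirror pod.
	mergeLastUpdate(active, updateOpts{
		Pod:       &pod{Name: "pod1", Version: "2"},
		MirrorPod: &pod{Name: "pod1-mirror"},
	})

	// A later runtime-only kill keeps the previously observed spec.
	mergeLastUpdate(active, updateOpts{RunningPod: &runningPod{ID: "uid-1"}})

	fmt.Println(active.Pod.Version, active.MirrorPod.Name, active.RunningPod.ID)
	// Output: 2 pod1-mirror uid-1
}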
+// // A pod passed to a pod worker is either being synced (expected to be // running), terminating (has running containers but no new containers are // expected to start), terminated (has no running containers but may still @@ -382,15 +540,16 @@ type podWorkers struct { // podsSynced is true once the pod worker has been synced at least once, // which means that all working pods have been started via UpdatePod(). podsSynced bool + // Tracks all running per-pod goroutines - per-pod goroutine will be - // processing updates received through its corresponding channel. - podUpdates map[types.UID]chan podWork - // Tracks the last undelivered work item for this pod - a work item is - // undelivered if it comes in while the worker is working. - lastUndeliveredWorkUpdate map[types.UID]podWork + // processing updates received through its corresponding channel. Sending + // a message on this channel will signal the corresponding goroutine to + // consume podSyncStatuses[uid].pendingUpdate if set. + podUpdates map[types.UID]chan struct{} // Tracks by UID the termination status of a pod - syncing, terminating, // terminated, and evicted. podSyncStatuses map[types.UID]*podSyncStatus + // Tracks all uids for started static pods by full name startedStaticPodsByFullname map[string]types.UID // Tracks all uids for static pods that are waiting to start by full name @@ -401,15 +560,12 @@ type podWorkers struct { // This function is run to sync the desired state of pod. // NOTE: This function has to be thread-safe - it can be called for // different pods at the same time. - - syncPodFn syncPodFnType - syncTerminatingPodFn syncTerminatingPodFnType - syncTerminatedPodFn syncTerminatedPodFnType + podSyncer podSyncer // workerChannelFn is exposed for testing to allow unit tests to impose delays // in channel communication. The function is invoked once each time a new worker // goroutine starts. - workerChannelFn func(uid types.UID, in chan podWork) (out <-chan podWork) + workerChannelFn func(uid types.UID, in chan struct{}) (out <-chan struct{}) // The EventRecorder to use recorder record.EventRecorder @@ -422,12 +578,13 @@ type podWorkers struct { // podCache stores kubecontainer.PodStatus for all pods. podCache kubecontainer.Cache + + // clock is used for testing timing + clock clock.PassiveClock } func newPodWorkers( - syncPodFn syncPodFnType, - syncTerminatingPodFn syncTerminatingPodFnType, - syncTerminatedPodFn syncTerminatedPodFnType, + podSyncer podSyncer, recorder record.EventRecorder, workQueue queue.WorkQueue, resyncInterval, backOffPeriod time.Duration, @@ -435,18 +592,16 @@ func newPodWorkers( ) PodWorkers { return &podWorkers{ podSyncStatuses: map[types.UID]*podSyncStatus{}, - podUpdates: map[types.UID]chan podWork{}, - lastUndeliveredWorkUpdate: map[types.UID]podWork{}, + podUpdates: map[types.UID]chan struct{}{}, startedStaticPodsByFullname: map[string]types.UID{}, waitingToStartStaticPodsByFullname: map[string][]types.UID{}, - syncPodFn: syncPodFn, - syncTerminatingPodFn: syncTerminatingPodFn, - syncTerminatedPodFn: syncTerminatedPodFn, + podSyncer: podSyncer, recorder: recorder, workQueue: workQueue, resyncInterval: resyncInterval, backOffPeriod: backOffPeriod, podCache: podCache, + clock: clock.RealClock{}, } } @@ -552,46 +707,52 @@ func isPodStatusCacheTerminal(status *kubecontainer.PodStatus) bool { } // UpdatePod carries a configuration change or termination state to a pod. 
A pod is either runnable, -// terminating, or terminated, and will transition to terminating if deleted on the apiserver, it is -// discovered to have a terminal phase (Succeeded or Failed), or if it is evicted by the kubelet. +// terminating, or terminated, and will transition to terminating if: deleted on the apiserver, +// discovered to have a terminal phase (Succeeded or Failed), or evicted by the kubelet. func (p *podWorkers) UpdatePod(options UpdatePodOptions) { - // handle when the pod is an orphan (no config) and we only have runtime status by running only - // the terminating part of the lifecycle - pod := options.Pod + // Handle when the pod is an orphan (no config) and we only have runtime status by running only + // the terminating part of the lifecycle. A running pod contains only a minimal set of information + // about the pod var isRuntimePod bool - if options.RunningPod != nil { + var uid types.UID + var name, ns string + if runningPod := options.RunningPod; runningPod != nil { if options.Pod == nil { - pod = options.RunningPod.ToAPIPod() + // the sythetic pod created here is used only as a placeholder and not tracked if options.UpdateType != kubetypes.SyncPodKill { - klog.InfoS("Pod update is ignored, runtime pods can only be killed", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.InfoS("Pod update is ignored, runtime pods can only be killed", "pod", klog.KRef(runningPod.Namespace, runningPod.Name), "podUID", runningPod.ID, "updateType", options.UpdateType) return } - options.Pod = pod + uid, ns, name = runningPod.ID, runningPod.Namespace, runningPod.Name isRuntimePod = true } else { options.RunningPod = nil - klog.InfoS("Pod update included RunningPod which is only valid when Pod is not specified", "pod", klog.KObj(options.Pod), "podUID", options.Pod.UID) + uid, ns, name = options.Pod.UID, options.Pod.Namespace, options.Pod.Name + klog.InfoS("Pod update included RunningPod which is only valid when Pod is not specified", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType) } + } else { + uid, ns, name = options.Pod.UID, options.Pod.Namespace, options.Pod.Name } - uid := pod.UID p.podLock.Lock() defer p.podLock.Unlock() // decide what to do with this pod - we are either setting it up, tearing it down, or ignoring it - now := time.Now() + var firstTime bool + now := p.clock.Now() status, ok := p.podSyncStatuses[uid] if !ok { - klog.V(4).InfoS("Pod is being synced for the first time", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.V(4).InfoS("Pod is being synced for the first time", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType) + firstTime = true status = &podSyncStatus{ syncedAt: now, - fullname: kubecontainer.GetPodFullName(pod), + fullname: kubecontainer.BuildPodFullName(name, ns), } // if this pod is being synced for the first time, we need to make sure it is an active pod - if !isRuntimePod && (pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded) { + if options.Pod != nil && (options.Pod.Status.Phase == v1.PodFailed || options.Pod.Status.Phase == v1.PodSucceeded) { // check to see if the pod is not running and the pod is terminal. // If this succeeds then record in the podWorker that it is terminated. 
- if statusCache, err := p.podCache.Get(pod.UID); err == nil { + if statusCache, err := p.podCache.Get(uid); err == nil { if isPodStatusCacheTerminal(statusCache) { status = &podSyncStatus{ terminatedAt: now, @@ -599,7 +760,7 @@ func (p *podWorkers) UpdatePod(options UpdatePodOptions) { syncedAt: now, startedTerminating: true, finished: true, - fullname: kubecontainer.GetPodFullName(pod), + fullname: kubecontainer.BuildPodFullName(name, ns), } } } @@ -607,22 +768,46 @@ func (p *podWorkers) UpdatePod(options UpdatePodOptions) { p.podSyncStatuses[uid] = status } - // if an update is received that implies the pod should be running, but we are already terminating a pod by - // that UID, assume that two pods with the same UID were created in close temporal proximity (usually static - // pod but it's possible for an apiserver to extremely rarely do something similar) - flag the sync status - // to indicate that after the pod terminates it should be reset to "not running" to allow a subsequent add/update - // to start the pod worker again - if status.IsTerminationRequested() { + // RunningPods represent an unknown pod execution and don't contain pod spec information + // sufficient to perform any action other than termination. If we received a RunningPod + // after a real pod has already been provided, use the most recent spec instead. Also, + // once we observe a runtime pod we must drive it to completion, even if we weren't the + // ones who started it. + pod := options.Pod + if isRuntimePod { + status.observedRuntime = true + switch { + case status.pendingUpdate != nil && status.pendingUpdate.Pod != nil: + pod = status.pendingUpdate.Pod + options.Pod = pod + options.RunningPod = nil + case status.activeUpdate != nil && status.activeUpdate.Pod != nil: + pod = status.activeUpdate.Pod + options.Pod = pod + options.RunningPod = nil + default: + // we will continue to use RunningPod.ToAPIPod() as pod here, but + // options.Pod will be nil and other methods must handle that appropriately. + pod = options.RunningPod.ToAPIPod() + } + } + + // When we see a create update on an already terminating pod, that implies two pods with the same UID were created in + // close temporal proximity (usually static pod but it's possible for an apiserver to extremely rarely do something + // similar) - flag the sync status to indicate that after the pod terminates it should be reset to "not running" to + // allow a subsequent add/update to start the pod worker again. This does not apply to the first time we see a pod, + // such as when the kubelet restarts and we see already terminated pods for the first time. 
+ if !firstTime && status.IsTerminationRequested() { if options.UpdateType == kubetypes.SyncPodCreate { status.restartRequested = true - klog.V(4).InfoS("Pod is terminating but has been requested to restart with same UID, will be reconciled later", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.V(4).InfoS("Pod is terminating but has been requested to restart with same UID, will be reconciled later", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType) return } } // once a pod is terminated by UID, it cannot reenter the pod worker (until the UID is purged by housekeeping) if status.IsFinished() { - klog.V(4).InfoS("Pod is finished processing, no further updates", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.V(4).InfoS("Pod is finished processing, no further updates", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType) return } @@ -631,25 +816,25 @@ func (p *podWorkers) UpdatePod(options UpdatePodOptions) { if !status.IsTerminationRequested() { switch { case isRuntimePod: - klog.V(4).InfoS("Pod is orphaned and must be torn down", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.V(4).InfoS("Pod is orphaned and must be torn down", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType) status.deleted = true status.terminatingAt = now becameTerminating = true case pod.DeletionTimestamp != nil: - klog.V(4).InfoS("Pod is marked for graceful deletion, begin teardown", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.V(4).InfoS("Pod is marked for graceful deletion, begin teardown", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType) status.deleted = true status.terminatingAt = now becameTerminating = true case pod.Status.Phase == v1.PodFailed, pod.Status.Phase == v1.PodSucceeded: - klog.V(4).InfoS("Pod is in a terminal phase (success/failed), begin teardown", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.V(4).InfoS("Pod is in a terminal phase (success/failed), begin teardown", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType) status.terminatingAt = now becameTerminating = true case options.UpdateType == kubetypes.SyncPodKill: if options.KillPodOptions != nil && options.KillPodOptions.Evict { - klog.V(4).InfoS("Pod is being evicted by the kubelet, begin teardown", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.V(4).InfoS("Pod is being evicted by the kubelet, begin teardown", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType) status.evicted = true } else { - klog.V(4).InfoS("Pod is being removed by the kubelet, begin teardown", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.V(4).InfoS("Pod is being removed by the kubelet, begin teardown", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType) } status.terminatingAt = now becameTerminating = true @@ -657,7 +842,6 @@ func (p *podWorkers) UpdatePod(options UpdatePodOptions) { } // once a pod is terminating, all updates are kills and the grace period can only decrease - var workType PodWorkType var wasGracePeriodShortened bool switch { case status.IsTerminated(): @@ -665,12 +849,10 @@ func (p *podWorkers) UpdatePod(options UpdatePodOptions) { // due to housekeeping seeing an older cached version of the runtime pod simply ignore it until // after the pod worker completes. 
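The chain of cases above (orphaned runtime pod, API deletion, terminal phase, kill or eviction) is what moves a worker into the terminating state. A standalone sketch of that decision, with illustrative stand-in types and reason strings rather than the kubelet's:

package main

import (
	"fmt"
	"time"
)

type phase string

const (
	phaseRunning   phase = "Running"
	phaseSucceeded phase = "Succeeded"
	phaseFailed    phase = "Failed"
)

type update struct {
	runtimeOnly       bool       // orphaned runtime pod with no config
	deletionTimestamp *time.Time // set when the API object was deleted
	podPhase          phase
	isKill            bool // SyncPodKill-style update
	evict             bool
}

// shouldBeginTeardown mirrors the decision points above: orphaned runtime pods,
// API deletion, terminal phases, and explicit kill or evict requests all move
// the worker into the terminating state.
func shouldBeginTeardown(u update) (teardown bool, reason string) {
	switch {
	case u.runtimeOnly:
		return true, "orphaned runtime pod"
	case u.deletionTimestamp != nil:
		return true, "graceful deletion requested"
	case u.podPhase == phaseSucceeded || u.podPhase == phaseFailed:
		return true, "terminal phase"
	case u.isKill && u.evict:
		return true, "evicted by the kubelet"
	case u.isKill:
		return true, "removed by the kubelet"
	}
	return false, ""
}

func main() {
	now := time.Now()
	fmt.Println(shouldBeginTeardown(update{deletionTimestamp: &now})) // true graceful deletion requested
}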
if isRuntimePod { - klog.V(3).InfoS("Pod is waiting for termination, ignoring runtime-only kill until after pod worker is fully terminated", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.V(3).InfoS("Pod is waiting for termination, ignoring runtime-only kill until after pod worker is fully terminated", "pod", klog.KRef(ns, name), "podUID", uid, "updateType", options.UpdateType) return } - workType = TerminatedPodWork - if options.KillPodOptions != nil { if ch := options.KillPodOptions.CompletedCh; ch != nil { close(ch) @@ -679,7 +861,6 @@ func (p *podWorkers) UpdatePod(options UpdatePodOptions) { options.KillPodOptions = nil case status.IsTerminationRequested(): - workType = TerminatingPodWork if options.KillPodOptions == nil { options.KillPodOptions = &KillPodOptions{} } @@ -700,8 +881,6 @@ func (p *podWorkers) UpdatePod(options UpdatePodOptions) { options.KillPodOptions.PodTerminationGracePeriodSecondsOverride = &gracePeriod default: - workType = SyncPodWork - // KillPodOptions is not valid for sync actions outside of the terminating phase if options.KillPodOptions != nil { if ch := options.KillPodOptions.CompletedCh; ch != nil { @@ -711,20 +890,11 @@ func (p *podWorkers) UpdatePod(options UpdatePodOptions) { } } - // the desired work we want to be performing - work := podWork{ - WorkType: workType, - Options: options, - } - // start the pod worker goroutine if it doesn't exist podUpdates, exists := p.podUpdates[uid] if !exists { - // We need to have a buffer here, because checkForUpdates() method that - // puts an update into channel is called from the same goroutine where - // the channel is consumed. However, it is guaranteed that in such case - // the channel is empty, so buffer of size 1 is enough. - podUpdates = make(chan podWork, 1) + // buffer the channel to avoid blocking this method + podUpdates = make(chan struct{}, 1) p.podUpdates[uid] = podUpdates // ensure that static pods start in the order they are received by UpdatePod @@ -734,45 +904,40 @@ func (p *podWorkers) UpdatePod(options UpdatePodOptions) { } // allow testing of delays in the pod update channel - var outCh <-chan podWork + var outCh <-chan struct{} if p.workerChannelFn != nil { outCh = p.workerChannelFn(uid, podUpdates) } else { outCh = podUpdates } - // Creating a new pod worker either means this is a new pod, or that the - // kubelet just restarted. In either case the kubelet is willing to believe - // the status of the pod for the first pod worker sync. See corresponding - // comment in syncPod. 
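The make(chan struct{}, 1) channel above is a coalescing wake-up: only a signal is sent, not the update itself, and the send is non-blocking, so repeated UpdatePod calls collapse into a single pending notification that the worker consumes together with whatever pendingUpdate holds at that moment. A minimal standalone demonstration of the pattern (not kubelet code):

package main

import (
	"fmt"
	"sync"
)

// notify coalesces wake-ups: if a signal is already pending, the send is
// dropped and the sender never blocks.
func notify(ch chan struct{}) {
	select {
	case ch <- struct{}{}:
	default: // a signal is already pending; the worker will see the latest state anyway
	}
}

func main() {
	updates := make(chan struct{}, 1)
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		for range updates {
			// a real worker would read the pending state under a lock here
			fmt.Println("worker woke up")
		}
	}()

	// back-to-back notifications may coalesce into fewer wake-ups
	notify(updates)
	notify(updates)
	notify(updates)
	close(updates)
	wg.Wait()
}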
+ // spawn a pod worker go func() { + // TODO: this should be a wait.Until with backoff to handle panics, and + // accept a context for shutdown defer runtime.HandleCrash() - p.managePodLoop(outCh) + defer klog.V(3).InfoS("Pod worker has stopped", "podUID", uid) + p.podWorkerLoop(uid, outCh) }() } - // dispatch a request to the pod worker if none are running - if !status.IsWorking() { - status.working = true - podUpdates <- work - return + // measure the maximum latency between a call to UpdatePod and when the pod worker reacts to it + // by preserving the oldest StartTime + if status.pendingUpdate != nil && !status.pendingUpdate.StartTime.IsZero() && status.pendingUpdate.StartTime.Before(options.StartTime) { + options.StartTime = status.pendingUpdate.StartTime } - // capture the maximum latency between a requested update and when the pod - // worker observes it - if undelivered, ok := p.lastUndeliveredWorkUpdate[pod.UID]; ok { - // track the max latency between when a config change is requested and when it is realized - // NOTE: this undercounts the latency when multiple requests are queued, but captures max latency - if !undelivered.Options.StartTime.IsZero() && undelivered.Options.StartTime.Before(work.Options.StartTime) { - work.Options.StartTime = undelivered.Options.StartTime - } + // notify the pod worker there is a pending update + status.pendingUpdate = &options + status.working = true + klog.V(4).InfoS("Notifying pod of pending update", "pod", klog.KRef(ns, name), "podUID", uid, "workType", status.WorkType()) + select { + case podUpdates <- struct{}{}: + default: } - // always sync the most recent data - p.lastUndeliveredWorkUpdate[pod.UID] = work - if (becameTerminating || wasGracePeriodShortened) && status.cancelFn != nil { - klog.V(3).InfoS("Cancelling current pod sync", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", work.WorkType) + klog.V(3).InfoS("Cancelling current pod sync", "pod", klog.KRef(ns, name), "podUID", uid, "workType", status.WorkType()) status.cancelFn() return } @@ -813,15 +978,13 @@ func calculateEffectiveGracePeriod(status *podSyncStatus, pod *v1.Pod, options * // allowPodStart tries to start the pod and returns true if allowed, otherwise // it requeues the pod and returns false. If the pod will never be able to start // because data is missing, or the pod was terminated before start, canEverStart -// is false. +// is false. This method can only be called while holding the pod lock. func (p *podWorkers) allowPodStart(pod *v1.Pod) (canStart bool, canEverStart bool) { if !kubetypes.IsStaticPod(pod) { // TODO: Do we want to allow non-static pods with the same full name? // Note that it may disable the force deletion of pods. 
return true, true } - p.podLock.Lock() - defer p.podLock.Unlock() status, ok := p.podSyncStatuses[pod.UID] if !ok { klog.ErrorS(nil, "Pod sync status does not exist, the worker should not be running", "pod", klog.KObj(pod), "podUID", pod.UID) @@ -832,7 +995,6 @@ func (p *podWorkers) allowPodStart(pod *v1.Pod) (canStart bool, canEverStart boo } if !p.allowStaticPodStart(status.fullname, pod.UID) { p.workQueue.Enqueue(pod.UID, wait.Jitter(p.backOffPeriod, workerBackOffPeriodJitterFactor)) - status.working = false return false, true } return true, true @@ -874,32 +1036,156 @@ func (p *podWorkers) allowStaticPodStart(fullname string, uid types.UID) bool { return true } -func (p *podWorkers) managePodLoop(podUpdates <-chan podWork) { - var lastSyncTime time.Time - var podStarted bool - for update := range podUpdates { - pod := update.Options.Pod +// cleanupUnstartedPod is invoked if a pod that has never been started receives a termination +// signal before it can be started. This method must be called holding the pod lock. +func (p *podWorkers) cleanupUnstartedPod(pod *v1.Pod, status *podSyncStatus) { + p.cleanupPodUpdates(pod.UID) - // Decide whether to start the pod. If the pod was terminated prior to the pod being allowed - // to start, we have to clean it up and then exit the pod worker loop. - if !podStarted { - canStart, canEverStart := p.allowPodStart(pod) - if !canEverStart { - p.completeUnstartedTerminated(pod) - if start := update.Options.StartTime; !start.IsZero() { - metrics.PodWorkerDuration.WithLabelValues("terminated").Observe(metrics.SinceInSeconds(start)) - } - klog.V(4).InfoS("Processing pod event done", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType) - return - } - if !canStart { - klog.V(4).InfoS("Pod cannot start yet", "pod", klog.KObj(pod), "podUID", pod.UID) - continue - } - podStarted = true + if status.terminatingAt.IsZero() { + klog.V(4).InfoS("Pod worker is complete but did not have terminatingAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID) + } + if !status.terminatedAt.IsZero() { + klog.V(4).InfoS("Pod worker is complete and had terminatedAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID) + } + status.finished = true + status.working = false + status.terminatedAt = p.clock.Now() + + if p.startedStaticPodsByFullname[status.fullname] == pod.UID { + delete(p.startedStaticPodsByFullname, status.fullname) + } +} + +// startPodSync is invoked by each pod worker goroutine when a message arrives on the pod update channel. +// This method consumes a pending update, initializes a context, decides whether the pod is already started +// or can be started, and updates the cached pod state so that downstream components can observe what the +// pod worker goroutine is currently attempting to do. If ok is false, there is no available event. If any +// of the boolean values is false, ensure the appropriate cleanup happens before returning. 
+func (p *podWorkers) startPodSync(podUID types.UID) (ctx context.Context, update podWork, canStart, canEverStart, ok bool) { + p.podLock.Lock() + defer p.podLock.Unlock() + + // verify we are known to the pod worker still + status, ok := p.podSyncStatuses[podUID] + if !ok { + // pod status has disappeared, the worker should exit + klog.V(4).InfoS("Pod worker no longer has status, worker should exit", "podUID", podUID) + return nil, update, false, false, false + } + if !status.working { + // working is used by unit tests to observe whether a worker is currently acting on this pod + klog.V(4).InfoS("Pod should be marked as working by the pod worker, programmer error", "podUID", podUID) + } + if status.pendingUpdate == nil { + // no update available, this means we were queued without work being added or there is a + // race condition, both of which are unexpected + status.working = false + klog.V(4).InfoS("Pod worker received no pending work, programmer error?", "podUID", podUID) + return nil, update, false, false, false + } + + // consume the pending update + update.WorkType = status.WorkType() + update.Options = *status.pendingUpdate + status.pendingUpdate = nil + select { + case <-p.podUpdates[podUID]: + // ensure the pod update channel is empty (it is only ever written to under lock) + default: + } + + // initialize a context for the worker if one does not exist + if status.ctx == nil || status.ctx.Err() == context.Canceled { + status.ctx, status.cancelFn = context.WithCancel(context.Background()) + } + ctx = status.ctx + + // if we are already started, make our state visible to downstream components + if status.IsStarted() { + status.mergeLastUpdate(update.Options) + return ctx, update, true, true, true + } + + // if we are already terminating and we only have a running pod, allow the worker + // to "start" since we are immediately moving to terminating + if update.Options.RunningPod != nil && update.WorkType == TerminatingPod { + status.mergeLastUpdate(update.Options) + return ctx, update, true, true, true + } + + // If we receive an update where Pod is nil (running pod is set) but haven't + // started yet, we can only terminate the pod, not start it. We should not be + // asked to start such a pod, but guard here just in case an accident occurs. 
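startPodSync reuses the per-pod context across syncs and only replaces it after a cancellation (for example, when the grace period was shortened mid-sync). A small standalone sketch of that reuse-or-reset check, with a stand-in type:

package main

import (
	"context"
	"fmt"
)

// workerState holds a per-pod context that outlives individual syncs; a
// cancelled context is replaced lazily the next time work arrives, mirroring
// the check in startPodSync above. Simplified stand-in, not the kubelet type.
type workerState struct {
	ctx      context.Context
	cancelFn context.CancelFunc
}

// ensureContext reuses the existing context unless it has been cancelled.
func (w *workerState) ensureContext() context.Context {
	if w.ctx == nil || w.ctx.Err() == context.Canceled {
		w.ctx, w.cancelFn = context.WithCancel(context.Background())
	}
	return w.ctx
}

func main() {
	w := &workerState{}
	first := w.ensureContext()
	w.cancelFn() // e.g. the grace period was shortened mid-sync
	second := w.ensureContext()
	fmt.Println(first != second, second.Err() == nil) // true true
}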
+ if update.Options.Pod == nil { + status.mergeLastUpdate(update.Options) + klog.V(4).InfoS("Running pod cannot start ever, programmer error", "pod", klog.KObj(update.Options.Pod), "podUID", podUID, "updateType", update.WorkType) + return ctx, update, false, false, true + } + + // verify we can start + canStart, canEverStart = p.allowPodStart(update.Options.Pod) + switch { + case !canEverStart: + p.cleanupUnstartedPod(update.Options.Pod, status) + status.working = false + if start := update.Options.StartTime; !start.IsZero() { + metrics.PodWorkerDuration.WithLabelValues("terminated").Observe(metrics.SinceInSeconds(start)) + } + klog.V(4).InfoS("Pod cannot start ever", "pod", klog.KObj(update.Options.Pod), "podUID", podUID, "updateType", update.WorkType) + return ctx, update, canStart, canEverStart, true + case !canStart: + status.working = false + klog.V(4).InfoS("Pod cannot start yet", "pod", klog.KObj(update.Options.Pod), "podUID", podUID) + return ctx, update, canStart, canEverStart, true + } + + // mark the pod as started + status.startedAt = p.clock.Now() + status.mergeLastUpdate(update.Options) + + return ctx, update, true, true, true +} + +func podUIDAndRefForUpdate(update UpdatePodOptions) (types.UID, klog.ObjectRef) { + if update.RunningPod != nil { + return update.RunningPod.ID, klog.KObj(update.RunningPod.ToAPIPod()) + } + return update.Pod.UID, klog.KObj(update.Pod) +} + +// podWorkerLoop manages sequential state updates to a pod in a goroutine, exiting once the final +// state is reached. The loop is responsible for driving the pod through four main phases: +// +// 1. Wait to start, guaranteeing no two pods with the same UID or same fullname are running at the same time +// 2. Sync, orchestrating pod setup by reconciling the desired pod spec with the runtime state of the pod +// 3. Terminating, ensuring all running containers in the pod are stopped +// 4. Terminated, cleaning up any resources that must be released before the pod can be deleted +// +// The podWorkerLoop is driven by updates delivered to UpdatePod and by SyncKnownPods. If a particular +// sync method fails, p.workerQueue is updated with backoff but it is the responsibility of the kubelet +// to trigger new UpdatePod calls. SyncKnownPods will only retry pods that are no longer known to the +// caller. When a pod transitions working->terminating or terminating->terminated, the next update is +// queued immediately and no kubelet action is required. +func (p *podWorkers) podWorkerLoop(podUID types.UID, podUpdates <-chan struct{}) { + var lastSyncTime time.Time + for range podUpdates { + ctx, update, canStart, canEverStart, ok := p.startPodSync(podUID) + // If we had no update waiting, it means someone initialized the channel without filling out pendingUpdate. + if !ok { + continue + } + // If the pod was terminated prior to the pod being allowed to start, we exit the loop. + if !canEverStart { + return + } + // If the pod is not yet ready to start, continue and wait for more updates. 
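The four phases described in the podWorkerLoop comment above can be read as a small state machine driven by wake-up signals. The following is a drastically simplified, standalone skeleton with hard-coded transitions and placeholder work types; it only illustrates the shape of the loop, not the kubelet's behavior:

package main

import "fmt"

type workType int

const (
	syncWork workType = iota
	terminatingWork
	terminatedWork
)

type pendingWork struct{ kind workType }

// fetchPending stands in for startPodSync: consume the queued update and
// decide what phase to run. Hard-coded transitions for illustration.
func fetchPending(step int) (pendingWork, bool) {
	kinds := []workType{syncWork, terminatingWork, terminatedWork}
	if step >= len(kinds) {
		return pendingWork{}, false
	}
	return pendingWork{kind: kinds[step]}, true
}

// workerLoop drains a signal channel and dispatches on the current work type,
// exiting once the terminated phase has been processed.
func workerLoop(signals <-chan struct{}) {
	step := 0
	for range signals {
		work, ok := fetchPending(step)
		if !ok {
			continue
		}
		step++
		switch work.kind {
		case syncWork:
			fmt.Println("sync: reconcile spec with runtime")
		case terminatingWork:
			fmt.Println("terminating: stop containers")
		case terminatedWork:
			fmt.Println("terminated: release resources")
			return
		}
	}
}

func main() {
	signals := make(chan struct{}, 1)
	done := make(chan struct{})
	go func() { defer close(done); workerLoop(signals) }()
	for i := 0; i < 3; i++ {
		// in the kubelet the next signal comes from UpdatePod or a requeue
		signals <- struct{}{}
	}
	<-done
}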
+ if !canStart { + continue } - klog.V(4).InfoS("Processing pod event", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType) + podUID, podRef := podUIDAndRefForUpdate(update.Options) + + klog.V(4).InfoS("Processing pod event", "pod", podRef, "podUID", podUID, "updateType", update.WorkType) var isTerminal bool err := func() error { // The worker is responsible for ensuring the sync method sees the appropriate @@ -911,7 +1197,8 @@ func (p *podWorkers) managePodLoop(podUpdates <-chan podWork) { var err error switch { case update.Options.RunningPod != nil: - // when we receive a running pod, we don't need status at all + // when we receive a running pod, we don't need status at all because we are + // guaranteed to be terminating and we skip updates to the pod default: // wait until we see the next refresh from the PLEG via the cache (max 2s) // TODO: this adds ~1s of latency on all transitions from sync to terminating @@ -921,36 +1208,40 @@ func (p *podWorkers) managePodLoop(podUpdates <-chan podWork) { // Improving this latency also reduces the possibility that a terminated // container's status is garbage collected before we have a chance to update the // API server (thus losing the exit code). - status, err = p.podCache.GetNewerThan(pod.UID, lastSyncTime) - } - if err != nil { - // This is the legacy event thrown by manage pod loop all other events are now dispatched - // from syncPodFn - p.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedSync, "error determining status: %v", err) - return err - } + status, err = p.podCache.GetNewerThan(update.Options.Pod.UID, lastSyncTime) - ctx := p.contextForWorker(pod.UID) + if err != nil { + // This is the legacy event thrown by manage pod loop all other events are now dispatched + // from syncPodFn + p.recorder.Eventf(update.Options.Pod, v1.EventTypeWarning, events.FailedSync, "error determining status: %v", err) + return err + } + } // Take the appropriate action (illegal phases are prevented by UpdatePod) switch { - case update.WorkType == TerminatedPodWork: - err = p.syncTerminatedPodFn(ctx, pod, status) + case update.WorkType == TerminatedPod: + err = p.podSyncer.SyncTerminatedPod(ctx, update.Options.Pod, status) - case update.WorkType == TerminatingPodWork: + case update.WorkType == TerminatingPod: var gracePeriod *int64 if opt := update.Options.KillPodOptions; opt != nil { gracePeriod = opt.PodTerminationGracePeriodSecondsOverride } - podStatusFn := p.acknowledgeTerminating(pod) + podStatusFn := p.acknowledgeTerminating(podUID) - err = p.syncTerminatingPodFn(ctx, pod, status, update.Options.RunningPod, gracePeriod, podStatusFn) + // if we only have a running pod, terminate it directly + if update.Options.RunningPod != nil { + err = p.podSyncer.SyncTerminatingRuntimePod(ctx, update.Options.RunningPod) + } else { + err = p.podSyncer.SyncTerminatingPod(ctx, update.Options.Pod, status, gracePeriod, podStatusFn) + } default: - isTerminal, err = p.syncPodFn(ctx, update.Options.UpdateType, pod, update.Options.MirrorPod, status) + isTerminal, err = p.podSyncer.SyncPod(ctx, update.Options.UpdateType, update.Options.Pod, update.Options.MirrorPod, status) } - lastSyncTime = time.Now() + lastSyncTime = p.clock.Now() return err }() @@ -958,65 +1249,65 @@ func (p *podWorkers) managePodLoop(podUpdates <-chan podWork) { switch { case err == context.Canceled: // when the context is cancelled we expect an update to already be queued - klog.V(2).InfoS("Sync exited with context cancellation error", "pod", klog.KObj(pod), "podUID", 
pod.UID, "updateType", update.WorkType) + klog.V(2).InfoS("Sync exited with context cancellation error", "pod", podRef, "podUID", podUID, "updateType", update.WorkType) case err != nil: // we will queue a retry - klog.ErrorS(err, "Error syncing pod, skipping", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.ErrorS(err, "Error syncing pod, skipping", "pod", podRef, "podUID", podUID) - case update.WorkType == TerminatedPodWork: + case update.WorkType == TerminatedPod: // we can shut down the worker - p.completeTerminated(pod) + p.completeTerminated(podUID) if start := update.Options.StartTime; !start.IsZero() { metrics.PodWorkerDuration.WithLabelValues("terminated").Observe(metrics.SinceInSeconds(start)) } - klog.V(4).InfoS("Processing pod event done", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType) + klog.V(4).InfoS("Processing pod event done", "pod", podRef, "podUID", podUID, "updateType", update.WorkType) return - case update.WorkType == TerminatingPodWork: - // pods that don't exist in config don't need to be terminated, garbage collection will cover them + case update.WorkType == TerminatingPod: + // pods that don't exist in config don't need to be terminated, other loops will clean them up if update.Options.RunningPod != nil { - p.completeTerminatingRuntimePod(pod) + p.completeTerminatingRuntimePod(podUID) if start := update.Options.StartTime; !start.IsZero() { metrics.PodWorkerDuration.WithLabelValues(update.Options.UpdateType.String()).Observe(metrics.SinceInSeconds(start)) } - klog.V(4).InfoS("Processing pod event done", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType) + klog.V(4).InfoS("Processing pod event done", "pod", podRef, "podUID", podUID, "updateType", update.WorkType) return } // otherwise we move to the terminating phase - p.completeTerminating(pod) + p.completeTerminating(podUID) phaseTransition = true case isTerminal: // if syncPod indicated we are now terminal, set the appropriate pod status to move to terminating - klog.V(4).InfoS("Pod is terminal", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType) - p.completeSync(pod) + klog.V(4).InfoS("Pod is terminal", "pod", podRef, "podUID", podUID, "updateType", update.WorkType) + p.completeSync(podUID) phaseTransition = true } // queue a retry if necessary, then put the next event in the channel if any - p.completeWork(pod, phaseTransition, err) + p.completeWork(podUID, phaseTransition, err) if start := update.Options.StartTime; !start.IsZero() { metrics.PodWorkerDuration.WithLabelValues(update.Options.UpdateType.String()).Observe(metrics.SinceInSeconds(start)) } - klog.V(4).InfoS("Processing pod event done", "pod", klog.KObj(pod), "podUID", pod.UID, "updateType", update.WorkType) + klog.V(4).InfoS("Processing pod event done", "pod", podRef, "podUID", podUID, "updateType", update.WorkType) } } // acknowledgeTerminating sets the terminating flag on the pod status once the pod worker sees // the termination state so that other components know no new containers will be started in this // pod. It then returns the status function, if any, that applies to this pod. 
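The PodWorkerDuration observations in the loop above use the StartTime preserved by UpdatePod; because coalesced updates keep the oldest StartTime, the observed duration is a worst-case latency from the original request to completion of the pass. A standalone sketch of that bookkeeping, with a print standing in for the metric:

package main

import (
	"fmt"
	"time"
)

// oldest returns the earlier of two request timestamps; keeping the oldest
// across coalesced updates makes the observed duration a worst-case latency.
func oldest(pending, incoming time.Time) time.Time {
	if !pending.IsZero() && pending.Before(incoming) {
		return pending
	}
	return incoming
}

// observeWorkerDuration stands in for the PodWorkerDuration metric observation.
func observeWorkerDuration(updateType string, start time.Time) {
	if start.IsZero() {
		return
	}
	fmt.Printf("pod_worker_duration{update_type=%q} observed %v\n", updateType, time.Since(start))
}

func main() {
	firstRequest := time.Now().Add(-500 * time.Millisecond) // queued earlier, never delivered
	secondRequest := time.Now()
	start := oldest(firstRequest, secondRequest)
	observeWorkerDuration("create", start)
}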
-func (p *podWorkers) acknowledgeTerminating(pod *v1.Pod) PodStatusFunc { +func (p *podWorkers) acknowledgeTerminating(podUID types.UID) PodStatusFunc { p.podLock.Lock() defer p.podLock.Unlock() - status, ok := p.podSyncStatuses[pod.UID] + status, ok := p.podSyncStatuses[podUID] if !ok { return nil } if !status.terminatingAt.IsZero() && !status.startedTerminating { - klog.V(4).InfoS("Pod worker has observed request to terminate", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.V(4).InfoS("Pod worker has observed request to terminate", "podUID", podUID) status.startedTerminating = true } @@ -1030,196 +1321,168 @@ func (p *podWorkers) acknowledgeTerminating(pod *v1.Pod) PodStatusFunc { // be terminated. This happens when the natural pod lifecycle completes - any pod which is not RestartAlways // exits. Unnatural completions, such as evictions, API driven deletion or phase transition, are handled by // UpdatePod. -func (p *podWorkers) completeSync(pod *v1.Pod) { +func (p *podWorkers) completeSync(podUID types.UID) { p.podLock.Lock() defer p.podLock.Unlock() - klog.V(4).InfoS("Pod indicated lifecycle completed naturally and should now terminate", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.V(4).InfoS("Pod indicated lifecycle completed naturally and should now terminate", "podUID", podUID) - if status, ok := p.podSyncStatuses[pod.UID]; ok { - if status.terminatingAt.IsZero() { - status.terminatingAt = time.Now() - } else { - klog.V(4).InfoS("Pod worker attempted to set terminatingAt twice, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID) - } - status.startedTerminating = true + status, ok := p.podSyncStatuses[podUID] + if !ok { + klog.V(4).InfoS("Pod had no status in completeSync, programmer error?", "podUID", podUID) + return } - p.lastUndeliveredWorkUpdate[pod.UID] = podWork{ - WorkType: TerminatingPodWork, - Options: UpdatePodOptions{ - Pod: pod, - }, + // update the status of the pod + if status.terminatingAt.IsZero() { + status.terminatingAt = p.clock.Now() + } else { + klog.V(4).InfoS("Pod worker attempted to set terminatingAt twice, likely programmer error", "podUID", podUID) } + status.startedTerminating = true + + // the pod has now transitioned to terminating and we want to run syncTerminatingPod + // as soon as possible, so if no update is already waiting queue a synthetic update + p.requeueLastPodUpdate(podUID, status) } // completeTerminating is invoked when syncTerminatingPod completes successfully, which means // no container is running, no container will be started in the future, and we are ready for // cleanup. This updates the termination state which prevents future syncs and will ensure // other kubelet loops know this pod is not running any containers. 
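completeSync above relies on queueing a synthetic update so the terminating pass runs immediately rather than waiting for another kubelet-driven UpdatePod; the same idea appears later in this patch as requeueLastPodUpdate. A simplified, standalone version of that self-requeue with stand-in types:

package main

import "fmt"

// Simplified stand-ins for podSyncStatus and UpdatePodOptions.
type opts struct{ note string }

type syncState struct {
	pending *opts // update waiting to be consumed
	active  *opts // last update the worker executed
	working bool
}

// requeueLast replays the last executed update as a new pending update (if none
// is queued) and wakes the worker without blocking, so the next phase, such as
// terminating after completeSync, runs immediately.
func requeueLast(status *syncState, wake chan struct{}) {
	if status.pending != nil || status.active == nil {
		return
	}
	copied := *status.active
	status.pending = &copied
	status.working = true
	select {
	case wake <- struct{}{}:
	default:
	}
}

func main() {
	wake := make(chan struct{}, 1)
	status := &syncState{active: &opts{note: "last synced spec"}}
	requeueLast(status, wake)
	fmt.Println(status.pending.note, len(wake)) // last synced spec 1
}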
-func (p *podWorkers) completeTerminating(pod *v1.Pod) { +func (p *podWorkers) completeTerminating(podUID types.UID) { p.podLock.Lock() defer p.podLock.Unlock() - klog.V(4).InfoS("Pod terminated all containers successfully", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.V(4).InfoS("Pod terminated all containers successfully", "podUID", podUID) - if status, ok := p.podSyncStatuses[pod.UID]; ok { - if status.terminatingAt.IsZero() { - klog.V(4).InfoS("Pod worker was terminated but did not have terminatingAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID) - } - status.terminatedAt = time.Now() - for _, ch := range status.notifyPostTerminating { - close(ch) - } - status.notifyPostTerminating = nil - status.statusPostTerminating = nil + status, ok := p.podSyncStatuses[podUID] + if !ok { + return } - p.lastUndeliveredWorkUpdate[pod.UID] = podWork{ - WorkType: TerminatedPodWork, - Options: UpdatePodOptions{ - Pod: pod, - }, + // update the status of the pod + if status.terminatingAt.IsZero() { + klog.V(4).InfoS("Pod worker was terminated but did not have terminatingAt set, likely programmer error", "podUID", podUID) } + status.terminatedAt = p.clock.Now() + for _, ch := range status.notifyPostTerminating { + close(ch) + } + status.notifyPostTerminating = nil + status.statusPostTerminating = nil + + // the pod has now transitioned to terminated and we want to run syncTerminatedPod + // as soon as possible, so if no update is already waiting queue a synthetic update + p.requeueLastPodUpdate(podUID, status) } // completeTerminatingRuntimePod is invoked when syncTerminatingPod completes successfully, // which means an orphaned pod (no config) is terminated and we can exit. Since orphaned -// pods have no API representation, we want to exit the loop at this point -// cleanup. This updates the termination state which prevents future syncs and will ensure -// other kubelet loops know this pod is not running any containers. -func (p *podWorkers) completeTerminatingRuntimePod(pod *v1.Pod) { +// pods have no API representation, we want to exit the loop at this point and ensure no +// status is present afterwards - the running pod is truly terminated when this is invoked. 
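completeTerminating above releases everyone waiting in notifyPostTerminating by closing their channels, which unblocks every receiver exactly once. A standalone sketch of that close-to-broadcast pattern (illustrative names only):

package main

import (
	"fmt"
	"sync"
)

// completionBroadcaster collects waiters and releases them all at once by
// closing their channels: a closed channel unblocks every receiver.
type completionBroadcaster struct {
	mu      sync.Mutex
	waiters []chan struct{}
}

func (b *completionBroadcaster) wait() <-chan struct{} {
	b.mu.Lock()
	defer b.mu.Unlock()
	ch := make(chan struct{})
	b.waiters = append(b.waiters, ch)
	return ch
}

func (b *completionBroadcaster) complete() {
	b.mu.Lock()
	defer b.mu.Unlock()
	for _, ch := range b.waiters {
		close(ch)
	}
	b.waiters = nil
}

func main() {
	b := &completionBroadcaster{}
	done := b.wait()
	go b.complete()
	<-done
	fmt.Println("all containers stopped, waiter released")
}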
+func (p *podWorkers) completeTerminatingRuntimePod(podUID types.UID) { p.podLock.Lock() defer p.podLock.Unlock() - klog.V(4).InfoS("Pod terminated all orphaned containers successfully and worker can now stop", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.V(4).InfoS("Pod terminated all orphaned containers successfully and worker can now stop", "podUID", podUID) - if status, ok := p.podSyncStatuses[pod.UID]; ok { - if status.terminatingAt.IsZero() { - klog.V(4).InfoS("Pod worker was terminated but did not have terminatingAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID) - } - status.terminatedAt = time.Now() - status.finished = true - status.working = false + p.cleanupPodUpdates(podUID) - if p.startedStaticPodsByFullname[status.fullname] == pod.UID { - delete(p.startedStaticPodsByFullname, status.fullname) - } + status, ok := p.podSyncStatuses[podUID] + if !ok { + return + } + if status.terminatingAt.IsZero() { + klog.V(4).InfoS("Pod worker was terminated but did not have terminatingAt set, likely programmer error", "podUID", podUID) + } + status.terminatedAt = p.clock.Now() + status.finished = true + status.working = false + + if p.startedStaticPodsByFullname[status.fullname] == podUID { + delete(p.startedStaticPodsByFullname, status.fullname) } - p.cleanupPodUpdates(pod.UID) + // A runtime pod is transient and not part of the desired state - once it has reached + // terminated we can abandon tracking it. + delete(p.podSyncStatuses, podUID) } // completeTerminated is invoked after syncTerminatedPod completes successfully and means we // can stop the pod worker. The pod is finalized at this point. -func (p *podWorkers) completeTerminated(pod *v1.Pod) { +func (p *podWorkers) completeTerminated(podUID types.UID) { p.podLock.Lock() defer p.podLock.Unlock() - klog.V(4).InfoS("Pod is complete and the worker can now stop", "pod", klog.KObj(pod), "podUID", pod.UID) + klog.V(4).InfoS("Pod is complete and the worker can now stop", "podUID", podUID) - p.cleanupPodUpdates(pod.UID) + p.cleanupPodUpdates(podUID) - if status, ok := p.podSyncStatuses[pod.UID]; ok { - if status.terminatingAt.IsZero() { - klog.V(4).InfoS("Pod worker is complete but did not have terminatingAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID) - } - if status.terminatedAt.IsZero() { - klog.V(4).InfoS("Pod worker is complete but did not have terminatedAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID) - } - status.finished = true - status.working = false - - if p.startedStaticPodsByFullname[status.fullname] == pod.UID { - delete(p.startedStaticPodsByFullname, status.fullname) - } + status, ok := p.podSyncStatuses[podUID] + if !ok { + return } -} + if status.terminatingAt.IsZero() { + klog.V(4).InfoS("Pod worker is complete but did not have terminatingAt set, likely programmer error", "podUID", podUID) + } + if status.terminatedAt.IsZero() { + klog.V(4).InfoS("Pod worker is complete but did not have terminatedAt set, likely programmer error", "podUID", podUID) + } + status.finished = true + status.working = false -// completeUnstartedTerminated is invoked if a pod that has never been started receives a termination -// signal before it can be started. 
-func (p *podWorkers) completeUnstartedTerminated(pod *v1.Pod) { - p.podLock.Lock() - defer p.podLock.Unlock() - - klog.V(4).InfoS("Pod never started and the worker can now stop", "pod", klog.KObj(pod), "podUID", pod.UID) - - p.cleanupPodUpdates(pod.UID) - - if status, ok := p.podSyncStatuses[pod.UID]; ok { - if status.terminatingAt.IsZero() { - klog.V(4).InfoS("Pod worker is complete but did not have terminatingAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID) - } - if !status.terminatedAt.IsZero() { - klog.V(4).InfoS("Pod worker is complete and had terminatedAt set, likely programmer error", "pod", klog.KObj(pod), "podUID", pod.UID) - } - status.finished = true - status.working = false - status.terminatedAt = time.Now() - - if p.startedStaticPodsByFullname[status.fullname] == pod.UID { - delete(p.startedStaticPodsByFullname, status.fullname) - } + if p.startedStaticPodsByFullname[status.fullname] == podUID { + delete(p.startedStaticPodsByFullname, status.fullname) } } // completeWork requeues on error or the next sync interval and then immediately executes any pending // work. -func (p *podWorkers) completeWork(pod *v1.Pod, phaseTransition bool, syncErr error) { +func (p *podWorkers) completeWork(podUID types.UID, phaseTransition bool, syncErr error) { // Requeue the last update if the last sync returned error. switch { case phaseTransition: - p.workQueue.Enqueue(pod.UID, 0) + p.workQueue.Enqueue(podUID, 0) case syncErr == nil: // No error; requeue at the regular resync interval. - p.workQueue.Enqueue(pod.UID, wait.Jitter(p.resyncInterval, workerResyncIntervalJitterFactor)) + p.workQueue.Enqueue(podUID, wait.Jitter(p.resyncInterval, workerResyncIntervalJitterFactor)) case strings.Contains(syncErr.Error(), NetworkNotReadyErrorMsg): // Network is not ready; back off for short period of time and retry as network might be ready soon. - p.workQueue.Enqueue(pod.UID, wait.Jitter(backOffOnTransientErrorPeriod, workerBackOffPeriodJitterFactor)) + p.workQueue.Enqueue(podUID, wait.Jitter(backOffOnTransientErrorPeriod, workerBackOffPeriodJitterFactor)) default: // Error occurred during the sync; back off and then retry. - p.workQueue.Enqueue(pod.UID, wait.Jitter(p.backOffPeriod, workerBackOffPeriodJitterFactor)) + p.workQueue.Enqueue(podUID, wait.Jitter(p.backOffPeriod, workerBackOffPeriodJitterFactor)) } - p.completeWorkQueueNext(pod.UID) -} -// completeWorkQueueNext holds the lock and either queues the next work item for the worker or -// clears the working status. -func (p *podWorkers) completeWorkQueueNext(uid types.UID) { + // if there is a pending update for this worker, requeue immediately, otherwise + // clear working status p.podLock.Lock() defer p.podLock.Unlock() - if workUpdate, exists := p.lastUndeliveredWorkUpdate[uid]; exists { - p.podUpdates[uid] <- workUpdate - delete(p.lastUndeliveredWorkUpdate, uid) - } else { - p.podSyncStatuses[uid].working = false + if status, ok := p.podSyncStatuses[podUID]; ok { + if status.pendingUpdate != nil { + select { + case p.podUpdates[podUID] <- struct{}{}: + klog.V(4).InfoS("Requeueing pod due to pending update", "podUID", podUID) + default: + klog.V(4).InfoS("Pending update already queued", "podUID", podUID) + } + } else { + status.working = false + } } } -// contextForWorker returns or initializes the appropriate context for a known -// worker. If the current context is expired, it is reset. If no worker is -// present, no context is returned. 
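completeWork above picks a requeue delay from the outcome of the sync: immediate on a phase transition, the resync interval on success, a short retry for the network-not-ready case, and a longer backoff otherwise, each spread out with jitter. A standalone sketch of that decision; the intervals are illustrative rather than the kubelet's defaults, a local jitter helper stands in for wait.Jitter, and a sentinel error replaces the message match used in the real code:

package main

import (
	"errors"
	"fmt"
	"math/rand"
	"time"
)

var errNetworkNotReady = errors.New("network is not ready")

// jitter stretches the duration by a random factor in [0, maxFactor) so many
// pods do not resync in lockstep.
func jitter(d time.Duration, maxFactor float64) time.Duration {
	return d + time.Duration(rand.Float64()*maxFactor*float64(d))
}

// requeueDelay mirrors the completeWork decision in simplified form.
func requeueDelay(phaseTransition bool, syncErr error) time.Duration {
	const (
		resyncInterval = time.Minute
		transientRetry = 5 * time.Second
		backOffPeriod  = 10 * time.Second
	)
	switch {
	case phaseTransition:
		return 0
	case syncErr == nil:
		return jitter(resyncInterval, 0.5)
	case errors.Is(syncErr, errNetworkNotReady):
		return jitter(transientRetry, 0.5)
	default:
		return jitter(backOffPeriod, 0.5)
	}
}

func main() {
	fmt.Println(requeueDelay(true, nil))                 // 0s
	fmt.Println(requeueDelay(false, nil) >= time.Minute) // true
	fmt.Println(requeueDelay(false, errNetworkNotReady) < requeueDelay(false, errors.New("boom")))
}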
-func (p *podWorkers) contextForWorker(uid types.UID) context.Context {
-	p.podLock.Lock()
-	defer p.podLock.Unlock()
-
-	status, ok := p.podSyncStatuses[uid]
-	if !ok {
-		return nil
-	}
-	if status.ctx == nil || status.ctx.Err() == context.Canceled {
-		status.ctx, status.cancelFn = context.WithCancel(context.Background())
-	}
-	return status.ctx
-}
-
 // SyncKnownPods will purge any fully terminated pods that are not in the desiredPods
 // list, which means SyncKnownPods must be called in a threadsafe manner from calls
-// to UpdatePods for new pods. It returns a map of known workers that are not finished
-// with a value of SyncPodTerminated, SyncPodKill, or SyncPodSync depending on whether
-// the pod is terminated, terminating, or syncing.
-func (p *podWorkers) SyncKnownPods(desiredPods []*v1.Pod) map[types.UID]PodWorkerState {
-	workers := make(map[types.UID]PodWorkerState)
+// to UpdatePods for new pods. Because the pod worker is dependent on UpdatePod being
+// invoked to drive a pod's state machine, if a pod is missing in the desired list the
+// pod worker must be responsible for delivering that update. The method returns a map
+// of known workers that are not finished with a value of SyncPodTerminated,
+// SyncPodKill, or SyncPodSync depending on whether the pod is terminated, terminating,
+// or syncing.
+func (p *podWorkers) SyncKnownPods(desiredPods []*v1.Pod) map[types.UID]PodWorkerSync {
+	workers := make(map[types.UID]PodWorkerSync)
 	known := make(map[types.UID]struct{})
 	for _, pod := range desiredPods {
 		known[pod.UID] = struct{}{}
@@ -1230,21 +1493,38 @@ func (p *podWorkers) SyncKnownPods(desiredPods []*v1.Pod) map[types.UID]PodWorke
 	p.podsSynced = true

 	for uid, status := range p.podSyncStatuses {
-		if _, exists := known[uid]; !exists || status.restartRequested {
-			p.removeTerminatedWorker(uid)
-		}
-		switch {
-		case !status.terminatedAt.IsZero():
-			if status.restartRequested {
-				workers[uid] = TerminatedAndRecreatedPod
-			} else {
-				workers[uid] = TerminatedPod
+		// We retain the worker history of any pod that is still desired according to
+		// its UID. However, there are two scenarios during a sync that result in us
+		// needing to purge the history:
+		//
+		// 1. The pod is no longer desired (the local version is orphaned)
+		// 2. The pod received a kill update and then a subsequent create, which means
+		//    the UID was reused in the source config (vanishingly rare for API servers,
+		//    common for static pods that have specified a fixed UID)
+		//
+		// In the former case we wish to bound the amount of information we store for
+		// deleted pods. In the latter case we wish to minimize the amount of time before
+		// we restart the static pod. If we succeed at removing the worker, then we
+		// omit it from the returned map of known workers, and the caller of SyncKnownPods
+		// is expected to send a new UpdatePod({UpdateType: Create}).
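SyncKnownPods builds a set of desired UIDs and flags every tracked worker outside that set as an orphan, which is the cue for the pod worker to drive that pod to completion on its own. A standalone sketch of that classification, with a simplified stand-in for PodWorkerSync:

package main

import "fmt"

type uid string

// workerSync is a reduced stand-in for the PodWorkerSync value returned by
// SyncKnownPods; only the fields needed for this illustration are kept.
type workerSync struct {
	orphan    bool
	hasConfig bool
}

// classifyWorkers builds the "known" set from the desired pods and flags every
// tracked worker that is no longer desired as an orphan.
func classifyWorkers(desired []uid, tracked map[uid]bool) map[uid]workerSync {
	known := make(map[uid]struct{}, len(desired))
	for _, u := range desired {
		known[u] = struct{}{}
	}
	out := make(map[uid]workerSync, len(tracked))
	for u, hasConfig := range tracked {
		_, ok := known[u]
		out[u] = workerSync{orphan: !ok, hasConfig: hasConfig}
	}
	return out
}

func main() {
	tracked := map[uid]bool{"a": true, "b": true}
	result := classifyWorkers([]uid{"a"}, tracked)
	fmt.Printf("%+v\n", result["b"]) // {orphan:true hasConfig:true}
}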
+ _, knownPod := known[uid] + orphan := !knownPod + if status.restartRequested || orphan { + if p.removeTerminatedWorker(uid, status, orphan) { + // no worker running, we won't return it + continue } - case !status.terminatingAt.IsZero(): - workers[uid] = TerminatingPod - default: - workers[uid] = SyncPod } + + sync := PodWorkerSync{ + State: status.WorkType(), + Orphan: orphan, + } + if status.activeUpdate != nil && status.activeUpdate.Pod != nil { + sync.HasConfig = true + sync.Static = kubetypes.IsStaticPod(status.activeUpdate.Pod) + } + workers[uid] = sync } return workers } @@ -1252,18 +1532,50 @@ func (p *podWorkers) SyncKnownPods(desiredPods []*v1.Pod) map[types.UID]PodWorke // removeTerminatedWorker cleans up and removes the worker status for a worker // that has reached a terminal state of "finished" - has successfully exited // syncTerminatedPod. This "forgets" a pod by UID and allows another pod to be -// recreated with the same UID. -func (p *podWorkers) removeTerminatedWorker(uid types.UID) { - status, ok := p.podSyncStatuses[uid] - if !ok { - // already forgotten, or forgotten too early - klog.V(4).InfoS("Pod worker has been requested for removal but is not a known pod", "podUID", uid) - return - } - +// recreated with the same UID. The kubelet preserves state about recently +// terminated pods to prevent accidentally restarting a terminal pod, which is +// proportional to the number of pods described in the pod config. The method +// returns true if the worker was completely removed. +func (p *podWorkers) removeTerminatedWorker(uid types.UID, status *podSyncStatus, orphaned bool) bool { if !status.finished { - klog.V(4).InfoS("Pod worker has been requested for removal but is still not fully terminated", "podUID", uid) - return + // If the pod worker has not reached terminal state and the pod is still known, we wait. + if !orphaned { + klog.V(4).InfoS("Pod worker has been requested for removal but is still not fully terminated", "podUID", uid) + return false + } + + // all orphaned pods are considered deleted + status.deleted = true + + // When a pod is no longer in the desired set, the pod is considered orphaned and the + // the pod worker becomes responsible for driving the pod to completion (there is no + // guarantee another component will notify us of updates). + switch { + case !status.IsStarted() && !status.observedRuntime: + // The pod has not been started, which means we can safely clean up the pod - the + // pod worker will shutdown as a result of this change without executing a sync. + klog.V(4).InfoS("Pod is orphaned and has not been started", "podUID", uid) + case !status.IsTerminationRequested(): + // The pod has been started but termination has not been requested - set the appropriate + // timestamp and notify the pod worker. Because the pod has been synced at least once, + // the value of status.activeUpdate will be the fallback for the next sync. + status.terminatingAt = p.clock.Now() + if status.activeUpdate != nil && status.activeUpdate.Pod != nil { + status.gracePeriod, _ = calculateEffectiveGracePeriod(status, status.activeUpdate.Pod, nil) + } else { + status.gracePeriod = 1 + } + p.requeueLastPodUpdate(uid, status) + klog.V(4).InfoS("Pod is orphaned and still running, began terminating", "podUID", uid) + return false + default: + // The pod is already moving towards termination, notify the pod worker. Because the pod + // has been synced at least once, the value of status.activeUpdate will be the fallback for + // the next sync. 
+ p.requeueLastPodUpdate(uid, status) + klog.V(4).InfoS("Pod is orphaned and still terminating, notified the pod worker", "podUID", uid) + return false + } } if status.restartRequested { @@ -1277,6 +1589,7 @@ func (p *podWorkers) removeTerminatedWorker(uid types.UID) { if p.startedStaticPodsByFullname[status.fullname] == uid { delete(p.startedStaticPodsByFullname, status.fullname) } + return true } // killPodNow returns a KillPodFunc that can be used to kill a pod. @@ -1333,5 +1646,25 @@ func (p *podWorkers) cleanupPodUpdates(uid types.UID) { close(ch) } delete(p.podUpdates, uid) - delete(p.lastUndeliveredWorkUpdate, uid) +} + +// requeueLastPodUpdate creates a new pending pod update from the most recently +// executed update if no update is already queued, and then notifies the pod +// worker goroutine of the update. This method must be called while holding +// the pod lock. +func (p *podWorkers) requeueLastPodUpdate(podUID types.UID, status *podSyncStatus) { + // if there is already an update queued, we can use that instead, or if + // we have no previously executed update, we cannot replay it. + if status.pendingUpdate != nil || status.activeUpdate == nil { + return + } + copied := *status.activeUpdate + status.pendingUpdate = &copied + + // notify the pod worker + status.working = true + select { + case p.podUpdates[podUID] <- struct{}{}: + default: + } } diff --git a/pkg/kubelet/pod_workers_test.go b/pkg/kubelet/pod_workers_test.go index c06d77bb592..32c8b3ce38b 100644 --- a/pkg/kubelet/pod_workers_test.go +++ b/pkg/kubelet/pod_workers_test.go @@ -25,6 +25,8 @@ import ( "time" "github.com/google/go-cmp/cmp" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -35,6 +37,7 @@ import ( kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/kubelet/util/queue" "k8s.io/utils/clock" + clocktesting "k8s.io/utils/clock/testing" ) // fakePodWorkers runs sync pod function in serial, so we can have @@ -88,8 +91,8 @@ func (f *fakePodWorkers) UpdatePod(options UpdatePodOptions) { } } -func (f *fakePodWorkers) SyncKnownPods(desiredPods []*v1.Pod) map[types.UID]PodWorkerState { - return nil +func (f *fakePodWorkers) SyncKnownPods(desiredPods []*v1.Pod) map[types.UID]PodWorkerSync { + return map[types.UID]PodWorkerSync{} } func (f *fakePodWorkers) IsPodKnownTerminated(uid types.UID) bool { @@ -137,28 +140,14 @@ type TestingInterface interface { Errorf(format string, args ...interface{}) } -func newPod(uid, name string) *v1.Pod { - return &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - UID: types.UID(uid), - Name: name, - }, - } -} - func newPodWithPhase(uid, name string, phase v1.PodPhase) *v1.Pod { - return &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - UID: types.UID(uid), - Name: name, - }, - Status: v1.PodStatus{ - Phase: phase, - }, - } + pod := newNamedPod(uid, "ns", name, false) + pod.Status.Phase = phase + return pod } func newStaticPod(uid, name string) *v1.Pod { + thirty := int64(30) return &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ UID: types.UID(uid), @@ -167,16 +156,23 @@ func newStaticPod(uid, name string) *v1.Pod { kubetypes.ConfigSourceAnnotationKey: kubetypes.FileSource, }, }, + Spec: v1.PodSpec{ + TerminationGracePeriodSeconds: &thirty, + }, } } func newNamedPod(uid, namespace, name string, isStatic bool) *v1.Pod { + thirty := int64(30) pod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ UID: types.UID(uid), Namespace: namespace, Name: name, }, + Spec: v1.PodSpec{ 
+ TerminationGracePeriodSeconds: &thirty, + }, } if isStatic { pod.Annotations = map[string]string{ @@ -188,10 +184,11 @@ func newNamedPod(uid, namespace, name string, isStatic bool) *v1.Pod { // syncPodRecord is a record of a sync pod call type syncPodRecord struct { - name string - updateType kubetypes.SyncPodType - runningPod *kubecontainer.Pod - terminated bool + name string + updateType kubetypes.SyncPodType + runningPod *kubecontainer.Pod + terminated bool + gracePeriod *int64 } type FakeQueueItem struct { @@ -244,48 +241,204 @@ func (q *fakeQueue) GetWork() []types.UID { return work } -func createPodWorkers() (*podWorkers, map[types.UID][]syncPodRecord) { +type timeIncrementingWorkers struct { + lock sync.Mutex + w *podWorkers + runtime *containertest.FakeRuntime + holds map[types.UID]chan struct{} +} + +// UpdatePod increments the clock after UpdatePod is called, but before the workers +// are invoked, and then drains all workers before returning. The provided functions +// are invoked while holding the lock to prevent workers from receiving updates. +func (w *timeIncrementingWorkers) UpdatePod(options UpdatePodOptions, afterFns ...func()) { + func() { + w.lock.Lock() + defer w.lock.Unlock() + w.w.UpdatePod(options) + w.w.clock.(*clocktesting.FakePassiveClock).SetTime(w.w.clock.Now().Add(time.Second)) + for _, fn := range afterFns { + fn() + } + }() + w.drainUnpausedWorkers() +} + +// SyncKnownPods increments the clock after SyncKnownPods is called, but before the workers +// are invoked, and then drains all workers before returning. +func (w *timeIncrementingWorkers) SyncKnownPods(desiredPods []*v1.Pod) (knownPods map[types.UID]PodWorkerSync) { + func() { + w.lock.Lock() + defer w.lock.Unlock() + knownPods = w.w.SyncKnownPods(desiredPods) + w.w.clock.(*clocktesting.FakePassiveClock).SetTime(w.w.clock.Now().Add(time.Second)) + }() + w.drainUnpausedWorkers() + return +} + +func (w *timeIncrementingWorkers) PauseWorkers(uids ...types.UID) { + w.lock.Lock() + defer w.lock.Unlock() + if w.holds == nil { + w.holds = make(map[types.UID]chan struct{}) + } + for _, uid := range uids { + if _, ok := w.holds[uid]; !ok { + w.holds[uid] = make(chan struct{}) + } + } +} + +func (w *timeIncrementingWorkers) ReleaseWorkers(uids ...types.UID) { + w.lock.Lock() + defer w.lock.Unlock() + w.ReleaseWorkersUnderLock(uids...) 
+} + +func (w *timeIncrementingWorkers) ReleaseWorkersUnderLock(uids ...types.UID) { + for _, uid := range uids { + if ch, ok := w.holds[uid]; ok { + close(ch) + delete(w.holds, uid) + } + } +} + +func (w *timeIncrementingWorkers) waitForPod(uid types.UID) { + w.lock.Lock() + ch, ok := w.holds[uid] + w.lock.Unlock() + if !ok { + return + } + <-ch +} + +func (w *timeIncrementingWorkers) drainUnpausedWorkers() { + pausedWorkers := make(map[types.UID]struct{}) + for { + for uid := range pausedWorkers { + delete(pausedWorkers, uid) + } + stillWorking := false + + // ignore held workers + w.lock.Lock() + for uid := range w.holds { + pausedWorkers[uid] = struct{}{} + } + w.lock.Unlock() + + // check for at least one still working non-paused worker + w.w.podLock.Lock() + for uid, worker := range w.w.podSyncStatuses { + if _, ok := pausedWorkers[uid]; ok { + continue + } + if worker.working { + stillWorking = true + break + } + } + w.w.podLock.Unlock() + + if !stillWorking { + break + } + time.Sleep(time.Millisecond) + } +} + +func (w *timeIncrementingWorkers) tick() { + w.lock.Lock() + defer w.lock.Unlock() + w.w.clock.(*clocktesting.FakePassiveClock).SetTime(w.w.clock.Now().Add(time.Second)) +} + +// createTimeIncrementingPodWorkers will guarantee that each call to UpdatePod and each worker goroutine invocation advances the clock by one second, +// although multiple workers will advance the clock in an unpredictable order. Use to observe +// successive internal updates to each update pod state when only a single pod is being updated. +func createTimeIncrementingPodWorkers() (*timeIncrementingWorkers, map[types.UID][]syncPodRecord) { + nested, runtime, processed := createPodWorkers() + w := &timeIncrementingWorkers{ + w: nested, + runtime: runtime, + } + nested.workerChannelFn = func(uid types.UID, in chan struct{}) <-chan struct{} { + ch := make(chan struct{}) + go func() { + defer close(ch) + // TODO: this is an eager loop, we might want to lazily read from in only once + // ch is empty + for range in { + w.waitForPod(uid) + w.tick() + ch <- struct{}{} + } + }() + return ch + } + return w, processed +} + +func createPodWorkers() (*podWorkers, *containertest.FakeRuntime, map[types.UID][]syncPodRecord) { lock := sync.Mutex{} processed := make(map[types.UID][]syncPodRecord) fakeRecorder := &record.FakeRecorder{} fakeRuntime := &containertest.FakeRuntime{} fakeCache := containertest.NewFakeCache(fakeRuntime) fakeQueue := &fakeQueue{} + clock := clocktesting.NewFakePassiveClock(time.Unix(1, 0)) w := newPodWorkers( - func(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, error) { - func() { - lock.Lock() - defer lock.Unlock() - pod := pod - processed[pod.UID] = append(processed[pod.UID], syncPodRecord{ - name: pod.Name, - updateType: updateType, - }) - }() - return false, nil - }, - func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, runningPod *kubecontainer.Pod, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error { - func() { - lock.Lock() - defer lock.Unlock() - processed[pod.UID] = append(processed[pod.UID], syncPodRecord{ - name: pod.Name, - updateType: kubetypes.SyncPodKill, - runningPod: runningPod, - }) - }() - return nil - }, - func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error { - func() { - lock.Lock() - defer lock.Unlock() - processed[pod.UID] = append(processed[pod.UID], syncPodRecord{ - name: pod.Name, - terminated: true, - }) - }() - return nil + 
&podSyncerFuncs{ + syncPod: func(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, error) { + func() { + lock.Lock() + defer lock.Unlock() + pod := pod + processed[pod.UID] = append(processed[pod.UID], syncPodRecord{ + name: pod.Name, + updateType: updateType, + }) + }() + return false, nil + }, + syncTerminatingPod: func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error { + func() { + lock.Lock() + defer lock.Unlock() + processed[pod.UID] = append(processed[pod.UID], syncPodRecord{ + name: pod.Name, + updateType: kubetypes.SyncPodKill, + gracePeriod: gracePeriod, + }) + }() + return nil + }, + syncTerminatingRuntimePod: func(ctx context.Context, runningPod *kubecontainer.Pod) error { + func() { + lock.Lock() + defer lock.Unlock() + processed[runningPod.ID] = append(processed[runningPod.ID], syncPodRecord{ + name: runningPod.Name, + updateType: kubetypes.SyncPodKill, + runningPod: runningPod, + }) + }() + return nil + }, + syncTerminatedPod: func(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error { + func() { + lock.Lock() + defer lock.Unlock() + processed[pod.UID] = append(processed[pod.UID], syncPodRecord{ + name: pod.Name, + terminated: true, + }) + }() + return nil + }, }, fakeRecorder, fakeQueue, @@ -293,7 +446,9 @@ func createPodWorkers() (*podWorkers, map[types.UID][]syncPodRecord) { time.Millisecond, fakeCache, ) - return w.(*podWorkers), processed + workers := w.(*podWorkers) + workers.clock = clock + return workers, fakeRuntime, processed } func drainWorkers(podWorkers *podWorkers, numPods int) { @@ -357,14 +512,14 @@ func drainAllWorkers(podWorkers *podWorkers) { } } -func TestUpdatePod(t *testing.T) { - podWorkers, processed := createPodWorkers() +func TestUpdatePodParallel(t *testing.T) { + podWorkers, _, processed := createPodWorkers() numPods := 20 for i := 0; i < numPods; i++ { for j := i; j < numPods; j++ { podWorkers.UpdatePod(UpdatePodOptions{ - Pod: newPod(strconv.Itoa(j), strconv.Itoa(i)), + Pod: newNamedPod(strconv.Itoa(j), "ns", strconv.Itoa(i), false), UpdateType: kubetypes.SyncPodCreate, }) } @@ -372,61 +527,337 @@ func TestUpdatePod(t *testing.T) { drainWorkers(podWorkers, numPods) if len(processed) != numPods { - t.Errorf("Not all pods processed: %v", len(processed)) - return + t.Fatalf("Not all pods processed: %v", len(processed)) } for i := 0; i < numPods; i++ { uid := types.UID(strconv.Itoa(i)) - if len(processed[uid]) < 1 || len(processed[uid]) > i+1 { - t.Errorf("Pod %v processed %v times", i, len(processed[uid])) + events := processed[uid] + if len(events) < 1 || len(events) > i+1 { + t.Errorf("Pod %v processed %v times", i, len(events)) continue } - // PodWorker guarantees the first and the last event will be processed - first := 0 - last := len(processed[uid]) - 1 - if processed[uid][first].name != "0" { - t.Errorf("Pod %v: incorrect order %v, %v", i, first, processed[uid][first]) - - } - if processed[uid][last].name != strconv.Itoa(i) { - t.Errorf("Pod %v: incorrect order %v, %v", i, last, processed[uid][last]) + // PodWorker guarantees the last event will be processed + last := len(events) - 1 + if events[last].name != strconv.Itoa(i) { + t.Errorf("Pod %v: incorrect order %v, %#v", i, last, events) } } } -func TestUpdatePodWithTerminatedPod(t *testing.T) { - podWorkers, _ := createPodWorkers() - terminatedPod := newPodWithPhase("0000-0000-0000", "done-pod", v1.PodSucceeded) - 
orphanedPod := &kubecontainer.Pod{ID: "0000-0000-0001", Name: "orphaned-pod"} - pod := newPod("0000-0000-0002", "running-pod") +func TestUpdatePod(t *testing.T) { + one := int64(1) + hasContext := func(status *podSyncStatus) *podSyncStatus { + status.ctx, status.cancelFn = context.Background(), func() {} + return status + } + withLabel := func(pod *v1.Pod, label, value string) *v1.Pod { + if pod.Labels == nil { + pod.Labels = make(map[string]string) + } + pod.Labels[label] = value + return pod + } + withDeletionTimestamp := func(pod *v1.Pod, ts time.Time, gracePeriod *int64) *v1.Pod { + pod.DeletionTimestamp = &metav1.Time{Time: ts} + pod.DeletionGracePeriodSeconds = gracePeriod + return pod + } + intp := func(i int64) *int64 { + return &i + } + expectPodSyncStatus := func(t *testing.T, expected, status *podSyncStatus) { + t.Helper() + // handle special non-comparable fields + if status != nil { + if e, a := expected.ctx != nil, status.ctx != nil; e != a { + t.Errorf("expected context %t, has context %t", e, a) + } else { + expected.ctx, status.ctx = nil, nil + } + if e, a := expected.cancelFn != nil, status.cancelFn != nil; e != a { + t.Errorf("expected cancelFn %t, has cancelFn %t", e, a) + } else { + expected.cancelFn, status.cancelFn = nil, nil + } + } + if e, a := expected, status; !reflect.DeepEqual(e, a) { + t.Fatalf("unexpected status: %s", cmp.Diff(e, a, cmp.AllowUnexported(podSyncStatus{}))) + } + } + for _, tc := range []struct { + name string + update UpdatePodOptions + runtimeStatus *kubecontainer.PodStatus + prepare func(t *testing.T, w *timeIncrementingWorkers) (afterUpdateFn func()) - podWorkers.UpdatePod(UpdatePodOptions{ - Pod: terminatedPod, - UpdateType: kubetypes.SyncPodCreate, - }) - podWorkers.UpdatePod(UpdatePodOptions{ - Pod: pod, - UpdateType: kubetypes.SyncPodCreate, - }) - podWorkers.UpdatePod(UpdatePodOptions{ - UpdateType: kubetypes.SyncPodKill, - RunningPod: orphanedPod, - }) - drainAllWorkers(podWorkers) - if podWorkers.IsPodKnownTerminated(pod.UID) == true { - t.Errorf("podWorker state should not be terminated") - } - if podWorkers.IsPodKnownTerminated(terminatedPod.UID) == false { - t.Errorf("podWorker state should be terminated") - } - if podWorkers.IsPodKnownTerminated(orphanedPod.ID) == false { - t.Errorf("podWorker state should be terminated for orphaned pod") + expect *podSyncStatus + expectBeforeWorker *podSyncStatus + expectKnownTerminated bool + }{ + { + name: "a new pod is recorded and started", + update: UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + Pod: newNamedPod("1", "ns", "running-pod", false), + }, + expect: hasContext(&podSyncStatus{ + fullname: "running-pod_ns", + syncedAt: time.Unix(1, 0), + startedAt: time.Unix(3, 0), + activeUpdate: &UpdatePodOptions{ + Pod: newNamedPod("1", "ns", "running-pod", false), + }, + }), + }, + { + name: "a new pod is recorded and started unless it is a duplicate of an existing terminating pod UID", + update: UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + Pod: withLabel(newNamedPod("1", "ns", "running-pod", false), "updated", "value"), + }, + prepare: func(t *testing.T, w *timeIncrementingWorkers) func() { + w.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + Pod: newNamedPod("1", "ns", "running-pod", false), + }) + w.PauseWorkers("1") + w.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodKill, + Pod: newNamedPod("1", "ns", "running-pod", false), + }) + return func() { w.ReleaseWorkersUnderLock("1") } + }, + expect: hasContext(&podSyncStatus{ + fullname: 
"running-pod_ns", + syncedAt: time.Unix(1, 0), + startedAt: time.Unix(3, 0), + terminatingAt: time.Unix(3, 0), + terminatedAt: time.Unix(6, 0), + gracePeriod: 30, + startedTerminating: true, + restartRequested: true, // because we received a create during termination + finished: true, + activeUpdate: &UpdatePodOptions{ + Pod: newNamedPod("1", "ns", "running-pod", false), + KillPodOptions: &KillPodOptions{PodTerminationGracePeriodSecondsOverride: intp(30)}, + }, + }), + expectKnownTerminated: true, + }, + { + name: "a new pod is recorded and started and running pod is ignored", + update: UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + Pod: newNamedPod("1", "ns", "running-pod", false), + RunningPod: &kubecontainer.Pod{ID: "1", Name: "orphaned-pod", Namespace: "ns"}, + }, + expect: hasContext(&podSyncStatus{ + fullname: "running-pod_ns", + syncedAt: time.Unix(1, 0), + startedAt: time.Unix(3, 0), + activeUpdate: &UpdatePodOptions{ + Pod: newNamedPod("1", "ns", "running-pod", false), + }, + }), + }, + { + name: "a running pod is terminated when an update contains a deletionTimestamp", + update: UpdatePodOptions{ + UpdateType: kubetypes.SyncPodUpdate, + Pod: withDeletionTimestamp(newNamedPod("1", "ns", "running-pod", false), time.Unix(1, 0), intp(15)), + }, + prepare: func(t *testing.T, w *timeIncrementingWorkers) func() { + w.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + Pod: newNamedPod("1", "ns", "running-pod", false), + }) + return nil + }, + expect: hasContext(&podSyncStatus{ + fullname: "running-pod_ns", + syncedAt: time.Unix(1, 0), + startedAt: time.Unix(3, 0), + terminatingAt: time.Unix(3, 0), + terminatedAt: time.Unix(5, 0), + gracePeriod: 15, + startedTerminating: true, + finished: true, + deleted: true, + activeUpdate: &UpdatePodOptions{ + Pod: withDeletionTimestamp(newNamedPod("1", "ns", "running-pod", false), time.Unix(1, 0), intp(15)), + KillPodOptions: &KillPodOptions{PodTerminationGracePeriodSecondsOverride: intp(15)}, + }, + }), + expectKnownTerminated: true, + }, + { + name: "a running pod is terminated when an eviction is requested", + update: UpdatePodOptions{ + UpdateType: kubetypes.SyncPodKill, + Pod: newNamedPod("1", "ns", "running-pod", false), + KillPodOptions: &KillPodOptions{Evict: true}, + }, + prepare: func(t *testing.T, w *timeIncrementingWorkers) func() { + w.UpdatePod(UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + Pod: newNamedPod("1", "ns", "running-pod", false), + }) + return nil + }, + expect: hasContext(&podSyncStatus{ + fullname: "running-pod_ns", + syncedAt: time.Unix(1, 0), + startedAt: time.Unix(3, 0), + terminatingAt: time.Unix(3, 0), + terminatedAt: time.Unix(5, 0), + gracePeriod: 30, + startedTerminating: true, + finished: true, + evicted: true, + activeUpdate: &UpdatePodOptions{ + Pod: newNamedPod("1", "ns", "running-pod", false), + KillPodOptions: &KillPodOptions{ + PodTerminationGracePeriodSecondsOverride: intp(30), + Evict: true, + }, + }, + }), + expectKnownTerminated: true, + }, + { + name: "a pod that is terminal and has never started must be terminated if the runtime does not have a cached terminal state", + update: UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + Pod: newPodWithPhase("1", "done-pod", v1.PodSucceeded), + }, + expect: hasContext(&podSyncStatus{ + fullname: "done-pod_ns", + syncedAt: time.Unix(1, 0), + terminatingAt: time.Unix(1, 0), + startedAt: time.Unix(3, 0), + terminatedAt: time.Unix(3, 0), + activeUpdate: &UpdatePodOptions{ + Pod: newPodWithPhase("1", "done-pod", 
v1.PodSucceeded), + KillPodOptions: &KillPodOptions{PodTerminationGracePeriodSecondsOverride: intp(30)}, + }, + gracePeriod: 30, + startedTerminating: true, + finished: true, + }), + expectKnownTerminated: true, + }, + { + name: "a pod that is terminal and has never started is finished immediately if the runtime has a cached terminal state", + update: UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + Pod: newPodWithPhase("1", "done-pod", v1.PodSucceeded), + }, + runtimeStatus: &kubecontainer.PodStatus{ /* we know about this pod */ }, + expect: &podSyncStatus{ + fullname: "done-pod_ns", + syncedAt: time.Unix(1, 0), + terminatingAt: time.Unix(1, 0), + terminatedAt: time.Unix(1, 0), + startedTerminating: true, + finished: true, + + // if we have never seen the pod before, a restart makes no sense + restartRequested: false, + }, + expectKnownTerminated: true, + }, + { + name: "an orphaned running pod we have not seen is marked terminating and advances to finished and then is removed", + update: UpdatePodOptions{ + UpdateType: kubetypes.SyncPodKill, + RunningPod: &kubecontainer.Pod{ID: "1", Name: "orphaned-pod", Namespace: "ns"}, + }, + expectBeforeWorker: &podSyncStatus{ + fullname: "orphaned-pod_ns", + syncedAt: time.Unix(1, 0), + terminatingAt: time.Unix(1, 0), + pendingUpdate: &UpdatePodOptions{ + UpdateType: kubetypes.SyncPodKill, + RunningPod: &kubecontainer.Pod{ID: "1", Name: "orphaned-pod", Namespace: "ns"}, + KillPodOptions: &KillPodOptions{PodTerminationGracePeriodSecondsOverride: &one}, + }, + gracePeriod: 1, + deleted: true, + observedRuntime: true, + working: true, + }, + // Once a running pod is fully terminated, we stop tracking it in history, and so it + // is deliberately expected not to be known outside the pod worker since the source of + // the pod is also not in the desired pod set. + expectKnownTerminated: false, + }, + { + name: "an orphaned running pod with a non-kill update type does nothing", + update: UpdatePodOptions{ + UpdateType: kubetypes.SyncPodCreate, + RunningPod: &kubecontainer.Pod{ID: "1", Name: "orphaned-pod", Namespace: "ns"}, + }, + expect: nil, + }, + } { + t.Run(tc.name, func(t *testing.T) { + var uid types.UID + switch { + case tc.update.Pod != nil: + uid = tc.update.Pod.UID + case tc.update.RunningPod != nil: + uid = tc.update.RunningPod.ID + default: + t.Fatalf("unable to find uid for update") + } + + var fns []func() + + podWorkers, _ := createTimeIncrementingPodWorkers() + + if tc.expectBeforeWorker != nil { + fns = append(fns, func() { + expectPodSyncStatus(t, tc.expectBeforeWorker, podWorkers.w.podSyncStatuses[uid]) + }) + } + + if tc.prepare != nil { + if fn := tc.prepare(t, podWorkers); fn != nil { + fns = append(fns, fn) + } + } + + // set up an initial pod status for the UpdatePod invocation which is + // reset before workers call the podCache + if tc.runtimeStatus != nil { + podWorkers.runtime.PodStatus = *tc.runtimeStatus + podWorkers.runtime.Err = nil + } else { + podWorkers.runtime.PodStatus = kubecontainer.PodStatus{} + podWorkers.runtime.Err = status.Error(codes.NotFound, "No such pod") + } + fns = append(fns, func() { + podWorkers.runtime.PodStatus = kubecontainer.PodStatus{} + podWorkers.runtime.Err = nil + }) + + podWorkers.UpdatePod(tc.update, fns...) 
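+			// next, verify the worker's terminal-state bookkeeping and the recorded
+			// podSyncStatus for this UID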
+ + if podWorkers.w.IsPodKnownTerminated(uid) != tc.expectKnownTerminated { + t.Errorf("podWorker.IsPodKnownTerminated expected to be %t", tc.expectKnownTerminated) + } + + expectPodSyncStatus(t, tc.expect, podWorkers.w.podSyncStatuses[uid]) + + // TODO: validate processed records for the pod based on the test case, which reduces + // the amount of testing we need to do in kubelet_pods_test.go + }) } } func TestUpdatePodForRuntimePod(t *testing.T) { - podWorkers, processed := createPodWorkers() + podWorkers, _, processed := createPodWorkers() // ignores running pod of wrong sync type podWorkers.UpdatePod(UpdatePodOptions{ @@ -457,7 +888,7 @@ func TestUpdatePodForRuntimePod(t *testing.T) { } func TestUpdatePodForTerminatedRuntimePod(t *testing.T) { - podWorkers, processed := createPodWorkers() + podWorkers, _, processed := createPodWorkers() now := time.Now() podWorkers.podSyncStatuses[types.UID("1")] = &podSyncStatus{ @@ -480,16 +911,13 @@ func TestUpdatePodForTerminatedRuntimePod(t *testing.T) { if len(updates) != 0 { t.Fatalf("unexpected updates: %v", updates) } - if len(podWorkers.lastUndeliveredWorkUpdate) != 0 { - t.Fatalf("Unexpected undelivered work") - } } func TestUpdatePodDoesNotForgetSyncPodKill(t *testing.T) { - podWorkers, processed := createPodWorkers() + podWorkers, _, processed := createPodWorkers() numPods := 20 for i := 0; i < numPods; i++ { - pod := newPod(strconv.Itoa(i), strconv.Itoa(i)) + pod := newNamedPod(strconv.Itoa(i), "ns", strconv.Itoa(i), false) podWorkers.UpdatePod(UpdatePodOptions{ Pod: pod, UpdateType: kubetypes.SyncPodCreate, @@ -510,17 +938,22 @@ func TestUpdatePodDoesNotForgetSyncPodKill(t *testing.T) { } for i := 0; i < numPods; i++ { uid := types.UID(strconv.Itoa(i)) - // each pod should be processed two times (create, kill, but not update) + // each pod should be processed two or three times (kill,terminate or create,kill,terminate) because + // we buffer pending updates and the pod worker may compress the create and kill syncPodRecords := processed[uid] - if len(syncPodRecords) < 2 { - t.Errorf("Pod %v processed %v times, but expected at least 2", i, len(syncPodRecords)) - continue + var match bool + grace := int64(30) + for _, possible := range [][]syncPodRecord{ + {{name: string(uid), updateType: kubetypes.SyncPodKill, gracePeriod: &grace}, {name: string(uid), terminated: true}}, + {{name: string(uid), updateType: kubetypes.SyncPodCreate}, {name: string(uid), updateType: kubetypes.SyncPodKill, gracePeriod: &grace}, {name: string(uid), terminated: true}}, + } { + if reflect.DeepEqual(possible, syncPodRecords) { + match = true + break + } } - if syncPodRecords[0].updateType != kubetypes.SyncPodCreate { - t.Errorf("Pod %v event was %v, but expected %v", i, syncPodRecords[0].updateType, kubetypes.SyncPodCreate) - } - if syncPodRecords[1].updateType != kubetypes.SyncPodKill { - t.Errorf("Pod %v event was %v, but expected %v", i, syncPodRecords[1].updateType, kubetypes.SyncPodKill) + if !match { + t.Fatalf("unexpected history for pod %v: %#v", i, syncPodRecords) } } } @@ -566,11 +999,11 @@ func newTerminalPhaseSync(fn syncPodFnType) *terminalPhaseSync { } func TestTerminalPhaseTransition(t *testing.T) { - podWorkers, _ := createPodWorkers() + podWorkers, _, _ := createPodWorkers() var channels WorkChannel podWorkers.workerChannelFn = channels.Intercept - terminalPhaseSyncer := newTerminalPhaseSync(podWorkers.syncPodFn) - podWorkers.syncPodFn = terminalPhaseSyncer.SyncPod + terminalPhaseSyncer := 
newTerminalPhaseSync(podWorkers.podSyncer.(*podSyncerFuncs).syncPod) + podWorkers.podSyncer.(*podSyncerFuncs).syncPod = terminalPhaseSyncer.SyncPod // start pod podWorkers.UpdatePod(UpdatePodOptions{ @@ -618,7 +1051,7 @@ func TestStaticPodExclusion(t *testing.T) { t.Skip("skipping test in short mode.") } - podWorkers, processed := createPodWorkers() + podWorkers, _, processed := createPodWorkers() var channels WorkChannel podWorkers.workerChannelFn = channels.Intercept @@ -812,11 +1245,11 @@ func TestStaticPodExclusion(t *testing.T) { drainAllWorkers(podWorkers) // 2-static and 3-static should both be listed as terminated - if e, a := map[types.UID]PodWorkerState{ - "1-normal": SyncPod, - "2-static": TerminatedPod, - "3-static": TerminatedPod, - "4-static": SyncPod, + if e, a := map[types.UID]PodWorkerSync{ + "1-normal": {State: SyncPod, HasConfig: true}, + "2-static": {State: TerminatedPod, HasConfig: true, Static: true}, + "3-static": {State: TerminatedPod}, + "4-static": {State: SyncPod, HasConfig: true, Static: true}, }, state; !reflect.DeepEqual(e, a) { t.Fatalf("unexpected actual state: %s", cmp.Diff(e, a)) } @@ -834,11 +1267,10 @@ func TestStaticPodExclusion(t *testing.T) { drainAllWorkers(podWorkers) // expect sync to put 3-static into final state and remove the status - if e, a := map[types.UID]PodWorkerState{ - "1-normal": SyncPod, - "2-static": TerminatedPod, - "3-static": TerminatedPod, - "4-static": SyncPod, + if e, a := map[types.UID]PodWorkerSync{ + "1-normal": {State: SyncPod, HasConfig: true}, + "2-static": {State: TerminatedPod, HasConfig: true, Static: true}, + "4-static": {State: SyncPod, HasConfig: true, Static: true}, }, state; !reflect.DeepEqual(e, a) { t.Fatalf("unexpected actual state: %s", cmp.Diff(e, a)) } @@ -936,12 +1368,21 @@ func TestStaticPodExclusion(t *testing.T) { UpdateType: kubetypes.SyncPodKill, }) drainAllWorkers(podWorkers) + podWorkers.UpdatePod(UpdatePodOptions{ + Pod: newNamedPod("6-static", "test1", "pod1", true), + UpdateType: kubetypes.SyncPodCreate, + }) + drainAllWorkers(podWorkers) podWorkers.UpdatePod(UpdatePodOptions{ Pod: newNamedPod("8-static", "test1", "pod1", true), UpdateType: kubetypes.SyncPodUpdate, }) drainAllWorkers(podWorkers) + // 6 should have been detected as restartable + if status := podWorkers.podSyncStatuses["6-static"]; !status.restartRequested { + t.Fatalf("unexpected restarted static pod: %#v", status) + } // 7 and 8 should both be waiting still with no syncs if e, a := map[string]types.UID{}, podWorkers.startedStaticPodsByFullname; !reflect.DeepEqual(e, a) { t.Fatalf("unexpected started static pods: %s", cmp.Diff(e, a)) @@ -985,23 +1426,35 @@ func TestStaticPodExclusion(t *testing.T) { processed[types.UID("8-static")]; !reflect.DeepEqual(expected, actual) { t.Fatalf("unexpected sync pod calls: %s", cmp.Diff(expected, actual)) } + + // initiate a sync with all but 8-static pods undesired + state = podWorkers.SyncKnownPods([]*v1.Pod{ + newNamedPod("8-static", "test1", "pod1", true), + }) + drainAllWorkers(podWorkers) + if e, a := map[types.UID]PodWorkerSync{ + "1-normal": {State: TerminatingPod, Orphan: true, HasConfig: true}, + "8-static": {State: SyncPod, HasConfig: true, Static: true}, + }, state; !reflect.DeepEqual(e, a) { + t.Fatalf("unexpected actual restartable: %s", cmp.Diff(e, a)) + } } type WorkChannelItem struct { - out chan podWork + out chan struct{} lock sync.Mutex pause bool - queue []podWork + queue int } -func (item *WorkChannelItem) Handle(work podWork) { +func (item *WorkChannelItem) Handle() { 
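+	// while paused, only count that a notification arrived; otherwise signal the reader immediately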
item.lock.Lock() defer item.lock.Unlock() if item.pause { - item.queue = append(item.queue, work) + item.queue++ return } - item.out <- work + item.out <- struct{}{} } func (item *WorkChannelItem) Hold() { @@ -1024,10 +1477,10 @@ func (item *WorkChannelItem) Release() { item.lock.Lock() defer item.lock.Unlock() item.pause = false - for _, work := range item.queue { - item.out <- work + for i := 0; i < item.queue; i++ { + item.out <- struct{}{} } - item.queue = nil + item.queue = 0 } // WorkChannel intercepts podWork channels between the pod worker and its child @@ -1047,14 +1500,14 @@ func (w *WorkChannel) Channel(uid types.UID) *WorkChannelItem { channel, ok := w.channels[uid] if !ok { channel = &WorkChannelItem{ - out: make(chan podWork, 1), + out: make(chan struct{}, 1), } w.channels[uid] = channel } return channel } -func (w *WorkChannel) Intercept(uid types.UID, ch chan podWork) (outCh <-chan podWork) { +func (w *WorkChannel) Intercept(uid types.UID, ch chan struct{}) (outCh <-chan struct{}) { channel := w.Channel(uid) w.lock.Lock() @@ -1066,20 +1519,20 @@ func (w *WorkChannel) Intercept(uid types.UID, ch chan podWork) (outCh <-chan po defer w.lock.Unlock() delete(w.channels, uid) }() - for w := range ch { - channel.Handle(w) + for range ch { + channel.Handle() } }() return channel.out } func TestSyncKnownPods(t *testing.T) { - podWorkers, _ := createPodWorkers() + podWorkers, _, _ := createPodWorkers() numPods := 20 for i := 0; i < numPods; i++ { podWorkers.UpdatePod(UpdatePodOptions{ - Pod: newPod(strconv.Itoa(i), "name"), + Pod: newNamedPod(strconv.Itoa(i), "ns", "name", false), UpdateType: kubetypes.SyncPodUpdate, }) } @@ -1092,11 +1545,11 @@ func TestSyncKnownPods(t *testing.T) { desiredPods := map[types.UID]sets.Empty{} desiredPods[types.UID("2")] = sets.Empty{} desiredPods[types.UID("14")] = sets.Empty{} - desiredPodList := []*v1.Pod{newPod("2", "name"), newPod("14", "name")} + desiredPodList := []*v1.Pod{newNamedPod("2", "ns", "name", false), newNamedPod("14", "ns", "name", false)} // kill all but the requested pods for i := 0; i < numPods; i++ { - pod := newPod(strconv.Itoa(i), "name") + pod := newNamedPod(strconv.Itoa(i), "ns", "name", false) if _, ok := desiredPods[pod.UID]; ok { continue } @@ -1190,12 +1643,9 @@ func TestSyncKnownPods(t *testing.T) { if len(podWorkers.podSyncStatuses) != 2 { t.Errorf("Incorrect number of tracked statuses: %#v", podWorkers.podSyncStatuses) } - if len(podWorkers.lastUndeliveredWorkUpdate) != 0 { - t.Errorf("Incorrect number of tracked statuses: %#v", podWorkers.lastUndeliveredWorkUpdate) - } for uid := range desiredPods { - pod := newPod(string(uid), "name") + pod := newNamedPod(string(uid), "ns", "name", false) podWorkers.UpdatePod(UpdatePodOptions{ Pod: pod, UpdateType: kubetypes.SyncPodKill, @@ -1211,9 +1661,6 @@ func TestSyncKnownPods(t *testing.T) { if len(podWorkers.podSyncStatuses) != 0 { t.Errorf("Incorrect number of tracked statuses: %#v", podWorkers.podSyncStatuses) } - if len(podWorkers.lastUndeliveredWorkUpdate) != 0 { - t.Errorf("Incorrect number of tracked statuses: %#v", podWorkers.lastUndeliveredWorkUpdate) - } } func Test_removeTerminatedWorker(t *testing.T) { @@ -1221,10 +1668,13 @@ func Test_removeTerminatedWorker(t *testing.T) { testCases := []struct { desc string + orphan bool podSyncStatus *podSyncStatus startedStaticPodsByFullname map[string]types.UID waitingToStartStaticPodsByFullname map[string][]types.UID removed bool + expectGracePeriod int64 + expectPending *UpdatePodOptions }{ { desc: "finished worker", @@ 
-1259,22 +1709,163 @@ func Test_removeTerminatedWorker(t *testing.T) { }, removed: false, }, + { + desc: "orphaned not started worker", + podSyncStatus: &podSyncStatus{ + finished: false, + fullname: "fake-fullname", + }, + orphan: true, + removed: true, + }, + { + desc: "orphaned started worker", + podSyncStatus: &podSyncStatus{ + startedAt: time.Unix(1, 0), + finished: false, + fullname: "fake-fullname", + }, + orphan: true, + removed: false, + }, + { + desc: "orphaned terminating worker with no activeUpdate", + podSyncStatus: &podSyncStatus{ + startedAt: time.Unix(1, 0), + terminatingAt: time.Unix(2, 0), + finished: false, + fullname: "fake-fullname", + }, + orphan: true, + removed: false, + }, + { + desc: "orphaned terminating worker", + podSyncStatus: &podSyncStatus{ + startedAt: time.Unix(1, 0), + terminatingAt: time.Unix(2, 0), + finished: false, + fullname: "fake-fullname", + activeUpdate: &UpdatePodOptions{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: podUID, Name: "1"}}, + }, + }, + orphan: true, + removed: false, + expectPending: &UpdatePodOptions{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: podUID, Name: "1"}}, + }, + }, + { + desc: "orphaned terminating worker with pendingUpdate", + podSyncStatus: &podSyncStatus{ + startedAt: time.Unix(1, 0), + terminatingAt: time.Unix(2, 0), + finished: false, + fullname: "fake-fullname", + working: true, + pendingUpdate: &UpdatePodOptions{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: podUID, Name: "2"}}, + }, + activeUpdate: &UpdatePodOptions{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: podUID, Name: "1"}}, + }, + }, + orphan: true, + removed: false, + expectPending: &UpdatePodOptions{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: podUID, Name: "2"}}, + }, + }, + { + desc: "orphaned terminated worker with no activeUpdate", + podSyncStatus: &podSyncStatus{ + startedAt: time.Unix(1, 0), + terminatingAt: time.Unix(2, 0), + terminatedAt: time.Unix(3, 0), + finished: false, + fullname: "fake-fullname", + }, + orphan: true, + removed: false, + }, + { + desc: "orphaned terminated worker", + podSyncStatus: &podSyncStatus{ + startedAt: time.Unix(1, 0), + terminatingAt: time.Unix(2, 0), + terminatedAt: time.Unix(3, 0), + finished: false, + fullname: "fake-fullname", + activeUpdate: &UpdatePodOptions{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: podUID, Name: "1"}}, + }, + }, + orphan: true, + removed: false, + expectPending: &UpdatePodOptions{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: podUID, Name: "1"}}, + }, + }, + { + desc: "orphaned terminated worker with pendingUpdate", + podSyncStatus: &podSyncStatus{ + startedAt: time.Unix(1, 0), + terminatingAt: time.Unix(2, 0), + terminatedAt: time.Unix(3, 0), + finished: false, + working: true, + fullname: "fake-fullname", + pendingUpdate: &UpdatePodOptions{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: podUID, Name: "2"}}, + }, + activeUpdate: &UpdatePodOptions{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: podUID, Name: "1"}}, + }, + }, + orphan: true, + removed: false, + expectPending: &UpdatePodOptions{ + Pod: &v1.Pod{ObjectMeta: metav1.ObjectMeta{UID: podUID, Name: "2"}}, + }, + }, } for _, tc := range testCases { t.Run(tc.desc, func(t *testing.T) { - podWorkers, _ := createPodWorkers() + podWorkers, _, _ := createPodWorkers() podWorkers.podSyncStatuses[podUID] = tc.podSyncStatus + podWorkers.podUpdates[podUID] = make(chan struct{}, 1) + if tc.podSyncStatus.working { + podWorkers.podUpdates[podUID] <- struct{}{} + } podWorkers.startedStaticPodsByFullname = 
tc.startedStaticPodsByFullname podWorkers.waitingToStartStaticPodsByFullname = tc.waitingToStartStaticPodsByFullname - podWorkers.removeTerminatedWorker(podUID) - _, exists := podWorkers.podSyncStatuses[podUID] + podWorkers.removeTerminatedWorker(podUID, podWorkers.podSyncStatuses[podUID], tc.orphan) + status, exists := podWorkers.podSyncStatuses[podUID] if tc.removed && exists { - t.Errorf("Expected pod worker to be removed") + t.Fatalf("Expected pod worker to be removed") } if !tc.removed && !exists { - t.Errorf("Expected pod worker to not be removed") + t.Fatalf("Expected pod worker to not be removed") + } + if tc.removed { + return + } + if tc.expectGracePeriod > 0 && status.gracePeriod != tc.expectGracePeriod { + t.Errorf("Unexpected grace period %d", status.gracePeriod) + } + if !reflect.DeepEqual(tc.expectPending, status.pendingUpdate) { + t.Errorf("Unexpected pending: %s", cmp.Diff(tc.expectPending, status.pendingUpdate)) + } + if tc.expectPending != nil { + if !status.working { + t.Errorf("Should be working") + } + if len(podWorkers.podUpdates[podUID]) != 1 { + t.Errorf("Should have one entry in podUpdates") + } } }) } @@ -1287,22 +1878,26 @@ type simpleFakeKubelet struct { wg sync.WaitGroup } -func (kl *simpleFakeKubelet) syncPod(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, error) { +func (kl *simpleFakeKubelet) SyncPod(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, error) { kl.pod, kl.mirrorPod, kl.podStatus = pod, mirrorPod, podStatus return false, nil } -func (kl *simpleFakeKubelet) syncPodWithWaitGroup(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, error) { +func (kl *simpleFakeKubelet) SyncPodWithWaitGroup(ctx context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (bool, error) { kl.pod, kl.mirrorPod, kl.podStatus = pod, mirrorPod, podStatus kl.wg.Done() return false, nil } -func (kl *simpleFakeKubelet) syncTerminatingPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, runningPod *kubecontainer.Pod, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error { +func (kl *simpleFakeKubelet) SyncTerminatingPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error { return nil } -func (kl *simpleFakeKubelet) syncTerminatedPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error { +func (kl *simpleFakeKubelet) SyncTerminatingRuntimePod(ctx context.Context, runningPod *kubecontainer.Pod) error { + return nil +} + +func (kl *simpleFakeKubelet) SyncTerminatedPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus) error { return nil } @@ -1315,14 +1910,14 @@ func TestFakePodWorkers(t *testing.T) { kubeletForRealWorkers := &simpleFakeKubelet{} kubeletForFakeWorkers := &simpleFakeKubelet{} + realPodSyncer := newPodSyncerFuncs(kubeletForRealWorkers) + realPodSyncer.syncPod = kubeletForRealWorkers.SyncPodWithWaitGroup realPodWorkers := newPodWorkers( - kubeletForRealWorkers.syncPodWithWaitGroup, - kubeletForRealWorkers.syncTerminatingPod, - kubeletForRealWorkers.syncTerminatedPod, + realPodSyncer, fakeRecorder, queue.NewBasicWorkQueue(&clock.RealClock{}), time.Second, time.Second, fakeCache) fakePodWorkers := &fakePodWorkers{ - syncPodFn: kubeletForFakeWorkers.syncPod, + 
syncPodFn: kubeletForFakeWorkers.SyncPod, cache: fakeCache, t: t, } @@ -1377,9 +1972,9 @@ func TestFakePodWorkers(t *testing.T) { // TestKillPodNowFunc tests the blocking kill pod function works with pod workers as expected. func TestKillPodNowFunc(t *testing.T) { fakeRecorder := &record.FakeRecorder{} - podWorkers, processed := createPodWorkers() + podWorkers, _, processed := createPodWorkers() killPodFunc := killPodNow(podWorkers, fakeRecorder) - pod := newPod("test", "test") + pod := newNamedPod("test", "ns", "test", false) gracePeriodOverride := int64(0) err := killPodFunc(pod, false, &gracePeriodOverride, func(status *v1.PodStatus) { status.Phase = v1.PodFailed @@ -1422,7 +2017,7 @@ func Test_allowPodStart(t *testing.T) { // TODO: Do we want to allow non-static pods with the same full name? // Note that it may disable the force deletion of pods. desc: "non-static pod", - pod: newPod("uid-0", "test"), + pod: newNamedPod("uid-0", "ns", "test", false), podSyncStatuses: map[types.UID]*podSyncStatus{ "uid-0": { fullname: "test_", @@ -1438,7 +2033,7 @@ func Test_allowPodStart(t *testing.T) { // TODO: Do we want to allow a non-static pod with the same full name // as the started static pod? desc: "non-static pod when there is a started static pod with the same full name", - pod: newPod("uid-0", "test"), + pod: newNamedPod("uid-0", "ns", "test", false), podSyncStatuses: map[types.UID]*podSyncStatus{ "uid-0": { fullname: "test_", @@ -1460,7 +2055,7 @@ func Test_allowPodStart(t *testing.T) { // TODO: Do we want to allow a static pod with the same full name as the // started non-static pod? desc: "static pod when there is a started non-static pod with the same full name", - pod: newPod("uid-0", "test"), + pod: newNamedPod("uid-0", "ns", "test", false), podSyncStatuses: map[types.UID]*podSyncStatus{ "uid-0": { fullname: "test_", @@ -1709,7 +2304,7 @@ func Test_allowPodStart(t *testing.T) { for _, tc := range testCases { t.Run(tc.desc, func(t *testing.T) { - podWorkers, _ := createPodWorkers() + podWorkers, _, _ := createPodWorkers() if tc.podSyncStatuses != nil { podWorkers.podSyncStatuses = tc.podSyncStatuses } diff --git a/pkg/kubelet/runonce.go b/pkg/kubelet/runonce.go index 3c8d14c9e5a..8b63d368d07 100644 --- a/pkg/kubelet/runonce.go +++ b/pkg/kubelet/runonce.go @@ -133,7 +133,7 @@ func (kl *Kubelet) runPod(ctx context.Context, pod *v1.Pod, retryDelay time.Dura klog.ErrorS(err, "Failed creating a mirror pod", "pod", klog.KObj(pod)) } mirrorPod, _ := kl.podManager.GetMirrorPodByPod(pod) - if isTerminal, err = kl.syncPod(ctx, kubetypes.SyncPodUpdate, pod, mirrorPod, status); err != nil { + if isTerminal, err = kl.SyncPod(ctx, kubetypes.SyncPodUpdate, pod, mirrorPod, status); err != nil { return fmt.Errorf("error syncing pod %q: %v", format.Pod(pod), err) } if retry >= runOnceMaxRetries { diff --git a/test/e2e/framework/pod/output/output.go b/test/e2e/framework/pod/output/output.go index 1fa8122a3a4..6cb80f37d60 100644 --- a/test/e2e/framework/pod/output/output.go +++ b/test/e2e/framework/pod/output/output.go @@ -121,6 +121,15 @@ func LookForStringInLog(ns, podName, container, expectedString string, timeout t }) } +// LookForStringInLogWithoutKubectl looks for the given string in the log of a specific pod container +func LookForStringInLogWithoutKubectl(ctx context.Context, client clientset.Interface, ns string, podName string, container string, expectedString string, timeout time.Duration) (result string, err error) { + return lookForString(expectedString, timeout, func() string { + podLogs, 
err := e2epod.GetPodLogs(ctx, client, ns, podName, container) + framework.ExpectNoError(err) + return podLogs + }) +} + // CreateEmptyFileOnPod creates empty file at given path on the pod. func CreateEmptyFileOnPod(namespace string, podName string, filePath string) error { _, err := e2ekubectl.RunKubectl(namespace, "exec", podName, "--", "/bin/sh", "-c", fmt.Sprintf("touch %s", filePath)) diff --git a/test/e2e/framework/volume/fixtures.go b/test/e2e/framework/volume/fixtures.go index 8d43a5645fc..031a187ab33 100644 --- a/test/e2e/framework/volume/fixtures.go +++ b/test/e2e/framework/volume/fixtures.go @@ -149,6 +149,10 @@ type Test struct { // NewNFSServer is a NFS-specific wrapper for CreateStorageServer. func NewNFSServer(ctx context.Context, cs clientset.Interface, namespace string, args []string) (config TestConfig, pod *v1.Pod, host string) { + return NewNFSServerWithNodeName(ctx, cs, namespace, args, "") +} + +func NewNFSServerWithNodeName(ctx context.Context, cs clientset.Interface, namespace string, args []string, nodeName string) (config TestConfig, pod *v1.Pod, host string) { config = TestConfig{ Namespace: namespace, Prefix: "nfs", @@ -157,6 +161,10 @@ func NewNFSServer(ctx context.Context, cs clientset.Interface, namespace string, ServerVolumes: map[string]string{"": "/exports"}, ServerReadyMessage: "NFS started", } + if nodeName != "" { + config.ClientNodeSelection = e2epod.NodeSelection{Name: nodeName} + } + if len(args) > 0 { config.ServerArgs = args } @@ -329,6 +337,10 @@ func startVolumeServer(ctx context.Context, client clientset.Interface, config T }, } + if config.ClientNodeSelection.Name != "" { + serverPod.Spec.NodeName = config.ClientNodeSelection.Name + } + var pod *v1.Pod serverPod, err := podClient.Create(ctx, serverPod, metav1.CreateOptions{}) // ok if the server pod already exists. 
TODO: make this controllable by callers @@ -355,7 +367,7 @@ func startVolumeServer(ctx context.Context, client clientset.Interface, config T } } if config.ServerReadyMessage != "" { - _, err := e2epodoutput.LookForStringInLog(pod.Namespace, pod.Name, serverPodName, config.ServerReadyMessage, VolumeServerPodStartupTimeout) + _, err := e2epodoutput.LookForStringInLogWithoutKubectl(ctx, client, pod.Namespace, pod.Name, serverPodName, config.ServerReadyMessage, VolumeServerPodStartupTimeout) framework.ExpectNoError(err, "Failed to find %q in pod logs: %s", config.ServerReadyMessage, err) } return pod diff --git a/test/e2e_node/mirror_pod_grace_period_test.go b/test/e2e_node/mirror_pod_grace_period_test.go index 6bc3dc2933d..ec3883b3f5c 100644 --- a/test/e2e_node/mirror_pod_grace_period_test.go +++ b/test/e2e_node/mirror_pod_grace_period_test.go @@ -24,6 +24,8 @@ import ( "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" + "github.com/onsi/gomega/gstruct" + "github.com/prometheus/common/model" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -131,6 +133,174 @@ var _ = SIGDescribe("MirrorPodWithGracePeriod", func() { framework.ExpectEqual(pod.Spec.Containers[0].Image, image) }) + ginkgo.Context("and the container runtime is temporarily down during pod termination [NodeConformance] [Serial] [Disruptive]", func() { + ginkgo.It("the mirror pod should terminate successfully", func(ctx context.Context) { + ginkgo.By("verifying the pod is described as syncing in metrics") + gomega.Eventually(ctx, getKubeletMetrics, 5*time.Second, time.Second).Should(gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{ + "kubelet_working_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{ + `kubelet_working_pods{config="desired", lifecycle="sync", static=""}`: timelessSample(0), + `kubelet_working_pods{config="desired", lifecycle="sync", static="true"}`: timelessSample(1), + `kubelet_working_pods{config="orphan", lifecycle="sync", static=""}`: timelessSample(0), + `kubelet_working_pods{config="orphan", lifecycle="sync", static="true"}`: timelessSample(0), + `kubelet_working_pods{config="runtime_only", lifecycle="sync", static="unknown"}`: timelessSample(0), + `kubelet_working_pods{config="desired", lifecycle="terminating", static=""}`: timelessSample(0), + `kubelet_working_pods{config="desired", lifecycle="terminating", static="true"}`: timelessSample(0), + `kubelet_working_pods{config="orphan", lifecycle="terminating", static=""}`: timelessSample(0), + `kubelet_working_pods{config="orphan", lifecycle="terminating", static="true"}`: timelessSample(0), + `kubelet_working_pods{config="runtime_only", lifecycle="terminating", static="unknown"}`: timelessSample(0), + `kubelet_working_pods{config="desired", lifecycle="terminated", static=""}`: timelessSample(0), + `kubelet_working_pods{config="desired", lifecycle="terminated", static="true"}`: timelessSample(0), + `kubelet_working_pods{config="orphan", lifecycle="terminated", static=""}`: timelessSample(0), + `kubelet_working_pods{config="orphan", lifecycle="terminated", static="true"}`: timelessSample(0), + `kubelet_working_pods{config="runtime_only", lifecycle="terminated", static="unknown"}`: timelessSample(0), + }), + "kubelet_mirror_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{ + `kubelet_mirror_pods`: timelessSample(1), + }), + "kubelet_active_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{ + `kubelet_active_pods{static=""}`: timelessSample(0), + 
`kubelet_active_pods{static="true"}`: timelessSample(1),
+				}),
+				"kubelet_desired_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
+					`kubelet_desired_pods{static=""}`:     timelessSample(0),
+					`kubelet_desired_pods{static="true"}`: timelessSample(1),
+				}),
+			}))
+
+			ginkgo.By("delete the static pod")
+			err := deleteStaticPod(podPath, staticPodName, ns)
+			framework.ExpectNoError(err)
+
+			// Note it is important we have a small delay here as we would like to reproduce https://issues.k8s.io/113091, which requires a failure in syncTerminatingPod()
+			// This means waiting a short period after the static pod is deleted so that syncTerminatingPod() has a chance to run before the container runtime is stopped
+			ginkgo.By("sleeping before stopping the container runtime")
+			time.Sleep(2 * time.Second)
+
+			ginkgo.By("stop the container runtime")
+			err = stopContainerRuntime()
+			framework.ExpectNoError(err, "expected no error stopping the container runtime")
+
+			ginkgo.By("waiting for the container runtime to be stopped")
+			gomega.Eventually(ctx, func(ctx context.Context) error {
+				_, _, err := getCRIClient()
+				return err
+			}, 2*time.Minute, time.Second*5).ShouldNot(gomega.Succeed())
+
+			ginkgo.By("verifying the mirror pod is running")
+			gomega.Consistently(ctx, func(ctx context.Context) error {
+				return checkMirrorPodRunning(ctx, f.ClientSet, mirrorPodName, ns)
+			}, 19*time.Second, 200*time.Millisecond).Should(gomega.BeNil())
+
+			ginkgo.By("verifying the pod is described as terminating in metrics")
+			gomega.Eventually(ctx, getKubeletMetrics, 5*time.Second, time.Second).Should(gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
+				"kubelet_working_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
+					`kubelet_working_pods{config="desired", lifecycle="sync", static=""}`:                    timelessSample(0),
+					`kubelet_working_pods{config="desired", lifecycle="sync", static="true"}`:                timelessSample(0),
+					`kubelet_working_pods{config="orphan", lifecycle="sync", static=""}`:                     timelessSample(0),
+					`kubelet_working_pods{config="orphan", lifecycle="sync", static="true"}`:                 timelessSample(0),
+					`kubelet_working_pods{config="runtime_only", lifecycle="sync", static="unknown"}`:        timelessSample(0),
+					`kubelet_working_pods{config="desired", lifecycle="terminating", static=""}`:             timelessSample(0),
+					`kubelet_working_pods{config="desired", lifecycle="terminating", static="true"}`:         timelessSample(0),
+					`kubelet_working_pods{config="orphan", lifecycle="terminating", static=""}`:              timelessSample(0),
+					`kubelet_working_pods{config="orphan", lifecycle="terminating", static="true"}`:          timelessSample(1),
+					`kubelet_working_pods{config="runtime_only", lifecycle="terminating", static="unknown"}`: timelessSample(0),
+					`kubelet_working_pods{config="desired", lifecycle="terminated", static=""}`:              timelessSample(0),
+					`kubelet_working_pods{config="desired", lifecycle="terminated", static="true"}`:          timelessSample(0),
+					`kubelet_working_pods{config="orphan", lifecycle="terminated", static=""}`:               timelessSample(0),
+					`kubelet_working_pods{config="orphan", lifecycle="terminated", static="true"}`:           timelessSample(0),
+					`kubelet_working_pods{config="runtime_only", lifecycle="terminated", static="unknown"}`:  timelessSample(0),
+				}),
+				"kubelet_mirror_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
+					`kubelet_mirror_pods`: timelessSample(1),
+				}),
+				"kubelet_active_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
+					`kubelet_active_pods{static=""}`: timelessSample(0),
+					// TODO: the pod is still running and consuming resources, it should
be considered in + // admission https://github.com/kubernetes/kubernetes/issues/104824 for static pods at + // least, which means it should be 1 + `kubelet_active_pods{static="true"}`: timelessSample(0), + }), + "kubelet_desired_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{ + `kubelet_desired_pods{static=""}`: timelessSample(0), + `kubelet_desired_pods{static="true"}`: timelessSample(0), + })})) + + ginkgo.By("start the container runtime") + err = startContainerRuntime() + framework.ExpectNoError(err, "expected no error starting the container runtime") + ginkgo.By("waiting for the container runtime to start") + gomega.Eventually(ctx, func(ctx context.Context) error { + r, _, err := getCRIClient() + if err != nil { + return fmt.Errorf("error getting CRI client: %w", err) + } + status, err := r.Status(ctx, true) + if err != nil { + return fmt.Errorf("error checking CRI status: %w", err) + } + framework.Logf("Runtime started: %#v", status) + return nil + }, 2*time.Minute, time.Second*5).Should(gomega.Succeed()) + + ginkgo.By(fmt.Sprintf("verifying that the mirror pod (%s/%s) stops running after about 30s", ns, mirrorPodName)) + // from the time the container runtime starts, it should take a maximum of: + // 20s (grace period) + 2 sync transitions * 1s + 2s between housekeeping + 3s to detect CRI up + + // 2s overhead + // which we calculate here as "about 30s", so we try a bit longer than that but verify that it is + // tightly bounded by not waiting longer (we want to catch regressions to shutdown) + time.Sleep(30 * time.Second) + gomega.Eventually(ctx, func(ctx context.Context) error { + return checkMirrorPodDisappear(ctx, f.ClientSet, mirrorPodName, ns) + }, time.Second*3, time.Second).Should(gomega.Succeed()) + + ginkgo.By("verifying the pod finishes terminating and is removed from metrics") + gomega.Eventually(ctx, getKubeletMetrics, 15*time.Second, time.Second).Should(gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{ + "kubelet_working_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{ + `kubelet_working_pods{config="desired", lifecycle="sync", static=""}`: timelessSample(0), + `kubelet_working_pods{config="desired", lifecycle="sync", static="true"}`: timelessSample(0), + `kubelet_working_pods{config="orphan", lifecycle="sync", static=""}`: timelessSample(0), + `kubelet_working_pods{config="orphan", lifecycle="sync", static="true"}`: timelessSample(0), + `kubelet_working_pods{config="runtime_only", lifecycle="sync", static="unknown"}`: timelessSample(0), + `kubelet_working_pods{config="desired", lifecycle="terminating", static=""}`: timelessSample(0), + `kubelet_working_pods{config="desired", lifecycle="terminating", static="true"}`: timelessSample(0), + `kubelet_working_pods{config="orphan", lifecycle="terminating", static=""}`: timelessSample(0), + `kubelet_working_pods{config="orphan", lifecycle="terminating", static="true"}`: timelessSample(0), + `kubelet_working_pods{config="runtime_only", lifecycle="terminating", static="unknown"}`: timelessSample(0), + `kubelet_working_pods{config="desired", lifecycle="terminated", static=""}`: timelessSample(0), + `kubelet_working_pods{config="desired", lifecycle="terminated", static="true"}`: timelessSample(0), + `kubelet_working_pods{config="orphan", lifecycle="terminated", static=""}`: timelessSample(0), + `kubelet_working_pods{config="orphan", lifecycle="terminated", static="true"}`: timelessSample(0), + `kubelet_working_pods{config="runtime_only", lifecycle="terminated", static="unknown"}`: 
timelessSample(0), + }), + "kubelet_mirror_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{ + `kubelet_mirror_pods`: timelessSample(0), + }), + "kubelet_active_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{ + `kubelet_active_pods{static=""}`: timelessSample(0), + `kubelet_active_pods{static="true"}`: timelessSample(0), + }), + "kubelet_desired_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{ + `kubelet_desired_pods{static=""}`: timelessSample(0), + `kubelet_desired_pods{static="true"}`: timelessSample(0), + }), + })) + }) + + ginkgo.AfterEach(func(ctx context.Context) { + ginkgo.By("starting the container runtime") + err := startContainerRuntime() + framework.ExpectNoError(err, "expected no error starting the container runtime") + ginkgo.By("waiting for the container runtime to start") + gomega.Eventually(ctx, func(ctx context.Context) error { + _, _, err := getCRIClient() + if err != nil { + return fmt.Errorf("error getting cri client: %v", err) + } + return nil + }, 2*time.Minute, time.Second*5).Should(gomega.Succeed()) + }) + }) + ginkgo.AfterEach(func(ctx context.Context) { ginkgo.By("delete the static pod") err := deleteStaticPod(podPath, staticPodName, ns) @@ -197,3 +367,8 @@ func checkMirrorPodRunningWithUID(ctx context.Context, cl clientset.Interface, n } return nil } + +func sampleLabelID(element interface{}) string { + el := element.(*model.Sample) + return el.Metric.String() +} diff --git a/test/e2e_node/mirror_pod_test.go b/test/e2e_node/mirror_pod_test.go index d467e8a5a75..d10f1c9a45a 100644 --- a/test/e2e_node/mirror_pod_test.go +++ b/test/e2e_node/mirror_pod_test.go @@ -18,7 +18,6 @@ package e2enode import ( "context" - goerrors "errors" "fmt" "os" "path/filepath" @@ -40,11 +39,13 @@ import ( "github.com/google/go-cmp/cmp" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" + "k8s.io/cli-runtime/pkg/printers" + e2evolume "k8s.io/kubernetes/test/e2e/framework/volume" ) var _ = SIGDescribe("MirrorPod", func() { f := framework.NewDefaultFramework("mirror-pod") - f.NamespacePodSecurityEnforceLevel = admissionapi.LevelBaseline + f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged ginkgo.Context("when create a mirror pod ", func() { var ns, podPath, staticPodName, mirrorPodName string ginkgo.BeforeEach(func(ctx context.Context) { @@ -196,8 +197,179 @@ var _ = SIGDescribe("MirrorPod", func() { }, 2*time.Minute, time.Second*4).Should(gomega.BeNil()) }) }) + ginkgo.Context("when recreating a static pod", func() { + var ns, podPath, staticPodName, mirrorPodName string + ginkgo.It("it should launch successfully even if it temporarily failed termination due to volume failing to unmount [NodeConformance] [Serial]", func(ctx context.Context) { + node := getNodeName(ctx, f) + ns = f.Namespace.Name + c := f.ClientSet + nfsTestConfig, nfsServerPod, nfsServerHost := e2evolume.NewNFSServerWithNodeName(ctx, c, ns, []string{"-G", "777", "/exports"}, node) + ginkgo.DeferCleanup(func(ctx context.Context) { + framework.Logf("Cleaning up NFS server pod") + e2evolume.TestServerCleanup(ctx, f, nfsTestConfig) + }) + + podPath = framework.TestContext.KubeletConfig.StaticPodPath + staticPodName = "static-pod-nfs-test-pod" + string(uuid.NewUUID()) + mirrorPodName = staticPodName + "-" + framework.TestContext.NodeName + + ginkgo.By(fmt.Sprintf("Creating nfs test pod: %s", staticPodName)) + + err := createStaticPodUsingNfs(nfsServerHost, node, "sleep 999999", podPath, staticPodName, ns) + framework.ExpectNoError(err) + 
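+			// once the mirror pod for the static pod is Running, capture its config hash so the
+			// pod's volume directory under /var/lib/kubelet/pods/<hash>/volumes can be checked
+			// after the pod is deleted while the NFS server is down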
ginkgo.By(fmt.Sprintf("Waiting for nfs test pod: %s to start running...", staticPodName))
+			gomega.Eventually(func() error {
+				return checkMirrorPodRunning(ctx, f.ClientSet, mirrorPodName, ns)
+			}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
+
+			mirrorPod, err := c.CoreV1().Pods(ns).Get(ctx, mirrorPodName, metav1.GetOptions{})
+			framework.ExpectNoError(err)
+
+			hash, ok := mirrorPod.Annotations[kubetypes.ConfigHashAnnotationKey]
+			if !ok || hash == "" {
+				framework.Failf("Failed to get hash for mirrorPod")
+			}
+
+			ginkgo.By("Stopping the NFS server")
+			stopNfsServer(f, nfsServerPod)
+
+			ginkgo.By("Waiting for NFS server to stop...")
+			time.Sleep(30 * time.Second)
+
+			ginkgo.By(fmt.Sprintf("Deleting the static nfs test pod: %s", staticPodName))
+			err = deleteStaticPod(podPath, staticPodName, ns)
+			framework.ExpectNoError(err)
+
+			// Wait 5 mins for syncTerminatedPod to fail. We expect that the pod volume should not be cleaned up because the NFS server is down.
+			gomega.Consistently(func() bool {
+				return podVolumeDirectoryExists(types.UID(hash))
+			}, 5*time.Minute, 10*time.Second).Should(gomega.BeTrue(), "pod volume should exist while nfs server is stopped")
+
+			ginkgo.By("Start the NFS server")
+			restartNfsServer(f, nfsServerPod)
+
+			ginkgo.By("Waiting for the pod volume to be deleted after the NFS server is started")
+			gomega.Eventually(func() bool {
+				return podVolumeDirectoryExists(types.UID(hash))
+			}, 5*time.Minute, 10*time.Second).Should(gomega.BeFalse(), "pod volume should be deleted after nfs server is started")
+
+			// Create the static pod again with the same config and expect it to start running
+			err = createStaticPodUsingNfs(nfsServerHost, node, "sleep 999999", podPath, staticPodName, ns)
+			framework.ExpectNoError(err)
+			ginkgo.By(fmt.Sprintf("Waiting for nfs test pod: %s to start running (after being recreated)", staticPodName))
+			gomega.Eventually(func() error {
+				return checkMirrorPodRunning(ctx, f.ClientSet, mirrorPodName, ns)
+			}, 5*time.Minute, 5*time.Second).Should(gomega.BeNil())
+		})
+
+		ginkgo.AfterEach(func(ctx context.Context) {
+			ginkgo.By("delete the static pod")
+			err := deleteStaticPod(podPath, staticPodName, ns)
+			framework.ExpectNoError(err)
+
+			ginkgo.By("wait for the mirror pod to disappear")
+			gomega.Eventually(ctx, func(ctx context.Context) error {
+				return checkMirrorPodDisappear(ctx, f.ClientSet, mirrorPodName, ns)
+			}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
+
+		})
+
+	})
 })
+func podVolumeDirectoryExists(uid types.UID) bool {
+	podVolumePath := fmt.Sprintf("/var/lib/kubelet/pods/%s/volumes/", uid)
+	var podVolumeDirectoryExists bool
+
+	if _, err := os.Stat(podVolumePath); !os.IsNotExist(err) {
+		podVolumeDirectoryExists = true
+	}
+
+	return podVolumeDirectoryExists
+}
+
+// Restart the passed-in nfs-server by issuing a `/usr/sbin/rpc.nfsd 1` command in the
+// pod's (only) container. This command changes the number of nfs server threads from
+// (presumably) zero back to 1, and therefore allows nfs to open connections again.
+func restartNfsServer(f *framework.Framework, serverPod *v1.Pod) {
+	const startcmd = "/usr/sbin/rpc.nfsd 1"
+	_, _, err := e2evolume.PodExec(f, serverPod, startcmd)
+	framework.ExpectNoError(err)
+}
+
+// Stop the passed-in nfs-server by issuing a `/usr/sbin/rpc.nfsd 0` command in the
+// pod's (only) container. This command changes the number of nfs server threads to 0,
+// thus closing all open nfs connections.
+func stopNfsServer(f *framework.Framework, serverPod *v1.Pod) { + const stopcmd = "/usr/sbin/rpc.nfsd 0" + _, _, err := e2evolume.PodExec(f, serverPod, stopcmd) + framework.ExpectNoError(err) +} + +func createStaticPodUsingNfs(nfsIP string, nodeName string, cmd string, dir string, name string, ns string) error { + ginkgo.By("create pod using nfs volume") + + isPrivileged := true + cmdLine := []string{"-c", cmd} + pod := &v1.Pod{ + TypeMeta: metav1.TypeMeta{ + Kind: "Pod", + APIVersion: "v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: ns, + }, + Spec: v1.PodSpec{ + NodeName: nodeName, + Containers: []v1.Container{ + { + Name: "pod-nfs-vol", + Image: imageutils.GetE2EImage(imageutils.BusyBox), + Command: []string{"/bin/sh"}, + Args: cmdLine, + VolumeMounts: []v1.VolumeMount{ + { + Name: "nfs-vol", + MountPath: "/mnt", + }, + }, + SecurityContext: &v1.SecurityContext{ + Privileged: &isPrivileged, + }, + }, + }, + RestartPolicy: v1.RestartPolicyNever, //don't restart pod + Volumes: []v1.Volume{ + { + Name: "nfs-vol", + VolumeSource: v1.VolumeSource{ + NFS: &v1.NFSVolumeSource{ + Server: nfsIP, + Path: "/exports", + ReadOnly: false, + }, + }, + }, + }, + }, + } + + file := staticPodPath(dir, name, ns) + f, err := os.OpenFile(file, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0666) + if err != nil { + return err + } + defer f.Close() + + y := printers.YAMLPrinter{} + y.PrintObj(pod, f) + + return nil +} + func staticPodPath(dir, name, namespace string) string { return filepath.Join(dir, namespace+"-"+name+".yaml") } @@ -238,7 +410,10 @@ func checkMirrorPodDisappear(ctx context.Context, cl clientset.Interface, name, if apierrors.IsNotFound(err) { return nil } - return goerrors.New("pod not disappear") + if err == nil { + return fmt.Errorf("mirror pod %v/%v still exists", namespace, name) + } + return fmt.Errorf("expect mirror pod %v/%v to not exist but got error: %w", namespace, name, err) } func checkMirrorPodRunning(ctx context.Context, cl clientset.Interface, name, namespace string) error { diff --git a/test/e2e_node/remote/node_e2e.go b/test/e2e_node/remote/node_e2e.go index cf5efaee111..eeff0dd728a 100644 --- a/test/e2e_node/remote/node_e2e.go +++ b/test/e2e_node/remote/node_e2e.go @@ -87,15 +87,6 @@ func (n *NodeE2ERemote) SetupTestPackage(tardir, systemSpecName string) error { return nil } -// prependCOSMounterFlag prepends the flag for setting the GCI mounter path to -// args and returns the result. -func prependCOSMounterFlag(args, host, workspace string) (string, error) { - klog.V(2).Infof("GCI/COS node and GCI/COS mounter both detected, modifying --experimental-mounter-path accordingly") - mounterPath := filepath.Join(workspace, "mounter") - args = fmt.Sprintf("--kubelet-flags=--experimental-mounter-path=%s ", mounterPath) + args - return args, nil -} - // prependMemcgNotificationFlag prepends the flag for enabling memcg // notification to args and returns the result. 
func prependMemcgNotificationFlag(args string) string { @@ -124,8 +115,7 @@ func osSpecificActions(args, host, workspace string) (string, error) { return args, setKubeletSELinuxLabels(host, workspace) case strings.Contains(output, "gci"), strings.Contains(output, "cos"): args = prependMemcgNotificationFlag(args) - args = prependGCPCredentialProviderFlag(args, workspace) - return prependCOSMounterFlag(args, host, workspace) + return prependGCPCredentialProviderFlag(args, workspace), nil case strings.Contains(output, "ubuntu"): args = prependGCPCredentialProviderFlag(args, workspace) return prependMemcgNotificationFlag(args), nil diff --git a/test/e2e_node/resource_collector.go b/test/e2e_node/resource_collector.go index 60178b9d764..1d4602af6db 100644 --- a/test/e2e_node/resource_collector.go +++ b/test/e2e_node/resource_collector.go @@ -23,11 +23,8 @@ import ( "bytes" "context" "fmt" - "io" "log" - "os" "sort" - "strconv" "strings" "sync" "text/tabwriter" @@ -39,11 +36,9 @@ import ( v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/apimachinery/pkg/util/wait" kubeletstatsv1alpha1 "k8s.io/kubelet/pkg/apis/stats/v1alpha1" - "k8s.io/kubernetes/pkg/util/procfs" "k8s.io/kubernetes/test/e2e/framework" e2ekubelet "k8s.io/kubernetes/test/e2e/framework/kubelet" e2epod "k8s.io/kubernetes/test/e2e/framework/pod" @@ -465,38 +460,6 @@ func (r *ResourceCollector) GetResourceTimeSeries() map[string]*perftype.Resourc const kubeletProcessName = "kubelet" -func getPidsForProcess(name, pidFile string) ([]int, error) { - if len(pidFile) > 0 { - pid, err := getPidFromPidFile(pidFile) - if err == nil { - return []int{pid}, nil - } - // log the error and fall back to pidof - runtime.HandleError(err) - } - return procfs.PidOf(name) -} - -func getPidFromPidFile(pidFile string) (int, error) { - file, err := os.Open(pidFile) - if err != nil { - return 0, fmt.Errorf("error opening pid file %s: %w", pidFile, err) - } - defer file.Close() - - data, err := io.ReadAll(file) - if err != nil { - return 0, fmt.Errorf("error reading pid file %s: %w", pidFile, err) - } - - pid, err := strconv.Atoi(string(data)) - if err != nil { - return 0, fmt.Errorf("error parsing %s as a number: %w", string(data), err) - } - - return pid, nil -} - func getContainerNameForProcess(name, pidFile string) (string, error) { pids, err := getPidsForProcess(name, pidFile) if err != nil { diff --git a/test/e2e_node/util.go b/test/e2e_node/util.go index 9ca1d505acd..82068aead32 100644 --- a/test/e2e_node/util.go +++ b/test/e2e_node/util.go @@ -25,17 +25,21 @@ import ( "io" "net" "net/http" + "os" "os/exec" "regexp" "strconv" "strings" "time" + "k8s.io/kubernetes/pkg/util/procfs" + oteltrace "go.opentelemetry.io/otel/trace" v1 "k8s.io/api/core/v1" apiequality "k8s.io/apimachinery/pkg/api/equality" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/sets" utilfeature "k8s.io/apiserver/pkg/util/feature" clientset "k8s.io/client-go/kubernetes" @@ -55,6 +59,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/kubelet/util" + "github.com/coreos/go-systemd/v22/dbus" "k8s.io/kubernetes/test/e2e/framework" e2ekubelet "k8s.io/kubernetes/test/e2e/framework/kubelet" e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics" @@ -84,12 +89,14 @@ const ( var kubeletHealthCheckURL = fmt.Sprintf("http://127.0.0.1:%d/healthz", 
ports.KubeletHealthzPort) +var containerRuntimeUnitName = "" + func getNodeSummary(ctx context.Context) (*stats.Summary, error) { kubeletConfig, err := getCurrentKubeletConfig(ctx) if err != nil { return nil, fmt.Errorf("failed to get current kubelet config") } - req, err := http.NewRequest("GET", fmt.Sprintf("http://%s/stats/summary", net.JoinHostPort(kubeletConfig.Address, strconv.Itoa(int(kubeletConfig.ReadOnlyPort)))), nil) + req, err := http.NewRequestWithContext(ctx, "GET", fmt.Sprintf("http://%s/stats/summary", net.JoinHostPort(kubeletConfig.Address, strconv.Itoa(int(kubeletConfig.ReadOnlyPort)))), nil) if err != nil { return nil, fmt.Errorf("failed to build http request: %w", err) } @@ -340,6 +347,71 @@ func findKubeletServiceName(running bool) string { return kubeletServiceName } +func findContainerRuntimeServiceName() (string, error) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + conn, err := dbus.NewWithContext(ctx) + framework.ExpectNoError(err, "Failed to setup dbus connection") + defer conn.Close() + + runtimePids, err := getPidsForProcess(framework.TestContext.ContainerRuntimeProcessName, framework.TestContext.ContainerRuntimePidFile) + framework.ExpectNoError(err, "failed to get list of container runtime pids") + framework.ExpectEqual(len(runtimePids), 1, "Unexpected number of container runtime pids. Expected 1 but got %v", len(runtimePids)) + + containerRuntimePid := runtimePids[0] + + unitName, err := conn.GetUnitNameByPID(ctx, uint32(containerRuntimePid)) + framework.ExpectNoError(err, "Failed to get container runtime unit name") + + return unitName, nil +} + +type containerRuntimeUnitOp int + +const ( + startContainerRuntimeUnitOp containerRuntimeUnitOp = iota + stopContainerRuntimeUnitOp +) + +func performContainerRuntimeUnitOp(op containerRuntimeUnitOp) error { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + conn, err := dbus.NewWithContext(ctx) + framework.ExpectNoError(err, "Failed to setup dbus connection") + defer conn.Close() + + if containerRuntimeUnitName == "" { + containerRuntimeUnitName, err = findContainerRuntimeServiceName() + framework.ExpectNoError(err, "Failed to find container runtime name") + } + + reschan := make(chan string) + + switch op { + case startContainerRuntimeUnitOp: + conn.StartUnitContext(ctx, containerRuntimeUnitName, "replace", reschan) + case stopContainerRuntimeUnitOp: + conn.StopUnitContext(ctx, containerRuntimeUnitName, "replace", reschan) + default: + framework.Failf("Unexpected container runtime op: %v", op) + } + + job := <-reschan + framework.ExpectEqual(job, "done", "Expected job to complete with done") + + return nil +} + +func stopContainerRuntime() error { + return performContainerRuntimeUnitOp(stopContainerRuntimeUnitOp) +} + +func startContainerRuntime() error { + return performContainerRuntimeUnitOp(startContainerRuntimeUnitOp) +} + // restartKubelet restarts the current kubelet service. // the "current" kubelet service is the instance managed by the current e2e_node test run. // If `running` is true, restarts only if the current kubelet is actually running. 
In some cases, @@ -465,3 +537,35 @@ func waitForAllContainerRemoval(ctx context.Context, podName, podNS string) { return nil }, 2*time.Minute, 1*time.Second).Should(gomega.Succeed()) } + +func getPidsForProcess(name, pidFile string) ([]int, error) { + if len(pidFile) > 0 { + pid, err := getPidFromPidFile(pidFile) + if err == nil { + return []int{pid}, nil + } + // log the error and fall back to pidof + runtime.HandleError(err) + } + return procfs.PidOf(name) +} + +func getPidFromPidFile(pidFile string) (int, error) { + file, err := os.Open(pidFile) + if err != nil { + return 0, fmt.Errorf("error opening pid file %s: %v", pidFile, err) + } + defer file.Close() + + data, err := io.ReadAll(file) + if err != nil { + return 0, fmt.Errorf("error reading pid file %s: %v", pidFile, err) + } + + pid, err := strconv.Atoi(string(data)) + if err != nil { + return 0, fmt.Errorf("error parsing %s as a number: %v", string(data), err) + } + + return pid, nil +}
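A note on the metrics assertions added in test/e2e_node/mirror_pod_grace_period_test.go: the gstruct.Elements keys are written as full metric strings (for example `kubelet_active_pods{static="true"}`) because the new sampleLabelID helper identifies each *model.Sample by Metric.String(), which renders the metric name followed by its labels sorted alphabetically. The following standalone Go sketch is illustrative only and not part of the patch; the package name and the sample constructed here are assumptions used to demonstrate that rendering.

package main

import (
	"fmt"

	"github.com/prometheus/common/model"
)

func main() {
	// Build a sample the way a parsed /metrics response would, with the metric
	// name stored under the reserved __name__ label.
	s := &model.Sample{
		Metric: model.Metric{
			model.MetricNameLabel: "kubelet_active_pods",
			"static":              "true",
		},
		Value: 1,
	}

	// Metric.String() renders name{label="value", ...} with labels sorted, which is
	// why the gstruct.Elements keys in the test are written in exactly that form.
	fmt.Println(s.Metric.String()) // kubelet_active_pods{static="true"}
}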