Merge pull request #113145 from smarterclayton/zombie_terminating_pods

kubelet: Force deleted pods can fail to move out of terminating
This commit is contained in:
Kubernetes Prow Robot
2023-03-09 15:32:30 -08:00
committed by GitHub
18 changed files with 3316 additions and 722 deletions

View File

@@ -24,6 +24,8 @@ import (
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
"github.com/onsi/gomega/gstruct"
"github.com/prometheus/common/model"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
@@ -131,6 +133,174 @@ var _ = SIGDescribe("MirrorPodWithGracePeriod", func() {
framework.ExpectEqual(pod.Spec.Containers[0].Image, image)
})
ginkgo.Context("and the container runtime is temporarily down during pod termination [NodeConformance] [Serial] [Disruptive]", func() {
ginkgo.It("the mirror pod should terminate successfully", func(ctx context.Context) {
ginkgo.By("verifying the pod is described as syncing in metrics")
gomega.Eventually(ctx, getKubeletMetrics, 5*time.Second, time.Second).Should(gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
"kubelet_working_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
`kubelet_working_pods{config="desired", lifecycle="sync", static=""}`: timelessSample(0),
`kubelet_working_pods{config="desired", lifecycle="sync", static="true"}`: timelessSample(1),
`kubelet_working_pods{config="orphan", lifecycle="sync", static=""}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="sync", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="runtime_only", lifecycle="sync", static="unknown"}`: timelessSample(0),
`kubelet_working_pods{config="desired", lifecycle="terminating", static=""}`: timelessSample(0),
`kubelet_working_pods{config="desired", lifecycle="terminating", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="terminating", static=""}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="terminating", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="runtime_only", lifecycle="terminating", static="unknown"}`: timelessSample(0),
`kubelet_working_pods{config="desired", lifecycle="terminated", static=""}`: timelessSample(0),
`kubelet_working_pods{config="desired", lifecycle="terminated", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="terminated", static=""}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="terminated", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="runtime_only", lifecycle="terminated", static="unknown"}`: timelessSample(0),
}),
"kubelet_mirror_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
`kubelet_mirror_pods`: timelessSample(1),
}),
"kubelet_active_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
`kubelet_active_pods{static=""}`: timelessSample(0),
`kubelet_active_pods{static="true"}`: timelessSample(1),
}),
"kubelet_desired_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
`kubelet_desired_pods{static=""}`: timelessSample(0),
`kubelet_desired_pods{static="true"}`: timelessSample(1),
}),
}))
ginkgo.By("delete the static pod")
err := deleteStaticPod(podPath, staticPodName, ns)
framework.ExpectNoError(err)
// Note: the small delay here is important because we want to reproduce https://issues.k8s.io/113091, which requires a failure in syncTerminatingPod().
// Waiting a short period after the static pod is deleted gives syncTerminatingPod() a chance to attempt to run before the container runtime is stopped.
ginkgo.By("sleeping before stopping the container runtime")
time.Sleep(2 * time.Second)
ginkgo.By("stop the container runtime")
err = stopContainerRuntime()
framework.ExpectNoError(err, "expected no error stopping the container runtime")
ginkgo.By("waiting for the container runtime to be stopped")
gomega.Eventually(ctx, func(ctx context.Context) error {
_, _, err := getCRIClient()
return err
}, 2*time.Minute, time.Second*5).ShouldNot(gomega.Succeed())
ginkgo.By("verifying the mirror pod is running")
gomega.Consistently(ctx, func(ctx context.Context) error {
return checkMirrorPodRunning(ctx, f.ClientSet, mirrorPodName, ns)
}, 19*time.Second, 200*time.Millisecond).Should(gomega.BeNil())
ginkgo.By("verifying the pod is described as terminating in metrics")
gomega.Eventually(ctx, getKubeletMetrics, 5*time.Second, time.Second).Should(gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
"kubelet_working_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
`kubelet_working_pods{config="desired", lifecycle="sync", static=""}`: timelessSample(0),
`kubelet_working_pods{config="desired", lifecycle="sync", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="sync", static=""}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="sync", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="runtime_only", lifecycle="sync", static="unknown"}`: timelessSample(0),
`kubelet_working_pods{config="desired", lifecycle="terminating", static=""}`: timelessSample(0),
`kubelet_working_pods{config="desired", lifecycle="terminating", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="terminating", static=""}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="terminating", static="true"}`: timelessSample(1),
`kubelet_working_pods{config="runtime_only", lifecycle="terminating", static="unknown"}`: timelessSample(0),
`kubelet_working_pods{config="desired", lifecycle="terminated", static=""}`: timelessSample(0),
`kubelet_working_pods{config="desired", lifecycle="terminated", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="terminated", static=""}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="terminated", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="runtime_only", lifecycle="terminated", static="unknown"}`: timelessSample(0),
}),
"kubelet_mirror_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
`kubelet_mirror_pods`: timelessSample(1),
}),
"kubelet_active_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
`kubelet_active_pods{static=""}`: timelessSample(0),
// TODO: the pod is still running and consuming resources, so it should be considered in
// admission (https://github.com/kubernetes/kubernetes/issues/104824), at least for static
// pods, which means this value should be 1
`kubelet_active_pods{static="true"}`: timelessSample(0),
}),
"kubelet_desired_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
`kubelet_desired_pods{static=""}`: timelessSample(0),
`kubelet_desired_pods{static="true"}`: timelessSample(0),
})}))
ginkgo.By("start the container runtime")
err = startContainerRuntime()
framework.ExpectNoError(err, "expected no error starting the container runtime")
ginkgo.By("waiting for the container runtime to start")
gomega.Eventually(ctx, func(ctx context.Context) error {
r, _, err := getCRIClient()
if err != nil {
return fmt.Errorf("error getting CRI client: %w", err)
}
status, err := r.Status(ctx, true)
if err != nil {
return fmt.Errorf("error checking CRI status: %w", err)
}
framework.Logf("Runtime started: %#v", status)
return nil
}, 2*time.Minute, time.Second*5).Should(gomega.Succeed())
ginkgo.By(fmt.Sprintf("verifying that the mirror pod (%s/%s) stops running after about 30s", ns, mirrorPodName))
// From the time the container runtime starts, termination should take at most:
// 20s (grace period) + 2 sync transitions * 1s + 2s between housekeeping + 3s to detect CRI up +
// 2s overhead
// which we call "about 30s". We wait slightly longer than that, but verify the bound is tight
// by not waiting any longer (we want to catch regressions in shutdown).
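// (Worked out: 20 + 2*1 + 2 + 3 + 2 = 29s, hence the ~30s budget below.)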
time.Sleep(30 * time.Second)
gomega.Eventually(ctx, func(ctx context.Context) error {
return checkMirrorPodDisappear(ctx, f.ClientSet, mirrorPodName, ns)
}, time.Second*3, time.Second).Should(gomega.Succeed())
ginkgo.By("verifying the pod finishes terminating and is removed from metrics")
gomega.Eventually(ctx, getKubeletMetrics, 15*time.Second, time.Second).Should(gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
"kubelet_working_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
`kubelet_working_pods{config="desired", lifecycle="sync", static=""}`: timelessSample(0),
`kubelet_working_pods{config="desired", lifecycle="sync", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="sync", static=""}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="sync", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="runtime_only", lifecycle="sync", static="unknown"}`: timelessSample(0),
`kubelet_working_pods{config="desired", lifecycle="terminating", static=""}`: timelessSample(0),
`kubelet_working_pods{config="desired", lifecycle="terminating", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="terminating", static=""}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="terminating", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="runtime_only", lifecycle="terminating", static="unknown"}`: timelessSample(0),
`kubelet_working_pods{config="desired", lifecycle="terminated", static=""}`: timelessSample(0),
`kubelet_working_pods{config="desired", lifecycle="terminated", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="terminated", static=""}`: timelessSample(0),
`kubelet_working_pods{config="orphan", lifecycle="terminated", static="true"}`: timelessSample(0),
`kubelet_working_pods{config="runtime_only", lifecycle="terminated", static="unknown"}`: timelessSample(0),
}),
"kubelet_mirror_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
`kubelet_mirror_pods`: timelessSample(0),
}),
"kubelet_active_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
`kubelet_active_pods{static=""}`: timelessSample(0),
`kubelet_active_pods{static="true"}`: timelessSample(0),
}),
"kubelet_desired_pods": gstruct.MatchElements(sampleLabelID, 0, gstruct.Elements{
`kubelet_desired_pods{static=""}`: timelessSample(0),
`kubelet_desired_pods{static="true"}`: timelessSample(0),
}),
}))
})
ginkgo.AfterEach(func(ctx context.Context) {
ginkgo.By("starting the container runtime")
err := startContainerRuntime()
framework.ExpectNoError(err, "expected no error starting the container runtime")
ginkgo.By("waiting for the container runtime to start")
gomega.Eventually(ctx, func(ctx context.Context) error {
_, _, err := getCRIClient()
if err != nil {
return fmt.Errorf("error getting cri client: %v", err)
}
return nil
}, 2*time.Minute, time.Second*5).Should(gomega.Succeed())
})
})
ginkgo.AfterEach(func(ctx context.Context) {
ginkgo.By("delete the static pod")
err := deleteStaticPod(podPath, staticPodName, ns)
@@ -197,3 +367,8 @@ func checkMirrorPodRunningWithUID(ctx context.Context, cl clientset.Interface, n
}
return nil
}
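// sampleLabelID returns the sample's full metric string (name plus labels), which is used as the element identifier for gstruct.MatchElements above.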
func sampleLabelID(element interface{}) string {
el := element.(*model.Sample)
return el.Metric.String()
}

View File

@@ -18,7 +18,6 @@ package e2enode
import (
"context"
goerrors "errors"
"fmt"
"os"
"path/filepath"
@@ -40,11 +39,13 @@ import (
"github.com/google/go-cmp/cmp"
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
"k8s.io/cli-runtime/pkg/printers"
e2evolume "k8s.io/kubernetes/test/e2e/framework/volume"
)
var _ = SIGDescribe("MirrorPod", func() {
f := framework.NewDefaultFramework("mirror-pod")
f.NamespacePodSecurityEnforceLevel = admissionapi.LevelBaseline
f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged
ginkgo.Context("when create a mirror pod ", func() {
var ns, podPath, staticPodName, mirrorPodName string
ginkgo.BeforeEach(func(ctx context.Context) {
@@ -196,8 +197,179 @@ var _ = SIGDescribe("MirrorPod", func() {
}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
})
})
ginkgo.Context("when recreating a static pod", func() {
var ns, podPath, staticPodName, mirrorPodName string
ginkgo.It("it should launch successfully even if it temporarily failed termination due to volume failing to unmount [NodeConformance] [Serial]", func(ctx context.Context) {
node := getNodeName(ctx, f)
ns = f.Namespace.Name
c := f.ClientSet
nfsTestConfig, nfsServerPod, nfsServerHost := e2evolume.NewNFSServerWithNodeName(ctx, c, ns, []string{"-G", "777", "/exports"}, node)
ginkgo.DeferCleanup(func(ctx context.Context) {
framework.Logf("Cleaning up NFS server pod")
e2evolume.TestServerCleanup(ctx, f, nfsTestConfig)
})
podPath = framework.TestContext.KubeletConfig.StaticPodPath
staticPodName = "static-pod-nfs-test-pod" + string(uuid.NewUUID())
mirrorPodName = staticPodName + "-" + framework.TestContext.NodeName
ginkgo.By(fmt.Sprintf("Creating nfs test pod: %s", staticPodName))
err := createStaticPodUsingNfs(nfsServerHost, node, "sleep 999999", podPath, staticPodName, ns)
framework.ExpectNoError(err)
ginkgo.By(fmt.Sprintf("Wating for nfs test pod: %s to start running...", staticPodName))
gomega.Eventually(func() error {
return checkMirrorPodRunning(ctx, f.ClientSet, mirrorPodName, ns)
}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
mirrorPod, err := c.CoreV1().Pods(ns).Get(ctx, mirrorPodName, metav1.GetOptions{})
framework.ExpectNoError(err)
hash, ok := mirrorPod.Annotations[kubetypes.ConfigHashAnnotationKey]
if !ok || hash == "" {
framework.Failf("Failed to get hash for mirrorPod")
}
ginkgo.By("Stopping the NFS server")
stopNfsServer(f, nfsServerPod)
ginkgo.By("Waiting for NFS server to stop...")
time.Sleep(30 * time.Second)
ginkgo.By(fmt.Sprintf("Deleting the static nfs test pod: %s", staticPodName))
err = deleteStaticPod(podPath, staticPodName, ns)
framework.ExpectNoError(err)
// Wait 5 minutes for syncTerminatedPod to fail. We expect the pod volume not to be cleaned up because the NFS server is down.
gomega.Consistently(func() bool {
return podVolumeDirectoryExists(types.UID(hash))
}, 5*time.Minute, 10*time.Second).Should(gomega.BeTrue(), "pod volume should exist while nfs server is stopped")
ginkgo.By("Start the NFS server")
restartNfsServer(f, nfsServerPod)
ginkgo.By("Waiting for the pod volume to deleted after the NFS server is started")
gomega.Eventually(func() bool {
return podVolumeDirectoryExists(types.UID(hash))
}, 5*time.Minute, 10*time.Second).Should(gomega.BeFalse(), "pod volume should be deleted after nfs server is started")
// Create the static pod again with the same config and expect it to start running
err = createStaticPodUsingNfs(nfsServerHost, node, "sleep 999999", podPath, staticPodName, ns)
framework.ExpectNoError(err)
ginkgo.By(fmt.Sprintf("Wating for nfs test pod: %s to start running (after being recreated)", staticPodName))
gomega.Eventually(func() error {
return checkMirrorPodRunning(ctx, f.ClientSet, mirrorPodName, ns)
}, 5*time.Minute, 5*time.Second).Should(gomega.BeNil())
})
ginkgo.AfterEach(func(ctx context.Context) {
ginkgo.By("delete the static pod")
err := deleteStaticPod(podPath, staticPodName, ns)
framework.ExpectNoError(err)
ginkgo.By("wait for the mirror pod to disappear")
gomega.Eventually(ctx, func(ctx context.Context) error {
return checkMirrorPodDisappear(ctx, f.ClientSet, mirrorPodName, ns)
}, 2*time.Minute, time.Second*4).Should(gomega.BeNil())
})
})
})
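// podVolumeDirectoryExists reports whether the kubelet's pod volume directory for the given UID still exists on disk.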
func podVolumeDirectoryExists(uid types.UID) bool {
podVolumePath := fmt.Sprintf("/var/lib/kubelet/pods/%s/volumes/", uid)
var podVolumeDirectoryExists bool
if _, err := os.Stat(podVolumePath); !os.IsNotExist(err) {
podVolumeDirectoryExists = true
}
return podVolumeDirectoryExists
}
// Restart the passed-in nfs-server by issuing a `/usr/sbin/rpc.nfsd 1` command in the
// pod's (only) container. This command changes the number of nfs server threads from
// (presumably) zero back to 1, and therefore allows nfs to open connections again.
func restartNfsServer(f *framework.Framework, serverPod *v1.Pod) {
const startcmd = "/usr/sbin/rpc.nfsd 1"
_, _, err := e2evolume.PodExec(f, serverPod, startcmd)
framework.ExpectNoError(err)
}
// Stop the passed-in nfs-server by issuing a `/usr/sbin/rpc.nfsd 0` command in the
// pod's (only) container. This command changes the number of nfs server threads to 0,
// thus closing all open nfs connections.
func stopNfsServer(f *framework.Framework, serverPod *v1.Pod) {
const stopcmd = "/usr/sbin/rpc.nfsd 0"
_, _, err := e2evolume.PodExec(f, serverPod, stopcmd)
framework.ExpectNoError(err)
}
func createStaticPodUsingNfs(nfsIP string, nodeName string, cmd string, dir string, name string, ns string) error {
ginkgo.By("create pod using nfs volume")
isPrivileged := true
cmdLine := []string{"-c", cmd}
pod := &v1.Pod{
TypeMeta: metav1.TypeMeta{
Kind: "Pod",
APIVersion: "v1",
},
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: ns,
},
Spec: v1.PodSpec{
NodeName: nodeName,
Containers: []v1.Container{
{
Name: "pod-nfs-vol",
Image: imageutils.GetE2EImage(imageutils.BusyBox),
Command: []string{"/bin/sh"},
Args: cmdLine,
VolumeMounts: []v1.VolumeMount{
{
Name: "nfs-vol",
MountPath: "/mnt",
},
},
SecurityContext: &v1.SecurityContext{
Privileged: &isPrivileged,
},
},
},
RestartPolicy: v1.RestartPolicyNever, // don't restart the pod
Volumes: []v1.Volume{
{
Name: "nfs-vol",
VolumeSource: v1.VolumeSource{
NFS: &v1.NFSVolumeSource{
Server: nfsIP,
Path: "/exports",
ReadOnly: false,
},
},
},
},
},
}
file := staticPodPath(dir, name, ns)
f, err := os.OpenFile(file, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0666)
if err != nil {
return err
}
defer f.Close()
y := printers.YAMLPrinter{}
return y.PrintObj(pod, f)
}
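// staticPodPath returns the manifest file path for a static pod, e.g. <dir>/<namespace>-<name>.yaml.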
func staticPodPath(dir, name, namespace string) string {
return filepath.Join(dir, namespace+"-"+name+".yaml")
}
@@ -238,7 +410,10 @@ func checkMirrorPodDisappear(ctx context.Context, cl clientset.Interface, name,
if apierrors.IsNotFound(err) {
return nil
}
return goerrors.New("pod not disappear")
if err == nil {
return fmt.Errorf("mirror pod %v/%v still exists", namespace, name)
}
return fmt.Errorf("expect mirror pod %v/%v to not exist but got error: %w", namespace, name, err)
}
func checkMirrorPodRunning(ctx context.Context, cl clientset.Interface, name, namespace string) error {

View File

@@ -87,15 +87,6 @@ func (n *NodeE2ERemote) SetupTestPackage(tardir, systemSpecName string) error {
return nil
}
// prependCOSMounterFlag prepends the flag for setting the GCI mounter path to
// args and returns the result.
func prependCOSMounterFlag(args, host, workspace string) (string, error) {
klog.V(2).Infof("GCI/COS node and GCI/COS mounter both detected, modifying --experimental-mounter-path accordingly")
mounterPath := filepath.Join(workspace, "mounter")
args = fmt.Sprintf("--kubelet-flags=--experimental-mounter-path=%s ", mounterPath) + args
return args, nil
}
// prependMemcgNotificationFlag prepends the flag for enabling memcg
// notification to args and returns the result.
func prependMemcgNotificationFlag(args string) string {
@@ -124,8 +115,7 @@ func osSpecificActions(args, host, workspace string) (string, error) {
return args, setKubeletSELinuxLabels(host, workspace)
case strings.Contains(output, "gci"), strings.Contains(output, "cos"):
args = prependMemcgNotificationFlag(args)
args = prependGCPCredentialProviderFlag(args, workspace)
return prependCOSMounterFlag(args, host, workspace)
return prependGCPCredentialProviderFlag(args, workspace), nil
case strings.Contains(output, "ubuntu"):
args = prependGCPCredentialProviderFlag(args, workspace)
return prependMemcgNotificationFlag(args), nil

View File

@@ -23,11 +23,8 @@ import (
"bytes"
"context"
"fmt"
"io"
"log"
"os"
"sort"
"strconv"
"strings"
"sync"
"text/tabwriter"
@@ -39,11 +36,9 @@ import (
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/apimachinery/pkg/util/wait"
kubeletstatsv1alpha1 "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
"k8s.io/kubernetes/pkg/util/procfs"
"k8s.io/kubernetes/test/e2e/framework"
e2ekubelet "k8s.io/kubernetes/test/e2e/framework/kubelet"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -465,38 +460,6 @@ func (r *ResourceCollector) GetResourceTimeSeries() map[string]*perftype.Resourc
const kubeletProcessName = "kubelet"
func getPidsForProcess(name, pidFile string) ([]int, error) {
if len(pidFile) > 0 {
pid, err := getPidFromPidFile(pidFile)
if err == nil {
return []int{pid}, nil
}
// log the error and fall back to pidof
runtime.HandleError(err)
}
return procfs.PidOf(name)
}
func getPidFromPidFile(pidFile string) (int, error) {
file, err := os.Open(pidFile)
if err != nil {
return 0, fmt.Errorf("error opening pid file %s: %w", pidFile, err)
}
defer file.Close()
data, err := io.ReadAll(file)
if err != nil {
return 0, fmt.Errorf("error reading pid file %s: %w", pidFile, err)
}
pid, err := strconv.Atoi(string(data))
if err != nil {
return 0, fmt.Errorf("error parsing %s as a number: %w", string(data), err)
}
return pid, nil
}
func getContainerNameForProcess(name, pidFile string) (string, error) {
pids, err := getPidsForProcess(name, pidFile)
if err != nil {

View File

@@ -25,17 +25,21 @@ import (
"io"
"net"
"net/http"
"os"
"os/exec"
"regexp"
"strconv"
"strings"
"time"
"k8s.io/kubernetes/pkg/util/procfs"
oteltrace "go.opentelemetry.io/otel/trace"
v1 "k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
clientset "k8s.io/client-go/kubernetes"
@@ -55,6 +59,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/kubelet/util"
"github.com/coreos/go-systemd/v22/dbus"
"k8s.io/kubernetes/test/e2e/framework"
e2ekubelet "k8s.io/kubernetes/test/e2e/framework/kubelet"
e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
@@ -84,12 +89,14 @@ const (
var kubeletHealthCheckURL = fmt.Sprintf("http://127.0.0.1:%d/healthz", ports.KubeletHealthzPort)
var containerRuntimeUnitName = ""
func getNodeSummary(ctx context.Context) (*stats.Summary, error) {
kubeletConfig, err := getCurrentKubeletConfig(ctx)
if err != nil {
return nil, fmt.Errorf("failed to get current kubelet config")
}
req, err := http.NewRequest("GET", fmt.Sprintf("http://%s/stats/summary", net.JoinHostPort(kubeletConfig.Address, strconv.Itoa(int(kubeletConfig.ReadOnlyPort)))), nil)
req, err := http.NewRequestWithContext(ctx, "GET", fmt.Sprintf("http://%s/stats/summary", net.JoinHostPort(kubeletConfig.Address, strconv.Itoa(int(kubeletConfig.ReadOnlyPort)))), nil)
if err != nil {
return nil, fmt.Errorf("failed to build http request: %w", err)
}
@@ -340,6 +347,71 @@ func findKubeletServiceName(running bool) string {
return kubeletServiceName
}
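// findContainerRuntimeServiceName resolves the systemd unit name that manages the container runtime by looking up the runtime's pid over dbus.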
func findContainerRuntimeServiceName() (string, error) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
conn, err := dbus.NewWithContext(ctx)
framework.ExpectNoError(err, "Failed to setup dbus connection")
defer conn.Close()
runtimePids, err := getPidsForProcess(framework.TestContext.ContainerRuntimeProcessName, framework.TestContext.ContainerRuntimePidFile)
framework.ExpectNoError(err, "failed to get list of container runtime pids")
framework.ExpectEqual(len(runtimePids), 1, "Unexpected number of container runtime pids. Expected 1 but got %v", len(runtimePids))
containerRuntimePid := runtimePids[0]
unitName, err := conn.GetUnitNameByPID(ctx, uint32(containerRuntimePid))
framework.ExpectNoError(err, "Failed to get container runtime unit name")
return unitName, nil
}
type containerRuntimeUnitOp int
const (
startContainerRuntimeUnitOp containerRuntimeUnitOp = iota
stopContainerRuntimeUnitOp
)
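// performContainerRuntimeUnitOp starts or stops the container runtime's systemd unit over dbus and waits for the resulting job to report "done".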
func performContainerRuntimeUnitOp(op containerRuntimeUnitOp) error {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
conn, err := dbus.NewWithContext(ctx)
framework.ExpectNoError(err, "Failed to setup dbus connection")
defer conn.Close()
if containerRuntimeUnitName == "" {
containerRuntimeUnitName, err = findContainerRuntimeServiceName()
framework.ExpectNoError(err, "Failed to find container runtime name")
}
reschan := make(chan string)
switch op {
case startContainerRuntimeUnitOp:
conn.StartUnitContext(ctx, containerRuntimeUnitName, "replace", reschan)
case stopContainerRuntimeUnitOp:
conn.StopUnitContext(ctx, containerRuntimeUnitName, "replace", reschan)
default:
framework.Failf("Unexpected container runtime op: %v", op)
}
job := <-reschan
framework.ExpectEqual(job, "done", "Expected job to complete with done")
return nil
}
func stopContainerRuntime() error {
return performContainerRuntimeUnitOp(stopContainerRuntimeUnitOp)
}
func startContainerRuntime() error {
return performContainerRuntimeUnitOp(startContainerRuntimeUnitOp)
}
// restartKubelet restarts the current kubelet service.
// the "current" kubelet service is the instance managed by the current e2e_node test run.
// If `running` is true, restarts only if the current kubelet is actually running. In some cases,
@@ -465,3 +537,35 @@ func waitForAllContainerRemoval(ctx context.Context, podName, podNS string) {
return nil
}, 2*time.Minute, 1*time.Second).Should(gomega.Succeed())
}
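// getPidsForProcess returns the pids for the named process, preferring the pid file when one is provided and falling back to pidof.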
func getPidsForProcess(name, pidFile string) ([]int, error) {
if len(pidFile) > 0 {
pid, err := getPidFromPidFile(pidFile)
if err == nil {
return []int{pid}, nil
}
// log the error and fall back to pidof
runtime.HandleError(err)
}
return procfs.PidOf(name)
}
func getPidFromPidFile(pidFile string) (int, error) {
file, err := os.Open(pidFile)
if err != nil {
return 0, fmt.Errorf("error opening pid file %s: %v", pidFile, err)
}
defer file.Close()
data, err := io.ReadAll(file)
if err != nil {
return 0, fmt.Errorf("error reading pid file %s: %v", pidFile, err)
}
pid, err := strconv.Atoi(string(data))
if err != nil {
return 0, fmt.Errorf("error parsing %s as a number: %v", string(data), err)
}
return pid, nil
}