From 181fb0da5157e9aa56fc10f2c882efd354616623 Mon Sep 17 00:00:00 2001
From: Francesco Romani
Date: Tue, 20 Feb 2024 17:40:23 +0100
Subject: [PATCH] node: devicemgr: remove obsolete pre-1.20 checkpoint file
 support

In commit 2f426fdba6fa0458c4a47dd22b37cc920d9f2a98 we added compatibility
(and tests) to deal with pre-1.20 checkpoint files. We are now well past
the end of support for pre-1.20 kubelets, so we can get rid of this code.

Signed-off-by: Francesco Romani
---
 .../cm/devicemanager/checkpoint/checkpoint.go |   6 +-
 .../devicemanager/checkpoint/checkpointv1.go  | 117 -------
 pkg/kubelet/cm/devicemanager/manager.go       |  30 +-
 pkg/kubelet/cm/devicemanager/manager_test.go  |  22 --
 test/e2e_node/device_manager_test.go          | 306 ------------------
 test/e2e_node/util.go                         |  12 -
 6 files changed, 7 insertions(+), 486 deletions(-)
 delete mode 100644 pkg/kubelet/cm/devicemanager/checkpoint/checkpointv1.go

diff --git a/pkg/kubelet/cm/devicemanager/checkpoint/checkpoint.go b/pkg/kubelet/cm/devicemanager/checkpoint/checkpoint.go
index 46d0aaa2a25..b9dcdaf829d 100644
--- a/pkg/kubelet/cm/devicemanager/checkpoint/checkpoint.go
+++ b/pkg/kubelet/cm/devicemanager/checkpoint/checkpoint.go
@@ -27,7 +27,7 @@ import (
 // DeviceManagerCheckpoint defines the operations to retrieve pod devices
 type DeviceManagerCheckpoint interface {
 	checkpointmanager.Checkpoint
-	GetDataInLatestFormat() ([]PodDevicesEntry, map[string][]string)
+	GetData() ([]PodDevicesEntry, map[string][]string)
 }
 
 // DevicesPerNUMA represents device ids obtained from device plugin per NUMA node id
@@ -102,8 +102,8 @@ func (cp *Data) VerifyChecksum() error {
 	return cp.Checksum.Verify(cp.Data)
 }
 
-// GetDataInLatestFormat returns device entries and registered devices in the *most recent*
+// GetData returns device entries and registered devices in the *most recent*
 // checkpoint format, *not* in the original format stored on disk.
-func (cp *Data) GetDataInLatestFormat() ([]PodDevicesEntry, map[string][]string) {
+func (cp *Data) GetData() ([]PodDevicesEntry, map[string][]string) {
 	return cp.Data.PodDeviceEntries, cp.Data.RegisteredDevices
 }
diff --git a/pkg/kubelet/cm/devicemanager/checkpoint/checkpointv1.go b/pkg/kubelet/cm/devicemanager/checkpoint/checkpointv1.go
deleted file mode 100644
index 17c9650e57f..00000000000
--- a/pkg/kubelet/cm/devicemanager/checkpoint/checkpointv1.go
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package checkpoint
-
-import (
-	"encoding/json"
-	"fmt"
-	"hash/fnv"
-	"strings"
-
-	"k8s.io/apimachinery/pkg/util/dump"
-	"k8s.io/klog/v2"
-	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
-	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
-)
-
-// PodDevicesEntryV1 connects pod information to devices, without topology information (k8s <= 1.19)
-type PodDevicesEntryV1 struct {
-	PodUID        string
-	ContainerName string
-	ResourceName  string
-	DeviceIDs     []string
-	AllocResp     []byte
-}
-
-// checkpointDataV1 struct is used to store pod to device allocation information
-// in a checkpoint file, without topology information (k8s <= 1.19)
-type checkpointDataV1 struct {
-	PodDeviceEntries  []PodDevicesEntryV1
-	RegisteredDevices map[string][]string
-}
-
-// checksum compute the checksum using the same algorithms (and data type names) k8s 1.19 used.
-// We need this special code path to be able to correctly validate the checksum k8s 1.19 wrote.
-// credits to https://github.com/kubernetes/kubernetes/pull/102717/commits/353f93895118d2ffa2d59a29a1fbc225160ea1d6
-func (cp checkpointDataV1) checksum() checksum.Checksum {
-	object := dump.ForHash(cp)
-	object = strings.Replace(object, "checkpointDataV1", "checkpointData", 1)
-	object = strings.Replace(object, "PodDevicesEntryV1", "PodDevicesEntry", -1)
-	hash := fnv.New32a()
-	fmt.Fprintf(hash, "%v", object)
-	return checksum.Checksum(hash.Sum32())
-}
-
-// DataV1 holds checkpoint data and its checksum, in V1 (k8s <= 1.19) format
-type DataV1 struct {
-	Data     checkpointDataV1
-	Checksum checksum.Checksum
-}
-
-// NewV1 returns an instance of Checkpoint, in V1 (k8s <= 1.19) format.
-// Users should avoid creating checkpoints in formats different from the most recent one,
-// use the old formats only to validate existing checkpoint and convert them to most recent
-// format. The only exception should be test code.
-func NewV1(devEntries []PodDevicesEntryV1,
-	devices map[string][]string) DeviceManagerCheckpoint {
-	return &DataV1{
-		Data: checkpointDataV1{
-			PodDeviceEntries:  devEntries,
-			RegisteredDevices: devices,
-		},
-	}
-}
-
-// MarshalCheckpoint is needed to implement the Checkpoint interface, but should not be called anymore
-func (cp *DataV1) MarshalCheckpoint() ([]byte, error) {
-	klog.InfoS("Marshalling a device manager V1 checkpoint")
-	cp.Checksum = cp.Data.checksum()
-	return json.Marshal(*cp)
-}
-
-// UnmarshalCheckpoint returns unmarshalled data
-func (cp *DataV1) UnmarshalCheckpoint(blob []byte) error {
-	return json.Unmarshal(blob, cp)
-}
-
-// VerifyChecksum verifies that passed checksum is same as calculated checksum
-func (cp *DataV1) VerifyChecksum() error {
-	if cp.Checksum != cp.Data.checksum() {
-		return errors.ErrCorruptCheckpoint
-	}
-	return nil
-}
-
-// GetDataInLatestFormat returns device entries and registered devices in the *most recent*
-// checkpoint format, *not* in the original format stored on disk.
-func (cp *DataV1) GetDataInLatestFormat() ([]PodDevicesEntry, map[string][]string) {
-	var podDevs []PodDevicesEntry
-	for _, entryV1 := range cp.Data.PodDeviceEntries {
-		devsPerNuma := NewDevicesPerNUMA()
-		// no NUMA cell affinity was recorded. The only possible choice
-		// is to set all the devices affine to node 0.
-		devsPerNuma[0] = entryV1.DeviceIDs
-		podDevs = append(podDevs, PodDevicesEntry{
-			PodUID:        entryV1.PodUID,
-			ContainerName: entryV1.ContainerName,
-			ResourceName:  entryV1.ResourceName,
-			DeviceIDs:     devsPerNuma,
-			AllocResp:     entryV1.AllocResp,
-		})
-	}
-	return podDevs, cp.Data.RegisteredDevices
-}
diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
index f1d04e97179..ae0219c66d1 100644
--- a/pkg/kubelet/cm/devicemanager/manager.go
+++ b/pkg/kubelet/cm/devicemanager/manager.go
@@ -468,33 +468,19 @@ func (m *ManagerImpl) writeCheckpoint() error {
 // Reads device to container allocation information from disk, and populates
 // m.allocatedDevices accordingly.
 func (m *ManagerImpl) readCheckpoint() error {
-	// the vast majority of time we restore a compatible checkpoint, so we try
-	// the current version first. Trying to restore older format checkpoints is
-	// relevant only in the kubelet upgrade flow, which happens once in a
-	// (long) while.
-	cp, err := m.getCheckpointV2()
+	cp, err := m.getCheckpoint()
 	if err != nil {
 		if err == errors.ErrCheckpointNotFound {
 			// no point in trying anything else
 			klog.InfoS("Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint, "err", err)
 			return nil
 		}
-
-		var errv1 error
-		// one last try: maybe it's a old format checkpoint?
-		cp, errv1 = m.getCheckpointV1()
-		if errv1 != nil {
-			klog.InfoS("Failed to read checkpoint V1 file", "err", errv1)
-			// intentionally return the parent error. We expect to restore V1 checkpoints
-			// a tiny fraction of time, so what matters most is the current checkpoint read error.
-			return err
-		}
-		klog.InfoS("Read data from a V1 checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint)
+		return err
 	}
 
 	m.mutex.Lock()
 	defer m.mutex.Unlock()
-	podDevices, registeredDevs := cp.GetDataInLatestFormat()
+	podDevices, registeredDevs := cp.GetData()
 	m.podDevices.fromCheckpointData(podDevices)
 	m.allocatedDevices = m.podDevices.devices()
 	for resource := range registeredDevs {
@@ -507,7 +493,7 @@ func (m *ManagerImpl) readCheckpoint() error {
 	return nil
 }
 
-func (m *ManagerImpl) getCheckpointV2() (checkpoint.DeviceManagerCheckpoint, error) {
+func (m *ManagerImpl) getCheckpoint() (checkpoint.DeviceManagerCheckpoint, error) {
 	registeredDevs := make(map[string][]string)
 	devEntries := make([]checkpoint.PodDevicesEntry, 0)
 	cp := checkpoint.New(devEntries, registeredDevs)
@@ -515,14 +501,6 @@ func (m *ManagerImpl) getCheckpointV2() (checkpoint.DeviceManagerCheckpoint, err
 	return cp, err
 }
 
-func (m *ManagerImpl) getCheckpointV1() (checkpoint.DeviceManagerCheckpoint, error) {
-	registeredDevs := make(map[string][]string)
-	devEntries := make([]checkpoint.PodDevicesEntryV1, 0)
-	cp := checkpoint.NewV1(devEntries, registeredDevs)
-	err := m.checkpointManager.GetCheckpoint(kubeletDeviceManagerCheckpoint, cp)
-	return cp, err
-}
-
 // UpdateAllocatedDevices frees any Devices that are bound to terminated pods.
 func (m *ManagerImpl) UpdateAllocatedDevices() {
 	if !m.sourcesReady.AllReady() {
diff --git a/pkg/kubelet/cm/devicemanager/manager_test.go b/pkg/kubelet/cm/devicemanager/manager_test.go
index 14d2f0d3e8c..6b5ec623c28 100644
--- a/pkg/kubelet/cm/devicemanager/manager_test.go
+++ b/pkg/kubelet/cm/devicemanager/manager_test.go
@@ -1771,28 +1771,6 @@ func makeDevice(devOnNUMA checkpoint.DevicesPerNUMA, topology bool) map[string]p
 	return res
 }
 
-const deviceManagerCheckpointFilename = "kubelet_internal_checkpoint"
-
-var oldCheckpoint string = `{"Data":{"PodDeviceEntries":[{"PodUID":"13ac2284-0d19-44b7-b94f-055b032dba9b","ContainerName":"centos","ResourceName":"example.com/deviceA","DeviceIDs":["DevA3"],"AllocResp":"CiIKHUVYQU1QTEVDT01ERVZJQ0VBX0RFVkEzX1RUWTEwEgEwGhwKCi9kZXYvdHR5MTASCi9kZXYvdHR5MTAaAnJ3"},{"PodUID":"86b9a017-c9ca-4069-815f-46ca3e53c1e4","ContainerName":"centos","ResourceName":"example.com/deviceA","DeviceIDs":["DevA4"],"AllocResp":"CiIKHUVYQU1QTEVDT01ERVZJQ0VBX0RFVkE0X1RUWTExEgEwGhwKCi9kZXYvdHR5MTESCi9kZXYvdHR5MTEaAnJ3"}],"RegisteredDevices":{"example.com/deviceA":["DevA1","DevA2","DevA3","DevA4"]}},"Checksum":405612085}`
-
-func TestReadPreNUMACheckpoint(t *testing.T) {
-	socketDir, socketName, _, err := tmpSocketDir()
-	require.NoError(t, err)
-	defer os.RemoveAll(socketDir)
-
-	err = os.WriteFile(filepath.Join(socketDir, deviceManagerCheckpointFilename), []byte(oldCheckpoint), 0644)
-	require.NoError(t, err)
-
-	topologyStore := topologymanager.NewFakeManager()
-	nodes := []cadvisorapi.Node{{Id: 0}}
-	m, err := newManagerImpl(socketName, nodes, topologyStore)
-	require.NoError(t, err)
-
-	// TODO: we should not calling private methods, but among the existing tests we do anyway
-	err = m.readCheckpoint()
-	require.NoError(t, err)
-}
-
 func TestGetTopologyHintsWithUpdates(t *testing.T) {
 	socketDir, socketName, _, err := tmpSocketDir()
 	defer os.RemoveAll(socketDir)
diff --git a/test/e2e_node/device_manager_test.go b/test/e2e_node/device_manager_test.go
index c290a02d7bd..df1715b55c0 100644
--- a/test/e2e_node/device_manager_test.go
+++ b/test/e2e_node/device_manager_test.go
@@ -22,8 +22,6 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
-	"regexp"
-	"sort"
 	"strings"
 	"time"
 
@@ -33,18 +31,12 @@ import (
 	"k8s.io/apimachinery/pkg/util/uuid"
 	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
 	"k8s.io/klog/v2"
-	kubeletpodresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
-	"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
-	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
-	"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
-	"k8s.io/kubernetes/pkg/kubelet/util"
 	admissionapi "k8s.io/pod-security-admission/api"
 
 	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
-	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
 	e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
 	"k8s.io/kubernetes/test/e2e/nodefeature"
 	testutils "k8s.io/kubernetes/test/utils"
@@ -57,227 +49,13 @@ import (
 
 const (
 	devicePluginDir = "/var/lib/kubelet/device-plugins"
-	checkpointName  = "kubelet_internal_checkpoint"
 )
 
 // Serial because the test updates kubelet configuration.
 var _ = SIGDescribe("Device Manager", framework.WithSerial(), feature.DeviceManager, nodefeature.DeviceManager, func() {
-	checkpointFullPath := filepath.Join(devicePluginDir, checkpointName)
 	f := framework.NewDefaultFramework("devicemanager-test")
 	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
 
-	ginkgo.Context("With SRIOV devices in the system", func() {
-		// this test wants to reproduce what happened in https://github.com/kubernetes/kubernetes/issues/102880
-		ginkgo.It("should be able to recover V1 (aka pre-1.20) checkpoint data and reject pods before device re-registration", func(ctx context.Context) {
-			if sriovdevCount, err := countSRIOVDevices(); err != nil || sriovdevCount == 0 {
-				e2eskipper.Skipf("this test is meant to run on a system with at least one configured VF from SRIOV device")
-			}
-
-			configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)
-			sd := setupSRIOVConfigOrFail(ctx, f, configMap)
-
-			waitForSRIOVResources(ctx, f, sd)
-
-			cntName := "gu-container"
-			// we create and delete a pod to make sure the internal device manager state contains a pod allocation
-			ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 1 core, 1 %s device", sd.resourceName))
-			var initCtnAttrs []tmCtnAttribute
-			ctnAttrs := []tmCtnAttribute{
-				{
-					ctnName:       cntName,
-					cpuRequest:    "1000m",
-					cpuLimit:      "1000m",
-					deviceName:    sd.resourceName,
-					deviceRequest: "1",
-					deviceLimit:   "1",
-				},
-			}
-
-			podName := "gu-pod-rec-pre-1"
-			framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
-			pod := makeTopologyManagerTestPod(podName, ctnAttrs, initCtnAttrs)
-			pod = e2epod.NewPodClient(f).CreateSync(ctx, pod)
-
-			// now we need to simulate a node drain, so we remove all the pods, including the sriov device plugin.
-
-			ginkgo.By("deleting the pod")
-			// note we delete right now because we know the current implementation of devicemanager will NOT
-			// clean up on pod deletion. When this changes, the deletion needs to be done after the test is done.
-			deletePodSyncByName(ctx, f, pod.Name)
-			waitForAllContainerRemoval(ctx, pod.Name, pod.Namespace)
-
-			ginkgo.By("teardown the sriov device plugin")
-			// since we will NOT be recreating the plugin, we clean up everything now
-			teardownSRIOVConfigOrFail(ctx, f, sd)
-
-			ginkgo.By("stopping the kubelet")
-			killKubelet("SIGSTOP")
-
-			ginkgo.By("rewriting the kubelet checkpoint file as v1")
-			err := rewriteCheckpointAsV1(devicePluginDir, checkpointName)
-			// make sure we remove any leftovers
-			defer os.Remove(checkpointFullPath)
-			framework.ExpectNoError(err)
-
-			// this mimics a kubelet restart after the upgrade
-			// TODO: is SIGTERM (less brutal) good enough?
-			ginkgo.By("killing the kubelet")
-			killKubelet("SIGKILL")
-
-			ginkgo.By("waiting for the kubelet to be ready again")
-			// Wait for the Kubelet to be ready.
-			gomega.Eventually(ctx, func(ctx context.Context) bool {
-				nodes, err := e2enode.TotalReady(ctx, f.ClientSet)
-				framework.ExpectNoError(err)
-				return nodes == 1
-			}, time.Minute, time.Second).Should(gomega.BeTrue())
-
-			// note we DO NOT start the sriov device plugin. This is intentional.
-			// issue#102880 reproduces because of a race on startup caused by corrupted device manager
-			// state which leads to v1.Node object not updated on apiserver.
-			// So to hit the issue we need to receive the pod *before* the device plugin registers itself.
-			// The simplest and safest way to reproduce is just avoid to run the device plugin again
-
-			podName = "gu-pod-rec-post-2"
-			framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
-			pod = makeTopologyManagerTestPod(podName, ctnAttrs, initCtnAttrs)
-
-			pod = e2epod.NewPodClient(f).Create(ctx, pod)
-			err = e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "Failed", 30*time.Second, func(pod *v1.Pod) (bool, error) {
-				if pod.Status.Phase != v1.PodPending {
-					return true, nil
-				}
-				return false, nil
-			})
-			framework.ExpectNoError(err)
-			pod, err = e2epod.NewPodClient(f).Get(ctx, pod.Name, metav1.GetOptions{})
-			framework.ExpectNoError(err)
-
-			if pod.Status.Phase != v1.PodFailed {
-				framework.Failf("pod %s not failed: %v", pod.Name, pod.Status)
-			}
-
-			framework.Logf("checking pod %s status reason (%s)", pod.Name, pod.Status.Reason)
-			if !isUnexpectedAdmissionError(pod) {
-				framework.Failf("pod %s failed for wrong reason: %q", pod.Name, pod.Status.Reason)
-			}
-
-			deletePodSyncByName(ctx, f, pod.Name)
-		})
-
-		ginkgo.It("should be able to recover V1 (aka pre-1.20) checkpoint data and update topology info on device re-registration", func(ctx context.Context) {
-			if sriovdevCount, err := countSRIOVDevices(); err != nil || sriovdevCount == 0 {
-				e2eskipper.Skipf("this test is meant to run on a system with at least one configured VF from SRIOV device")
-			}
-
-			endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
-			framework.ExpectNoError(err)
-
-			configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)
-
-			sd := setupSRIOVConfigOrFail(ctx, f, configMap)
-			waitForSRIOVResources(ctx, f, sd)
-
-			cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
-			framework.ExpectNoError(err)
-
-			resp, err := cli.GetAllocatableResources(ctx, &kubeletpodresourcesv1.AllocatableResourcesRequest{})
-			conn.Close()
-			framework.ExpectNoError(err)
-
-			suitableDevs := 0
-			for _, dev := range resp.GetDevices() {
-				for _, node := range dev.GetTopology().GetNodes() {
-					if node.GetID() != 0 {
-						suitableDevs++
-					}
-				}
-			}
-			if suitableDevs == 0 {
-				teardownSRIOVConfigOrFail(ctx, f, sd)
-				e2eskipper.Skipf("no devices found on NUMA Cell other than 0")
-			}
-
-			cntName := "gu-container"
-			// we create and delete a pod to make sure the internal device manager state contains a pod allocation
-			ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 1 core, 1 %s device", sd.resourceName))
-			var initCtnAttrs []tmCtnAttribute
-			ctnAttrs := []tmCtnAttribute{
-				{
-					ctnName:       cntName,
-					cpuRequest:    "1000m",
-					cpuLimit:      "1000m",
-					deviceName:    sd.resourceName,
-					deviceRequest: "1",
-					deviceLimit:   "1",
-				},
-			}
-
-			podName := "gu-pod-rec-pre-1"
-			framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
-			pod := makeTopologyManagerTestPod(podName, ctnAttrs, initCtnAttrs)
-			pod = e2epod.NewPodClient(f).CreateSync(ctx, pod)
-
-			// now we need to simulate a node drain, so we remove all the pods, including the sriov device plugin.
-
-			ginkgo.By("deleting the pod")
-			// note we delete right now because we know the current implementation of devicemanager will NOT
-			// clean up on pod deletion. When this changes, the deletion needs to be done after the test is done.
-			deletePodSyncByName(ctx, f, pod.Name)
-			waitForAllContainerRemoval(ctx, pod.Name, pod.Namespace)
-
-			ginkgo.By("teardown the sriov device plugin")
-			// no need to delete the config now (speed up later)
-			deleteSRIOVPodOrFail(ctx, f, sd)
-
-			ginkgo.By("stopping the kubelet")
-			killKubelet("SIGSTOP")
-
-			ginkgo.By("rewriting the kubelet checkpoint file as v1")
-			err = rewriteCheckpointAsV1(devicePluginDir, checkpointName)
-			// make sure we remove any leftovers
-			defer os.Remove(checkpointFullPath)
-			framework.ExpectNoError(err)
-
-			// this mimics a kubelet restart after the upgrade
-			// TODO: is SIGTERM (less brutal) good enough?
-			ginkgo.By("killing the kubelet")
-			killKubelet("SIGKILL")
-
-			ginkgo.By("waiting for the kubelet to be ready again")
-			// Wait for the Kubelet to be ready.
-			gomega.Eventually(ctx, func(ctx context.Context) bool {
-				nodes, err := e2enode.TotalReady(ctx, f.ClientSet)
-				framework.ExpectNoError(err)
-				return nodes == 1
-			}, time.Minute, time.Second).Should(gomega.BeTrue())
-
-			sd2 := &sriovData{
-				configMap:      sd.configMap,
-				serviceAccount: sd.serviceAccount,
-			}
-			sd2.pod = createSRIOVPodOrFail(ctx, f)
-			ginkgo.DeferCleanup(teardownSRIOVConfigOrFail, f, sd2)
-			waitForSRIOVResources(ctx, f, sd2)
-
-			compareSRIOVResources(sd, sd2)
-
-			cli, conn, err = podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
-			framework.ExpectNoError(err)
-			defer conn.Close()
-
-			resp2, err := cli.GetAllocatableResources(ctx, &kubeletpodresourcesv1.AllocatableResourcesRequest{})
-			framework.ExpectNoError(err)
-
-			cntDevs := stringifyContainerDevices(resp.GetDevices())
-			cntDevs2 := stringifyContainerDevices(resp2.GetDevices())
-			if cntDevs != cntDevs2 {
-				framework.Failf("different allocatable resources expected %v got %v", cntDevs, cntDevs2)
-			}
-		})
-
-	})
-
 	/*
 	   This end to end test is to simulate a scenario where after kubelet restart/node reboot
 	   application pods requesting devices appear before the device plugin
@@ -512,90 +290,6 @@ var _ = SIGDescribe("Device Manager", framework.WithSerial(), feature.DeviceMana
 })
 
-func compareSRIOVResources(expected, got *sriovData) {
-	if expected.resourceName != got.resourceName {
-		framework.Failf("different SRIOV resource name: expected %q got %q", expected.resourceName, got.resourceName)
-	}
-	if expected.resourceAmount != got.resourceAmount {
-		framework.Failf("different SRIOV resource amount: expected %d got %d", expected.resourceAmount, got.resourceAmount)
-	}
-}
-
-func isUnexpectedAdmissionError(pod *v1.Pod) bool {
-	re := regexp.MustCompile(`Unexpected.*Admission.*Error`)
-	return re.MatchString(pod.Status.Reason)
-}
-
-func rewriteCheckpointAsV1(dir, name string) error {
-	ginkgo.By(fmt.Sprintf("Creating temporary checkpoint manager (dir=%q)", dir))
-	checkpointManager, err := checkpointmanager.NewCheckpointManager(dir)
-	if err != nil {
-		return err
-	}
-	cp := checkpoint.New(make([]checkpoint.PodDevicesEntry, 0), make(map[string][]string))
-	err = checkpointManager.GetCheckpoint(name, cp)
-	if err != nil {
-		return err
-	}
-
-	ginkgo.By(fmt.Sprintf("Read checkpoint %q %#v", name, cp))
-
-	podDevices, registeredDevs := cp.GetDataInLatestFormat()
-	podDevicesV1 := convertPodDeviceEntriesToV1(podDevices)
-	cpV1 := checkpoint.NewV1(podDevicesV1, registeredDevs)
-
-	blob, err := cpV1.MarshalCheckpoint()
-	if err != nil {
-		return err
-	}
-
-	// TODO: why `checkpointManager.CreateCheckpoint(name, cpV1)` doesn't seem to work?
-	ckPath := filepath.Join(dir, name)
-	os.WriteFile(filepath.Join("/tmp", name), blob, 0600)
-	return os.WriteFile(ckPath, blob, 0600)
-}
-
-func convertPodDeviceEntriesToV1(entries []checkpoint.PodDevicesEntry) []checkpoint.PodDevicesEntryV1 {
-	entriesv1 := []checkpoint.PodDevicesEntryV1{}
-	for _, entry := range entries {
-		deviceIDs := []string{}
-		for _, perNUMANodeDevIDs := range entry.DeviceIDs {
-			deviceIDs = append(deviceIDs, perNUMANodeDevIDs...)
-		}
-		entriesv1 = append(entriesv1, checkpoint.PodDevicesEntryV1{
-			PodUID:        entry.PodUID,
-			ContainerName: entry.ContainerName,
-			ResourceName:  entry.ResourceName,
-			DeviceIDs:     deviceIDs,
-			AllocResp:     entry.AllocResp,
-		})
-	}
-	return entriesv1
-}
-
-func stringifyContainerDevices(devs []*kubeletpodresourcesv1.ContainerDevices) string {
-	entries := []string{}
-	for _, dev := range devs {
-		devIDs := dev.GetDeviceIds()
-		if devIDs != nil {
-			for _, devID := range dev.DeviceIds {
-				nodes := dev.GetTopology().GetNodes()
-				if nodes != nil {
-					for _, node := range nodes {
-						entries = append(entries, fmt.Sprintf("%s[%s]@NUMA=%d", dev.ResourceName, devID, node.GetID()))
-					}
-				} else {
-					entries = append(entries, fmt.Sprintf("%s[%s]@NUMA=none", dev.ResourceName, devID))
-				}
-			}
-		} else {
-			entries = append(entries, dev.ResourceName)
-		}
-	}
-	sort.Strings(entries)
-	return strings.Join(entries, ", ")
-}
-
 func makeBusyboxDeviceRequiringPod(resourceName, cmd string) *v1.Pod {
 	podName := "device-manager-test-" + string(uuid.NewUUID())
 	rl := v1.ResourceList{
diff --git a/test/e2e_node/util.go b/test/e2e_node/util.go
index b54e2b364aa..f4ffe7c69b4 100644
--- a/test/e2e_node/util.go
+++ b/test/e2e_node/util.go
@@ -461,18 +461,6 @@ func stopKubelet() func() {
 	}
 }
 
-// killKubelet sends a signal (SIGINT, SIGSTOP, SIGTERM...) to the running kubelet
-func killKubelet(sig string) {
-	kubeletServiceName := findKubeletServiceName(true)
-
-	// reset the kubelet service start-limit-hit
-	stdout, err := exec.Command("sudo", "systemctl", "reset-failed", kubeletServiceName).CombinedOutput()
-	framework.ExpectNoError(err, "Failed to reset kubelet start-limit-hit with systemctl: %v, %v", err, stdout)
-
-	stdout, err = exec.Command("sudo", "systemctl", "kill", "-s", sig, kubeletServiceName).CombinedOutput()
-	framework.ExpectNoError(err, "Failed to stop kubelet with systemctl: %v, %v", err, stdout)
-}
-
 func kubeletHealthCheck(url string) bool {
 	insecureTransport := http.DefaultTransport.(*http.Transport).Clone()
 	insecureTransport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true}