From b9d00841a62a2d11a71ba7fefe5a0af41d040034 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Wed, 17 Jul 2024 14:08:31 +0200 Subject: [PATCH] kubelet: improve checkpoint errors Recording the expected and actual checksum in the error makes it possible to provide that information, for example in a failed test like the ones for DRA. Otherwise developers have to manually step through the test with a debugger to figure out what the new checksum is. --- .../checkpointmanager/checksum/checksum.go | 5 ++-- .../checkpointmanager/errors/errors.go | 24 +++++++++++++++++-- pkg/kubelet/cm/cpumanager/state/checkpoint.go | 8 +++++-- pkg/kubelet/cm/dra/state/checkpoint.go | 13 ++++++---- pkg/kubelet/cm/dra/state/state_checkpoint.go | 2 +- .../cm/dra/state/state_checkpoint_test.go | 17 +++++++++++-- 6 files changed, 56 insertions(+), 13 deletions(-) diff --git a/pkg/kubelet/checkpointmanager/checksum/checksum.go b/pkg/kubelet/checkpointmanager/checksum/checksum.go index d5892b7be65..5dcd4f24194 100644 --- a/pkg/kubelet/checkpointmanager/checksum/checksum.go +++ b/pkg/kubelet/checkpointmanager/checksum/checksum.go @@ -28,8 +28,9 @@ type Checksum uint64 // Verify verifies that passed checksum is same as calculated checksum func (cs Checksum) Verify(data interface{}) error { - if cs != New(data) { - return errors.ErrCorruptCheckpoint + actualCS := New(data) + if cs != actualCS { + return &errors.CorruptCheckpointError{ActualCS: uint64(actualCS), ExpectedCS: uint64(cs)} } return nil } diff --git a/pkg/kubelet/checkpointmanager/errors/errors.go b/pkg/kubelet/checkpointmanager/errors/errors.go index bb445f207ec..558f9c7e074 100644 --- a/pkg/kubelet/checkpointmanager/errors/errors.go +++ b/pkg/kubelet/checkpointmanager/errors/errors.go @@ -18,8 +18,28 @@ package errors import "fmt" -// ErrCorruptCheckpoint error is reported when checksum does not match -var ErrCorruptCheckpoint = fmt.Errorf("checkpoint is corrupted") +// ErrCorruptCheckpoint error is reported when checksum does not match. +// Check for it with: +// +// var csErr *CorruptCheckpointError +// if errors.As(err, &csErr) { ... } +// if errors.Is(err, CorruptCheckpointError{}) { ... } +type CorruptCheckpointError struct { + ActualCS, ExpectedCS uint64 +} + +func (err CorruptCheckpointError) Error() string { + return "checkpoint is corrupted" +} + +func (err CorruptCheckpointError) Is(target error) bool { + switch target.(type) { + case *CorruptCheckpointError, CorruptCheckpointError: + return true + default: + return false + } +} // ErrCheckpointNotFound is reported when checkpoint is not found for a given key var ErrCheckpointNotFound = fmt.Errorf("checkpoint is not found") diff --git a/pkg/kubelet/cm/cpumanager/state/checkpoint.go b/pkg/kubelet/cm/cpumanager/state/checkpoint.go index eb2bfa27eaf..564c3482c04 100644 --- a/pkg/kubelet/cm/cpumanager/state/checkpoint.go +++ b/pkg/kubelet/cm/cpumanager/state/checkpoint.go @@ -110,8 +110,12 @@ func (cp *CPUManagerCheckpointV1) VerifyChecksum() error { hash := fnv.New32a() fmt.Fprintf(hash, "%v", object) - if cp.Checksum != checksum.Checksum(hash.Sum32()) { - return errors.ErrCorruptCheckpoint + actualCS := checksum.Checksum(hash.Sum32()) + if cp.Checksum != actualCS { + return &errors.CorruptCheckpointError{ + ActualCS: uint64(actualCS), + ExpectedCS: uint64(cp.Checksum), + } } return nil diff --git a/pkg/kubelet/cm/dra/state/checkpoint.go b/pkg/kubelet/cm/dra/state/checkpoint.go index 7c44f12eea9..6ea49c62e99 100644 --- a/pkg/kubelet/cm/dra/state/checkpoint.go +++ b/pkg/kubelet/cm/dra/state/checkpoint.go @@ -18,6 +18,7 @@ package state import ( "encoding/json" + "errors" "fmt" "hash/fnv" "strings" @@ -25,7 +26,7 @@ import ( "k8s.io/apimachinery/pkg/util/dump" "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum" - "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors" + cmerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors" ) var _ checkpointmanager.Checkpoint = &DRAManagerCheckpoint{} @@ -79,7 +80,7 @@ func (dc *DRAManagerCheckpoint) VerifyChecksum() error { ck := dc.Checksum dc.Checksum = 0 err := ck.Verify(dc) - if err == errors.ErrCorruptCheckpoint { + if errors.Is(err, cmerrors.CorruptCheckpointError{}) { // Verify with old structs without ResourceHandles field // TODO: remove in Beta err = verifyChecksumWithoutResourceHandles(dc, ck) @@ -115,8 +116,12 @@ func verifyChecksumWithoutResourceHandles(dc *DRAManagerCheckpoint, checkSum che object = strings.Replace(object, "ClaimInfoStateListWithoutResourceHandles", "ClaimInfoStateList", 1) hash := fnv.New32a() fmt.Fprintf(hash, "%v", object) - if checkSum != checksum.Checksum(hash.Sum32()) { - return errors.ErrCorruptCheckpoint + actualCS := checksum.Checksum(hash.Sum32()) + if checkSum != actualCS { + return &cmerrors.CorruptCheckpointError{ + ActualCS: uint64(actualCS), + ExpectedCS: uint64(checkSum), + } } return nil } diff --git a/pkg/kubelet/cm/dra/state/state_checkpoint.go b/pkg/kubelet/cm/dra/state/state_checkpoint.go index a82f6b11bb4..46e837b136c 100644 --- a/pkg/kubelet/cm/dra/state/state_checkpoint.go +++ b/pkg/kubelet/cm/dra/state/state_checkpoint.go @@ -126,7 +126,7 @@ func (sc *stateCheckpoint) GetOrCreate() (ClaimInfoStateList, error) { return ClaimInfoStateList{}, nil } if err != nil { - return nil, fmt.Errorf("failed to get checkpoint %v: %v", sc.checkpointName, err) + return nil, fmt.Errorf("failed to get checkpoint %v: %w", sc.checkpointName, err) } return checkpoint.Entries, nil diff --git a/pkg/kubelet/cm/dra/state/state_checkpoint_test.go b/pkg/kubelet/cm/dra/state/state_checkpoint_test.go index a5ff86eed17..cb930bdc8fe 100644 --- a/pkg/kubelet/cm/dra/state/state_checkpoint_test.go +++ b/pkg/kubelet/cm/dra/state/state_checkpoint_test.go @@ -17,16 +17,19 @@ limitations under the License. package state import ( + "errors" "os" "path" "strings" "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" resourcev1alpha2 "k8s.io/api/resource/v1alpha2" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" + cmerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors" testutil "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state/testing" ) @@ -198,7 +201,7 @@ func TestCheckpointGetOrCreate(t *testing.T) { assert.Error(t, err) assert.Contains(t, err.Error(), tc.expectedError) } else { - assert.NoError(t, err, "unexpected error while creating checkpointState") + requireNoCheckpointError(t, err) // compare state after restoration with the one expected assertStateEqual(t, state, tc.expectedState) } @@ -270,7 +273,7 @@ func TestOldCheckpointRestore(t *testing.T) { checkpoint := NewDRAManagerCheckpoint() err = cpm.GetCheckpoint(testingCheckpoint, checkpoint) - assert.NoError(t, err, "could not restore checkpoint") + requireNoCheckpointError(t, err) checkpointData, err := checkpoint.MarshalCheckpoint() assert.NoError(t, err, "could not Marshal Checkpoint") @@ -278,3 +281,13 @@ func TestOldCheckpointRestore(t *testing.T) { expectedData := `{"version":"v1","entries":[{"DriverName":"test-driver.cdi.k8s.io","ClassName":"class-name","ClaimUID":"067798be-454e-4be4-9047-1aa06aea63f7","ClaimName":"example","Namespace":"default","PodUIDs":{"139cdb46-f989-4f17-9561-ca10cfb509a6":{}},"ResourceHandles":null,"CDIDevices":{"test-driver.cdi.k8s.io":["example.com/example=cdi-example"]}}],"checksum":453625682}` assert.Equal(t, expectedData, string(checkpointData), "expected ClaimInfoState does not equal to restored one") } + +func requireNoCheckpointError(t *testing.T, err error) { + t.Helper() + var cksumErr *cmerrors.CorruptCheckpointError + if errors.As(err, &cksumErr) { + t.Fatalf("unexpected corrupt checkpoint, expected checksum %d, got %d", cksumErr.ExpectedCS, cksumErr.ActualCS) + } else { + require.NoError(t, err, "could not restore checkpoint") + } +}