kubelet: improve checkpoint errors

Recording the expected and actual checksum in the error makes it possible to
provide that information, for example in a failed test like the ones for DRA.
Otherwise developers have to manually step through the test with a debugger to
figure out what the new checksum is.
This commit is contained in:
Patrick Ohly 2024-07-17 14:08:31 +02:00
parent 157f4b94d8
commit b9d00841a6
6 changed files with 56 additions and 13 deletions

View File

@ -28,8 +28,9 @@ type Checksum uint64
// Verify verifies that passed checksum is same as calculated checksum
func (cs Checksum) Verify(data interface{}) error {
if cs != New(data) {
return errors.ErrCorruptCheckpoint
actualCS := New(data)
if cs != actualCS {
return &errors.CorruptCheckpointError{ActualCS: uint64(actualCS), ExpectedCS: uint64(cs)}
}
return nil
}

View File

@ -18,8 +18,28 @@ package errors
import "fmt"
// ErrCorruptCheckpoint error is reported when checksum does not match
var ErrCorruptCheckpoint = fmt.Errorf("checkpoint is corrupted")
// ErrCorruptCheckpoint error is reported when checksum does not match.
// Check for it with:
//
// var csErr *CorruptCheckpointError
// if errors.As(err, &csErr) { ... }
// if errors.Is(err, CorruptCheckpointError{}) { ... }
type CorruptCheckpointError struct {
ActualCS, ExpectedCS uint64
}
func (err CorruptCheckpointError) Error() string {
return "checkpoint is corrupted"
}
func (err CorruptCheckpointError) Is(target error) bool {
switch target.(type) {
case *CorruptCheckpointError, CorruptCheckpointError:
return true
default:
return false
}
}
// ErrCheckpointNotFound is reported when checkpoint is not found for a given key
var ErrCheckpointNotFound = fmt.Errorf("checkpoint is not found")

View File

@ -110,8 +110,12 @@ func (cp *CPUManagerCheckpointV1) VerifyChecksum() error {
hash := fnv.New32a()
fmt.Fprintf(hash, "%v", object)
if cp.Checksum != checksum.Checksum(hash.Sum32()) {
return errors.ErrCorruptCheckpoint
actualCS := checksum.Checksum(hash.Sum32())
if cp.Checksum != actualCS {
return &errors.CorruptCheckpointError{
ActualCS: uint64(actualCS),
ExpectedCS: uint64(cp.Checksum),
}
}
return nil

View File

@ -18,6 +18,7 @@ package state
import (
"encoding/json"
"errors"
"fmt"
"hash/fnv"
"strings"
@ -25,7 +26,7 @@ import (
"k8s.io/apimachinery/pkg/util/dump"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
cmerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
)
var _ checkpointmanager.Checkpoint = &DRAManagerCheckpoint{}
@ -79,7 +80,7 @@ func (dc *DRAManagerCheckpoint) VerifyChecksum() error {
ck := dc.Checksum
dc.Checksum = 0
err := ck.Verify(dc)
if err == errors.ErrCorruptCheckpoint {
if errors.Is(err, cmerrors.CorruptCheckpointError{}) {
// Verify with old structs without ResourceHandles field
// TODO: remove in Beta
err = verifyChecksumWithoutResourceHandles(dc, ck)
@ -115,8 +116,12 @@ func verifyChecksumWithoutResourceHandles(dc *DRAManagerCheckpoint, checkSum che
object = strings.Replace(object, "ClaimInfoStateListWithoutResourceHandles", "ClaimInfoStateList", 1)
hash := fnv.New32a()
fmt.Fprintf(hash, "%v", object)
if checkSum != checksum.Checksum(hash.Sum32()) {
return errors.ErrCorruptCheckpoint
actualCS := checksum.Checksum(hash.Sum32())
if checkSum != actualCS {
return &cmerrors.CorruptCheckpointError{
ActualCS: uint64(actualCS),
ExpectedCS: uint64(checkSum),
}
}
return nil
}

View File

@ -126,7 +126,7 @@ func (sc *stateCheckpoint) GetOrCreate() (ClaimInfoStateList, error) {
return ClaimInfoStateList{}, nil
}
if err != nil {
return nil, fmt.Errorf("failed to get checkpoint %v: %v", sc.checkpointName, err)
return nil, fmt.Errorf("failed to get checkpoint %v: %w", sc.checkpointName, err)
}
return checkpoint.Entries, nil

View File

@ -17,16 +17,19 @@ limitations under the License.
package state
import (
"errors"
"os"
"path"
"strings"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
cmerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
testutil "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state/testing"
)
@ -198,7 +201,7 @@ func TestCheckpointGetOrCreate(t *testing.T) {
assert.Error(t, err)
assert.Contains(t, err.Error(), tc.expectedError)
} else {
assert.NoError(t, err, "unexpected error while creating checkpointState")
requireNoCheckpointError(t, err)
// compare state after restoration with the one expected
assertStateEqual(t, state, tc.expectedState)
}
@ -270,7 +273,7 @@ func TestOldCheckpointRestore(t *testing.T) {
checkpoint := NewDRAManagerCheckpoint()
err = cpm.GetCheckpoint(testingCheckpoint, checkpoint)
assert.NoError(t, err, "could not restore checkpoint")
requireNoCheckpointError(t, err)
checkpointData, err := checkpoint.MarshalCheckpoint()
assert.NoError(t, err, "could not Marshal Checkpoint")
@ -278,3 +281,13 @@ func TestOldCheckpointRestore(t *testing.T) {
expectedData := `{"version":"v1","entries":[{"DriverName":"test-driver.cdi.k8s.io","ClassName":"class-name","ClaimUID":"067798be-454e-4be4-9047-1aa06aea63f7","ClaimName":"example","Namespace":"default","PodUIDs":{"139cdb46-f989-4f17-9561-ca10cfb509a6":{}},"ResourceHandles":null,"CDIDevices":{"test-driver.cdi.k8s.io":["example.com/example=cdi-example"]}}],"checksum":453625682}`
assert.Equal(t, expectedData, string(checkpointData), "expected ClaimInfoState does not equal to restored one")
}
func requireNoCheckpointError(t *testing.T, err error) {
t.Helper()
var cksumErr *cmerrors.CorruptCheckpointError
if errors.As(err, &cksumErr) {
t.Fatalf("unexpected corrupt checkpoint, expected checksum %d, got %d", cksumErr.ExpectedCS, cksumErr.ActualCS)
} else {
require.NoError(t, err, "could not restore checkpoint")
}
}