From 5faf8f4c528b977d294e09cd589fecda47b8631b Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Sat, 28 Dec 2019 13:44:24 +0100 Subject: [PATCH 1/2] Lock checksum calculation for v1 CPUManager state to pre 1.18 logic The updated CPUManager from PR #84462 implements logic to migrate the CPUManager checkpoint file from an old format to a new one. To do so, it defines the following types: ``` type CPUManagerCheckpoint = CPUManagerCheckpointV2 type CPUManagerCheckpointV1 struct { ... } type CPUManagerCheckpointV2 struct { ... } ``` This replaces the old definition of just: ``` type CPUManagerCheckpoint struct { ... } ``` Code was put in place to ensure proper migration from checkpoints in V1 format to checkpoints in V2 format. However (and this is a big however), all of the unit tests were performed on V1 checkpoints that were generated using the type name `CPUManagerCheckpointV1` and not the original type name of `CPUManagerCheckpoint`. As such, the checksum in the checkpoint file uses the `CPUManagerCheckpointV1` type to calculate its checksum and not the original type name of `CPUManagerCheckpoint`. This causes problems in the real world since all pre-1.18 checkpoint files will have been generated with the original type name of `CPUManagerCheckpoint`. When verifying the checksum of the checkpoint file across an upgrade to 1.18, the checksum is calculated assuming a type name of `CPUManagerCheckpointV1` (which is incorrect) and the file is seen to be corrupt. This patch ensures that all V1 checksums are verified against a type name of `CPUManagerCheckpoint` instead of ``CPUManagerCheckpointV1`. It also locks the algorithm used to calculate the checksum in place, since it wil never change in the future (for pre-1.18 checkpoint files at least). --- pkg/kubelet/cm/cpumanager/state/BUILD | 1 + pkg/kubelet/cm/cpumanager/state/checkpoint.go | 25 +++++++++++++++++-- .../cpumanager/state/state_checkpoint_test.go | 16 +++++++++++- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/pkg/kubelet/cm/cpumanager/state/BUILD b/pkg/kubelet/cm/cpumanager/state/BUILD index 37850de90c3..438737c9d18 100644 --- a/pkg/kubelet/cm/cpumanager/state/BUILD +++ b/pkg/kubelet/cm/cpumanager/state/BUILD @@ -17,6 +17,7 @@ go_library( "//pkg/kubelet/checkpointmanager/errors:go_default_library", "//pkg/kubelet/cm/cpumanager/containermap:go_default_library", "//pkg/kubelet/cm/cpuset:go_default_library", + "//vendor/github.com/davecgh/go-spew/spew:go_default_library", "//vendor/k8s.io/klog:go_default_library", ], ) diff --git a/pkg/kubelet/cm/cpumanager/state/checkpoint.go b/pkg/kubelet/cm/cpumanager/state/checkpoint.go index 3683e66aedc..f817de24a7e 100644 --- a/pkg/kubelet/cm/cpumanager/state/checkpoint.go +++ b/pkg/kubelet/cm/cpumanager/state/checkpoint.go @@ -18,9 +18,14 @@ package state import ( "encoding/json" + "hash/fnv" + "strings" + + "github.com/davecgh/go-spew/spew" "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum" + "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors" ) // CPUManagerCheckpoint struct is used to store cpu/pod assignments in a checkpoint @@ -95,11 +100,27 @@ func (cp *CPUManagerCheckpointV1) VerifyChecksum() error { // accept empty checksum for compatibility with old file backend return nil } + + printer := spew.ConfigState{ + Indent: " ", + SortKeys: true, + DisableMethods: true, + SpewKeys: true, + } + ck := cp.Checksum cp.Checksum = 0 - err := ck.Verify(cp) + object := printer.Sprintf("%#v", cp) + object = strings.Replace(object, "CPUManagerCheckpointV1", "CPUManagerCheckpoint", 1) cp.Checksum = ck - return err + + hash := fnv.New32a() + printer.Fprintf(hash, "%v", object) + if cp.Checksum != checksum.Checksum(hash.Sum32()) { + return errors.ErrCorruptCheckpoint + } + + return nil } // VerifyChecksum verifies that current checksum of checkpoint is valid in v2 format diff --git a/pkg/kubelet/cm/cpumanager/state/state_checkpoint_test.go b/pkg/kubelet/cm/cpumanager/state/state_checkpoint_test.go index 34d40c43174..e81eaddd75a 100644 --- a/pkg/kubelet/cm/cpumanager/state/state_checkpoint_test.go +++ b/pkg/kubelet/cm/cpumanager/state/state_checkpoint_test.go @@ -154,6 +154,20 @@ func TestCheckpointStateRestore(t *testing.T) { `could not parse cpuset "asd" for container "container2" in pod "pod": strconv.Atoi: parsing "asd": invalid syntax`, &stateMemory{}, }, + { + "Restore checkpoint from checkpoint with v1 checksum", + `{ + "policyName": "none", + "defaultCPUSet": "1-3", + "checksum": 1694838852 + }`, + "none", + containermap.ContainerMap{}, + "", + &stateMemory{ + defaultCPUSet: cpuset.NewCPUSet(1, 2, 3), + }, + }, { "Restore checkpoint with migration", `{ @@ -163,7 +177,7 @@ func TestCheckpointStateRestore(t *testing.T) { "containerID1": "4-6", "containerID2": "1-3" }, - "checksum": 2832947348 + "checksum": 3680390589 }`, "none", func() containermap.ContainerMap { From b373121a146528913d90137184ebec8f608e65e3 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Sat, 28 Dec 2019 19:19:59 +0100 Subject: [PATCH 2/2] Make CPUManagerCheckpointV2 type an alias of CPUManagerCheckpoint This change is to prevent problems when we remove the V1->V2 migration code in the future. Without this, the checksums of all checkpoints would be hashed with the name CPUManagerCheckpointV2 embedded inside of them, which is undesirable. We want the checkpoints to be hashed with the name CPUManagerCheckpoint instead. --- pkg/kubelet/cm/cpumanager/state/checkpoint.go | 19 ++++++++++--------- .../cpumanager/state/state_checkpoint_test.go | 10 +++++----- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/pkg/kubelet/cm/cpumanager/state/checkpoint.go b/pkg/kubelet/cm/cpumanager/state/checkpoint.go index f817de24a7e..e7df594efc5 100644 --- a/pkg/kubelet/cm/cpumanager/state/checkpoint.go +++ b/pkg/kubelet/cm/cpumanager/state/checkpoint.go @@ -28,11 +28,17 @@ import ( "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors" ) -// CPUManagerCheckpoint struct is used to store cpu/pod assignments in a checkpoint -type CPUManagerCheckpoint = CPUManagerCheckpointV2 - var _ checkpointmanager.Checkpoint = &CPUManagerCheckpointV1{} var _ checkpointmanager.Checkpoint = &CPUManagerCheckpointV2{} +var _ checkpointmanager.Checkpoint = &CPUManagerCheckpoint{} + +// CPUManagerCheckpoint struct is used to store cpu/pod assignments in a checkpoint in v2 format +type CPUManagerCheckpoint struct { + PolicyName string `json:"policyName"` + DefaultCPUSet string `json:"defaultCpuSet"` + Entries map[string]map[string]string `json:"entries,omitempty"` + Checksum checksum.Checksum `json:"checksum"` +} // CPUManagerCheckpointV1 struct is used to store cpu/pod assignments in a checkpoint in v1 format type CPUManagerCheckpointV1 struct { @@ -43,12 +49,7 @@ type CPUManagerCheckpointV1 struct { } // CPUManagerCheckpointV2 struct is used to store cpu/pod assignments in a checkpoint in v2 format -type CPUManagerCheckpointV2 struct { - PolicyName string `json:"policyName"` - DefaultCPUSet string `json:"defaultCpuSet"` - Entries map[string]map[string]string `json:"entries,omitempty"` - Checksum checksum.Checksum `json:"checksum"` -} +type CPUManagerCheckpointV2 = CPUManagerCheckpoint // NewCPUManagerCheckpoint returns an instance of Checkpoint func NewCPUManagerCheckpoint() *CPUManagerCheckpoint { diff --git a/pkg/kubelet/cm/cpumanager/state/state_checkpoint_test.go b/pkg/kubelet/cm/cpumanager/state/state_checkpoint_test.go index e81eaddd75a..fc96da2931e 100644 --- a/pkg/kubelet/cm/cpumanager/state/state_checkpoint_test.go +++ b/pkg/kubelet/cm/cpumanager/state/state_checkpoint_test.go @@ -54,7 +54,7 @@ func TestCheckpointStateRestore(t *testing.T) { "policyName": "none", "defaultCPUSet": "4-6", "entries": {}, - "checksum": 2655485041 + "checksum": 354655845 }`, "none", containermap.ContainerMap{}, @@ -74,7 +74,7 @@ func TestCheckpointStateRestore(t *testing.T) { "container2": "1-3" } }, - "checksum": 3415933391 + "checksum": 3610638499 }`, "none", containermap.ContainerMap{}, @@ -116,7 +116,7 @@ func TestCheckpointStateRestore(t *testing.T) { "policyName": "other", "defaultCPUSet": "1-3", "entries": {}, - "checksum": 698611581 + "checksum": 1394507217 }`, "none", containermap.ContainerMap{}, @@ -129,7 +129,7 @@ func TestCheckpointStateRestore(t *testing.T) { "policyName": "none", "defaultCPUSet": "1.3", "entries": {}, - "checksum": 1966990140 + "checksum": 3021697696 }`, "none", containermap.ContainerMap{}, @@ -147,7 +147,7 @@ func TestCheckpointStateRestore(t *testing.T) { "container2": "asd" } }, - "checksum": 3082925826 + "checksum": 962272150 }`, "none", containermap.ContainerMap{},