mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-30 15:05:27 +00:00
Merge pull request #41434 from freehan/cri-kubenet-error
Automatic merge from submit-queue (batch tested with PRs 41531, 40417, 41434) [CRI] beef up network teardown in StopPodSandbox 1. Added CheckpointNotFound error to allow dockershim to conduct error handling 2. Retry network teardown if failed ref: https://github.com/kubernetes/kubernetes/issues/41225
This commit is contained in:
commit
11bf535e03
@ -34,6 +34,7 @@ go_library(
|
||||
"//pkg/kubelet/cm:go_default_library",
|
||||
"//pkg/kubelet/container:go_default_library",
|
||||
"//pkg/kubelet/dockershim/cm:go_default_library",
|
||||
"//pkg/kubelet/dockershim/errors:go_default_library",
|
||||
"//pkg/kubelet/dockertools:go_default_library",
|
||||
"//pkg/kubelet/dockertools/securitycontext:go_default_library",
|
||||
"//pkg/kubelet/leaky:go_default_library",
|
||||
@ -83,6 +84,7 @@ go_test(
|
||||
"//pkg/kubelet/api/v1alpha1/runtime:go_default_library",
|
||||
"//pkg/kubelet/container:go_default_library",
|
||||
"//pkg/kubelet/container/testing:go_default_library",
|
||||
"//pkg/kubelet/dockershim/errors:go_default_library",
|
||||
"//pkg/kubelet/dockershim/testing:go_default_library",
|
||||
"//pkg/kubelet/dockertools:go_default_library",
|
||||
"//pkg/kubelet/dockertools/securitycontext:go_default_library",
|
||||
@ -113,6 +115,7 @@ filegroup(
|
||||
srcs = [
|
||||
":package-srcs",
|
||||
"//pkg/kubelet/dockershim/cm:all-srcs",
|
||||
"//pkg/kubelet/dockershim/errors:all-srcs",
|
||||
"//pkg/kubelet/dockershim/remote:all-srcs",
|
||||
"//pkg/kubelet/dockershim/testing:all-srcs",
|
||||
],
|
||||
|
@ -23,6 +23,8 @@ import (
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -40,6 +42,7 @@ type CheckpointStore interface {
|
||||
// Write persists a checkpoint with key
|
||||
Write(key string, data []byte) error
|
||||
// Read retrieves a checkpoint with key
|
||||
// Read must return CheckpointNotFoundError if checkpoint is not found
|
||||
Read(key string) ([]byte, error)
|
||||
// Delete deletes a checkpoint with key
|
||||
// Delete must not return error if checkpoint does not exist
|
||||
@ -75,7 +78,11 @@ func (fstore *FileStore) Read(key string) ([]byte, error) {
|
||||
if err := validateKey(key); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return ioutil.ReadFile(fstore.getCheckpointPath(key))
|
||||
bytes, err := ioutil.ReadFile(fstore.getCheckpointPath(key))
|
||||
if os.IsNotExist(err) {
|
||||
return bytes, errors.CheckpointNotFoundError
|
||||
}
|
||||
return bytes, err
|
||||
}
|
||||
|
||||
func (fstore *FileStore) Delete(key string) error {
|
||||
|
@ -23,6 +23,7 @@ import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
|
||||
)
|
||||
|
||||
func TestFileStore(t *testing.T) {
|
||||
@ -102,7 +103,7 @@ func TestFileStore(t *testing.T) {
|
||||
err = store.Delete(c.key)
|
||||
assert.NoError(t, err)
|
||||
_, err = store.Read(c.key)
|
||||
assert.Error(t, err)
|
||||
assert.EqualValues(t, errors.CheckpointNotFoundError, err)
|
||||
}
|
||||
|
||||
// Test delete non existed checkpoint
|
||||
|
@ -23,6 +23,7 @@ import (
|
||||
"path/filepath"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
|
||||
hashutil "k8s.io/kubernetes/pkg/util/hash"
|
||||
)
|
||||
|
||||
@ -34,8 +35,6 @@ const (
|
||||
schemaVersion = "v1"
|
||||
)
|
||||
|
||||
var CorruptCheckpointError = fmt.Errorf("checkpoint is corrupted.")
|
||||
|
||||
type Protocol string
|
||||
|
||||
// PortMapping is the port mapping configurations of a sandbox.
|
||||
@ -108,11 +107,11 @@ func (handler *PersistentCheckpointHandler) GetCheckpoint(podSandboxID string) (
|
||||
err = json.Unmarshal(blob, &checkpoint)
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to unmarshal checkpoint %q. Checkpoint content: %q. ErrMsg: %v", podSandboxID, string(blob), err)
|
||||
return &checkpoint, CorruptCheckpointError
|
||||
return &checkpoint, errors.CorruptCheckpointError
|
||||
}
|
||||
if checkpoint.CheckSum != calculateChecksum(checkpoint) {
|
||||
glog.Errorf("Checksum of checkpoint %q is not valid", podSandboxID)
|
||||
return &checkpoint, CorruptCheckpointError
|
||||
return &checkpoint, errors.CorruptCheckpointError
|
||||
}
|
||||
return &checkpoint, nil
|
||||
}
|
||||
|
@ -27,6 +27,7 @@ import (
|
||||
utilerrors "k8s.io/apimachinery/pkg/util/errors"
|
||||
runtimeapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/runtime"
|
||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockertools"
|
||||
"k8s.io/kubernetes/pkg/kubelet/qos"
|
||||
"k8s.io/kubernetes/pkg/kubelet/types"
|
||||
@ -114,8 +115,10 @@ func (ds *dockerService) RunPodSandbox(config *runtimeapi.PodSandboxConfig) (str
|
||||
// after us?
|
||||
func (ds *dockerService) StopPodSandbox(podSandboxID string) error {
|
||||
var namespace, name string
|
||||
var checkpointErr, statusErr error
|
||||
needNetworkTearDown := false
|
||||
|
||||
// Try to retrieve sandbox information from docker daemon or sandbox checkpoint
|
||||
status, statusErr := ds.PodSandboxStatus(podSandboxID)
|
||||
if statusErr == nil {
|
||||
nsOpts := status.GetLinux().GetNamespaces().GetOptions()
|
||||
@ -124,36 +127,53 @@ func (ds *dockerService) StopPodSandbox(podSandboxID string) error {
|
||||
namespace = m.Namespace
|
||||
name = m.Name
|
||||
} else {
|
||||
checkpoint, err := ds.checkpointHandler.GetCheckpoint(podSandboxID)
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to get checkpoint for sandbox %q: %v", podSandboxID, err)
|
||||
return fmt.Errorf("failed to get sandbox status: %v", statusErr)
|
||||
var checkpoint *PodSandboxCheckpoint
|
||||
checkpoint, checkpointErr = ds.checkpointHandler.GetCheckpoint(podSandboxID)
|
||||
|
||||
// Proceed if both sandbox container and checkpoint could not be found. This means that following
|
||||
// actions will only have sandbox ID and not have pod namespace and name information.
|
||||
// Return error if encounter any unexpected error.
|
||||
if checkpointErr != nil {
|
||||
if dockertools.IsContainerNotFoundError(statusErr) && checkpointErr == errors.CheckpointNotFoundError {
|
||||
glog.Warningf("Both sandbox container and checkpoint for id %q could not be found. "+
|
||||
"Proceed without further sandbox information.", podSandboxID)
|
||||
} else {
|
||||
return utilerrors.NewAggregate([]error{
|
||||
fmt.Errorf("failed to get checkpoint for sandbox %q: %v", podSandboxID, checkpointErr),
|
||||
fmt.Errorf("failed to get sandbox status: %v", statusErr)})
|
||||
}
|
||||
} else {
|
||||
namespace = checkpoint.Namespace
|
||||
name = checkpoint.Name
|
||||
}
|
||||
namespace = checkpoint.Namespace
|
||||
name = checkpoint.Name
|
||||
|
||||
// Always trigger network plugin to tear down
|
||||
needNetworkTearDown = true
|
||||
}
|
||||
|
||||
// WARNING: The following operations made the following assumption:
|
||||
// 1. kubelet will retry on any error returned by StopPodSandbox.
|
||||
// 2. tearing down network and stopping sandbox container can succeed in any sequence.
|
||||
// This depends on the implementation detail of network plugin and proper error handling.
|
||||
// For kubenet, if tearing down network failed and sandbox container is stopped, kubelet
|
||||
// will retry. On retry, kubenet will not be able to retrieve network namespace of the sandbox
|
||||
// since it is stopped. With empty network namespcae, CNI bridge plugin will conduct best
|
||||
// effort clean up and will not return error.
|
||||
errList := []error{}
|
||||
if needNetworkTearDown {
|
||||
cID := kubecontainer.BuildContainerID(runtimeName, podSandboxID)
|
||||
if err := ds.networkPlugin.TearDownPod(namespace, name, cID); err != nil {
|
||||
// TODO: Figure out a way to retry this error. We can't
|
||||
// right now because the plugin throws errors when it doesn't find
|
||||
// eth0, which might not exist for various reasons (setup failed,
|
||||
// conf changed etc). In theory, it should teardown everything else
|
||||
// so there's no need to retry.
|
||||
glog.Errorf("Failed to teardown sandbox %q for pod %s/%s: %v", podSandboxID, namespace, name, err)
|
||||
errList = append(errList, fmt.Errorf("failed to teardown sandbox %q for pod %s/%s: %v", podSandboxID, namespace, name, err))
|
||||
}
|
||||
}
|
||||
if err := ds.client.StopContainer(podSandboxID, defaultSandboxGracePeriod); err != nil {
|
||||
glog.Errorf("Failed to stop sandbox %q: %v", podSandboxID, err)
|
||||
// Do not return error if the container does not exist
|
||||
if !dockertools.IsContainerNotFoundError(err) {
|
||||
return err
|
||||
errList = append(errList, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
return utilerrors.NewAggregate(errList)
|
||||
// TODO: Stop all running containers in the sandbox.
|
||||
}
|
||||
|
||||
@ -357,7 +377,7 @@ func (ds *dockerService) ListPodSandbox(filter *runtimeapi.PodSandboxFilter) ([]
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to retrieve checkpoint for sandbox %q: %v", id, err)
|
||||
|
||||
if err == CorruptCheckpointError {
|
||||
if err == errors.CorruptCheckpointError {
|
||||
glog.V(2).Info("Removing corrupted checkpoint %q: %+v", id, *checkpoint)
|
||||
ds.checkpointHandler.RemoveCheckpoint(id)
|
||||
}
|
||||
|
@ -32,6 +32,7 @@ import (
|
||||
kubecm "k8s.io/kubernetes/pkg/kubelet/cm"
|
||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/cm"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockertools"
|
||||
"k8s.io/kubernetes/pkg/kubelet/network"
|
||||
"k8s.io/kubernetes/pkg/kubelet/network/cni"
|
||||
@ -292,8 +293,14 @@ func (ds *dockerService) GetNetNS(podSandboxID string) (string, error) {
|
||||
func (ds *dockerService) GetPodPortMappings(podSandboxID string) ([]*hostport.PortMapping, error) {
|
||||
// TODO: get portmappings from docker labels for backward compatibility
|
||||
checkpoint, err := ds.checkpointHandler.GetCheckpoint(podSandboxID)
|
||||
// Return empty portMappings if checkpoint is not found
|
||||
if err != nil {
|
||||
return nil, err
|
||||
if err == errors.CheckpointNotFoundError {
|
||||
glog.Warningf("Failed to retrieve checkpoint for sandbox %q: %v", err)
|
||||
return nil, nil
|
||||
} else {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
portMappings := []*hostport.PortMapping{}
|
||||
|
27
pkg/kubelet/dockershim/errors/BUILD
Normal file
27
pkg/kubelet/dockershim/errors/BUILD
Normal file
@ -0,0 +1,27 @@
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
licenses(["notice"])
|
||||
|
||||
load(
|
||||
"@io_bazel_rules_go//go:def.bzl",
|
||||
"go_library",
|
||||
)
|
||||
|
||||
go_library(
|
||||
name = "go_default_library",
|
||||
srcs = ["errors.go"],
|
||||
tags = ["automanaged"],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "package-srcs",
|
||||
srcs = glob(["**"]),
|
||||
tags = ["automanaged"],
|
||||
visibility = ["//visibility:private"],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "all-srcs",
|
||||
srcs = [":package-srcs"],
|
||||
tags = ["automanaged"],
|
||||
)
|
22
pkg/kubelet/dockershim/errors/errors.go
Normal file
22
pkg/kubelet/dockershim/errors/errors.go
Normal file
@ -0,0 +1,22 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package errors
|
||||
|
||||
import "fmt"
|
||||
|
||||
var CorruptCheckpointError = fmt.Errorf("checkpoint is corrupted.")
|
||||
var CheckpointNotFoundError = fmt.Errorf("checkpoint is not found.")
|
@ -11,6 +11,7 @@ go_library(
|
||||
name = "go_default_library",
|
||||
srcs = ["util.go"],
|
||||
tags = ["automanaged"],
|
||||
deps = ["//pkg/kubelet/dockershim/errors:go_default_library"],
|
||||
)
|
||||
|
||||
filegroup(
|
||||
|
@ -17,8 +17,9 @@ limitations under the License.
|
||||
package testing
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
|
||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/errors"
|
||||
)
|
||||
|
||||
// MemStore is an implementation of CheckpointStore interface which stores checkpoint in memory.
|
||||
@ -43,7 +44,7 @@ func (mstore *MemStore) Read(key string) ([]byte, error) {
|
||||
defer mstore.Unlock()
|
||||
data, ok := mstore.mem[key]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("checkpoint %q could not be found", key)
|
||||
return nil, errors.CheckpointNotFoundError
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user