mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-08-04 09:49:50 +00:00
retry StopPodSandbox on Network teardown failure
This commit is contained in:
parent
012acad32e
commit
3cc837878f
@ -115,8 +115,10 @@ func (ds *dockerService) RunPodSandbox(config *runtimeapi.PodSandboxConfig) (str
|
|||||||
// after us?
|
// after us?
|
||||||
func (ds *dockerService) StopPodSandbox(podSandboxID string) error {
|
func (ds *dockerService) StopPodSandbox(podSandboxID string) error {
|
||||||
var namespace, name string
|
var namespace, name string
|
||||||
|
var checkpointErr, statusErr error
|
||||||
needNetworkTearDown := false
|
needNetworkTearDown := false
|
||||||
|
|
||||||
|
// Try to retrieve sandbox information from docker daemon or sandbox checkpoint
|
||||||
status, statusErr := ds.PodSandboxStatus(podSandboxID)
|
status, statusErr := ds.PodSandboxStatus(podSandboxID)
|
||||||
if statusErr == nil {
|
if statusErr == nil {
|
||||||
nsOpts := status.GetLinux().GetNamespaces().GetOptions()
|
nsOpts := status.GetLinux().GetNamespaces().GetOptions()
|
||||||
@ -125,36 +127,53 @@ func (ds *dockerService) StopPodSandbox(podSandboxID string) error {
|
|||||||
namespace = m.Namespace
|
namespace = m.Namespace
|
||||||
name = m.Name
|
name = m.Name
|
||||||
} else {
|
} else {
|
||||||
checkpoint, err := ds.checkpointHandler.GetCheckpoint(podSandboxID)
|
var checkpoint *PodSandboxCheckpoint
|
||||||
if err != nil {
|
checkpoint, checkpointErr = ds.checkpointHandler.GetCheckpoint(podSandboxID)
|
||||||
glog.Errorf("Failed to get checkpoint for sandbox %q: %v", podSandboxID, err)
|
|
||||||
return fmt.Errorf("failed to get sandbox status: %v", statusErr)
|
// Proceed if both sandbox container and checkpoint could not be found. This means that following
|
||||||
|
// actions will only have sandbox ID and not have pod namespace and name information.
|
||||||
|
// Return an error if any other, unexpected error is encountered.
|
||||||
|
if checkpointErr != nil {
|
||||||
|
if dockertools.IsContainerNotFoundError(statusErr) && checkpointErr == errors.CheckpointNotFoundError {
|
||||||
|
glog.Warningf("Both sandbox container and checkpoint for id %q could not be found. "+
|
||||||
|
"Proceed without further sandbox information.", podSandboxID)
|
||||||
|
} else {
|
||||||
|
return utilerrors.NewAggregate([]error{
|
||||||
|
fmt.Errorf("failed to get checkpoint for sandbox %q: %v", podSandboxID, checkpointErr),
|
||||||
|
fmt.Errorf("failed to get sandbox status: %v", statusErr)})
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
namespace = checkpoint.Namespace
|
namespace = checkpoint.Namespace
|
||||||
name = checkpoint.Name
|
name = checkpoint.Name
|
||||||
|
}
|
||||||
|
|
||||||
// Always trigger network plugin to tear down
|
// Always trigger network plugin to tear down
|
||||||
needNetworkTearDown = true
|
needNetworkTearDown = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// WARNING: The following operations make the following assumptions:
|
||||||
|
// 1. kubelet will retry on any error returned by StopPodSandbox.
|
||||||
|
// 2. tearing down network and stopping sandbox container can succeed in any sequence.
|
||||||
|
// This depends on the implementation detail of network plugin and proper error handling.
|
||||||
|
// For kubenet, if tearing down network failed and sandbox container is stopped, kubelet
|
||||||
|
// will retry. On retry, kubenet will not be able to retrieve network namespace of the sandbox
|
||||||
|
// since it is stopped. With empty network namespace, CNI bridge plugin will conduct best
|
||||||
|
// effort clean up and will not return error.
|
||||||
|
errList := []error{}
|
||||||
if needNetworkTearDown {
|
if needNetworkTearDown {
|
||||||
cID := kubecontainer.BuildContainerID(runtimeName, podSandboxID)
|
cID := kubecontainer.BuildContainerID(runtimeName, podSandboxID)
|
||||||
if err := ds.networkPlugin.TearDownPod(namespace, name, cID); err != nil {
|
if err := ds.networkPlugin.TearDownPod(namespace, name, cID); err != nil {
|
||||||
// TODO: Figure out a way to retry this error. We can't
|
errList = append(errList, fmt.Errorf("failed to teardown sandbox %q for pod %s/%s: %v", podSandboxID, namespace, name, err))
|
||||||
// right now because the plugin throws errors when it doesn't find
|
|
||||||
// eth0, which might not exist for various reasons (setup failed,
|
|
||||||
// conf changed etc). In theory, it should teardown everything else
|
|
||||||
// so there's no need to retry.
|
|
||||||
glog.Errorf("Failed to teardown sandbox %q for pod %s/%s: %v", podSandboxID, namespace, name, err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if err := ds.client.StopContainer(podSandboxID, defaultSandboxGracePeriod); err != nil {
|
if err := ds.client.StopContainer(podSandboxID, defaultSandboxGracePeriod); err != nil {
|
||||||
glog.Errorf("Failed to stop sandbox %q: %v", podSandboxID, err)
|
glog.Errorf("Failed to stop sandbox %q: %v", podSandboxID, err)
|
||||||
// Do not return error if the container does not exist
|
// Do not return error if the container does not exist
|
||||||
if !dockertools.IsContainerNotFoundError(err) {
|
if !dockertools.IsContainerNotFoundError(err) {
|
||||||
return err
|
errList = append(errList, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nil
|
return utilerrors.NewAggregate(errList)
|
||||||
// TODO: Stop all running containers in the sandbox.
|
// TODO: Stop all running containers in the sandbox.
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user