1
0
mirror of https://github.com/rancher/rke.git synced 2025-05-06 15:27:56 +00:00

Add retry logic to Docker interactions

This commit is contained in:
Sebastiaan van Steenis 2019-06-11 11:39:45 +02:00 committed by Alena Prokharchyk
parent 60d36022f7
commit 173669c8a5
3 changed files with 186 additions and 91 deletions
docker
pki
services

View File

@ -31,6 +31,8 @@ const (
RestartTimeout = 5
// StopTimeout in seconds
StopTimeout = 5
// RetryCount is the amount of retries for Docker operations
RetryCount = 3
)
var K8sDockerVersions = map[string][]string{
@ -54,19 +56,19 @@ func DoRunContainer(ctx context.Context, dClient *client.Client, imageCfg *conta
if dClient == nil {
return fmt.Errorf("[%s] Failed to run container: docker client is nil for container [%s] on host [%s]", plane, containerName, hostname)
}
container, err := dClient.ContainerInspect(ctx, containerName)
container, err := InspectContainer(ctx, dClient, hostname, containerName)
if err != nil {
if !client.IsErrNotFound(err) {
return err
}
if err := UseLocalOrPull(ctx, dClient, hostname, imageCfg.Image, plane, prsMap); err != nil {
return err
return fmt.Errorf("Failed to pull image [%s] on host [%s]: %v", imageCfg.Image, hostname, err)
}
resp, err := dClient.ContainerCreate(ctx, imageCfg, hostCfg, nil, containerName)
_, err := CreateContainer(ctx, dClient, hostname, containerName, imageCfg, hostCfg)
if err != nil {
return fmt.Errorf("Failed to create [%s] container on host [%s]: %v", containerName, hostname, err)
}
if err := dClient.ContainerStart(ctx, resp.ID, types.ContainerStartOptions{}); err != nil {
if err := StartContainer(ctx, dClient, hostname, containerName); err != nil {
return fmt.Errorf("Failed to start [%s] container on host [%s]: %v", containerName, hostname, err)
}
log.Infof(ctx, "[%s] Successfully started [%s] container on host [%s]", plane, containerName, hostname)
@ -77,9 +79,9 @@ func DoRunContainer(ctx context.Context, dClient *client.Client, imageCfg *conta
// check if container is in a restarting loop
if container.State.Restarting {
logrus.Debugf("[%s] Container [%s] is in a restarting loop [%s]", plane, containerName, hostname)
restartTimeoutDuration := RestartTimeout * time.Second
if err := dClient.ContainerRestart(ctx, container.ID, &restartTimeoutDuration); err != nil {
return fmt.Errorf("Failed to start [%s] container on host [%s]: %v", containerName, hostname, err)
err = RestartContainer(ctx, dClient, hostname, containerName)
if err != nil {
return err
}
}
logrus.Debugf("[%s] Container [%s] is already running on host [%s]", plane, containerName, hostname)
@ -94,8 +96,8 @@ func DoRunContainer(ctx context.Context, dClient *client.Client, imageCfg *conta
}
// start if not running
logrus.Debugf("[%s] Starting stopped container [%s] on host [%s]", plane, containerName, hostname)
if err := dClient.ContainerStart(ctx, container.ID, types.ContainerStartOptions{}); err != nil {
logrus.Infof("[%s] Starting stopped container [%s] on host [%s]", plane, containerName, hostname)
if err := StartContainer(ctx, dClient, hostname, containerName); err != nil {
return fmt.Errorf("Failed to start [%s] container on host [%s]: %v", containerName, hostname, err)
}
log.Infof(ctx, "[%s] Successfully started [%s] container on host [%s]", plane, containerName, hostname)
@ -106,7 +108,7 @@ func DoRunOnetimeContainer(ctx context.Context, dClient *client.Client, imageCfg
if dClient == nil {
return fmt.Errorf("[%s] Failed to run container: docker client is nil for container [%s] on host [%s]", plane, containerName, hostname)
}
_, err := dClient.ContainerInspect(ctx, containerName)
_, err := InspectContainer(ctx, dClient, hostname, containerName)
if err != nil {
if !client.IsErrNotFound(err) {
return err
@ -114,11 +116,11 @@ func DoRunOnetimeContainer(ctx context.Context, dClient *client.Client, imageCfg
if err := UseLocalOrPull(ctx, dClient, hostname, imageCfg.Image, plane, prsMap); err != nil {
return err
}
resp, err := dClient.ContainerCreate(ctx, imageCfg, hostCfg, nil, containerName)
_, err := CreateContainer(ctx, dClient, hostname, containerName, imageCfg, hostCfg)
if err != nil {
return fmt.Errorf("Failed to create [%s] container on host [%s]: %v", containerName, hostname, err)
}
if err := dClient.ContainerStart(ctx, resp.ID, types.ContainerStartOptions{}); err != nil {
if err := StartContainer(ctx, dClient, hostname, containerName); err != nil {
return fmt.Errorf("Failed to start [%s] container on host [%s]: %v", containerName, hostname, err)
}
log.Infof(ctx, "Successfully started [%s] container on host [%s]", containerName, hostname)
@ -182,8 +184,7 @@ func DoRemoveContainer(ctx context.Context, dClient *client.Client, containerNam
return fmt.Errorf("Failed to remove container: docker client is nil for container [%s] on host [%s]", containerName, hostname)
}
logrus.Debugf("[remove/%s] Checking if container is running on host [%s]", containerName, hostname)
// not using the wrapper to check if the error is a NotFound error
_, err := dClient.ContainerInspect(ctx, containerName)
_, err := InspectContainer(ctx, dClient, hostname, containerName)
if err != nil {
if client.IsErrNotFound(err) {
logrus.Debugf("[remove/%s] Container doesn't exist on host [%s]", containerName, hostname)
@ -204,11 +205,19 @@ func IsContainerRunning(ctx context.Context, dClient *client.Client, hostname st
if dClient == nil {
return false, fmt.Errorf("Failed to check if container is running: docker client is nil for container [%s] on host [%s]", containerName, hostname)
}
logrus.Debugf("Checking if container [%s] is running on host [%s]", containerName, hostname)
containers, err := dClient.ContainerList(ctx, types.ContainerListOptions{All: all})
var containers []types.Container
var err error
for i := 1; i <= RetryCount; i++ {
logrus.Infof("Checking if container [%s] is running on host [%s], try #%d", containerName, hostname, i)
containers, err = dClient.ContainerList(ctx, types.ContainerListOptions{All: all})
if err != nil {
logrus.Warnf("Error checking if container [%s] is running on host [%s]: %v", containerName, hostname, err)
continue
}
break
}
if err != nil {
return false, fmt.Errorf("Can't get Docker containers for host [%s]: %v", hostname, err)
return false, fmt.Errorf("Error checking if container [%s] is running on host [%s]: %v", containerName, hostname, err)
}
for _, container := range containers {
if len(container.Names) != 0 && container.Names[0] == "/"+containerName {
@ -219,20 +228,27 @@ func IsContainerRunning(ctx context.Context, dClient *client.Client, hostname st
}
func localImageExists(ctx context.Context, dClient *client.Client, hostname string, containerImage string) (bool, error) {
logrus.Debugf("Checking if image [%s] exists on host [%s]", containerImage, hostname)
_, _, err := dClient.ImageInspectWithRaw(ctx, containerImage)
if err != nil {
if client.IsErrNotFound(err) {
logrus.Debugf("Image [%s] does not exist on host [%s]: %v", containerImage, hostname, err)
return false, nil
var err error
for i := 1; i <= RetryCount; i++ {
logrus.Infof("Checking if image [%s] exists on host [%s], try #%d", containerImage, hostname, i)
_, _, err = dClient.ImageInspectWithRaw(ctx, containerImage)
if err != nil {
if client.IsErrNotFound(err) {
logrus.Infof("Image [%s] does not exist on host [%s]: %v", containerImage, hostname, err)
return false, nil
}
logrus.Warnf("Error checking if image [%s] exists on host [%s]: %v", containerImage, hostname, err)
continue
}
return false, fmt.Errorf("Error checking if image [%s] exists on host [%s]: %v", containerImage, hostname, err)
logrus.Infof("Image [%s] exists on host [%s]", containerImage, hostname)
return true, nil
}
logrus.Debugf("Image [%s] exists on host [%s]", containerImage, hostname)
return true, nil
return false, fmt.Errorf("Error checking if image [%s] exists on host [%s]: %v", containerImage, hostname, err)
}
func pullImage(ctx context.Context, dClient *client.Client, hostname string, containerImage string, prsMap map[string]v3.PrivateRegistry) error {
var out io.ReadCloser
var err error
pullOptions := types.ImagePullOptions{}
regAuth, prURL, err := GetImageRegistryConfig(containerImage, prsMap)
@ -244,119 +260,190 @@ func pullImage(ctx context.Context, dClient *client.Client, hostname string, con
}
pullOptions.RegistryAuth = regAuth
out, err := dClient.ImagePull(ctx, containerImage, pullOptions)
if err != nil {
return fmt.Errorf("Can't pull Docker image [%s] for host [%s]: %v", containerImage, hostname, err)
// Retry up to RetryCount times to pull image
for i := 1; i <= RetryCount; i++ {
logrus.Infof("Pulling image [%s] on host [%s], try #%d", containerImage, hostname, i)
out, err = dClient.ImagePull(ctx, containerImage, pullOptions)
if err != nil {
logrus.Warnf("Can't pull Docker image [%s] on host [%s]: %v", containerImage, hostname, err)
continue
}
defer out.Close()
if logrus.GetLevel() == logrus.DebugLevel {
io.Copy(os.Stdout, out)
} else {
io.Copy(ioutil.Discard, out)
}
return nil
}
defer out.Close()
if logrus.GetLevel() == logrus.DebugLevel {
io.Copy(os.Stdout, out)
} else {
io.Copy(ioutil.Discard, out)
}
return nil
// If the for loop does not return, return the error
return err
}
func UseLocalOrPull(ctx context.Context, dClient *client.Client, hostname string, containerImage string, plane string, prsMap map[string]v3.PrivateRegistry) error {
if dClient == nil {
return fmt.Errorf("[%s] Failed to use local image or pull: docker client is nil for container [%s] on host [%s]", plane, containerImage, hostname)
}
logrus.Debugf("[%s] Checking image [%s] on host [%s]", plane, containerImage, hostname)
imageExists, err := localImageExists(ctx, dClient, hostname, containerImage)
if err != nil {
return err
var imageExists bool
var err error
// Retry up to RetryCount times to see if image exists
for i := 1; i <= RetryCount; i++ {
// Increasing wait time on retry, but not on the first two try
if i > 2 {
time.Sleep(time.Duration(i) * time.Second)
}
// Check for local image and if error, log and retry
imageExists, err = localImageExists(ctx, dClient, hostname, containerImage)
if err != nil {
logrus.Warnf("[%s] %v", plane, err)
continue
}
// Return if image exists to prevent pulling
if imageExists {
return nil
}
// Try pulling and if error, log and retry
err = pullImage(ctx, dClient, hostname, containerImage, prsMap)
if err != nil {
logrus.Warnf("[%s] Can't pull Docker image [%s] on host [%s]: %v", plane, containerImage, hostname, err)
continue
}
}
if imageExists {
logrus.Debugf("[%s] No pull necessary, image [%s] exists on host [%s]", plane, containerImage, hostname)
return nil
}
log.Infof(ctx, "[%s] Pulling image [%s] on host [%s]", plane, containerImage, hostname)
if err := pullImage(ctx, dClient, hostname, containerImage, prsMap); err != nil {
return err
}
log.Infof(ctx, "[%s] Successfully pulled image [%s] on host [%s]", plane, containerImage, hostname)
return nil
// If the for loop does not return, return the error
return err
}
func RemoveContainer(ctx context.Context, dClient *client.Client, hostname string, containerName string) error {
if dClient == nil {
return fmt.Errorf("Failed to remove container: docker client is nil for container [%s] on host [%s]", containerName, hostname)
}
err := dClient.ContainerRemove(ctx, containerName, types.ContainerRemoveOptions{Force: true, RemoveVolumes: true})
if err != nil {
return fmt.Errorf("Can't remove Docker container [%s] for host [%s]: %v", containerName, hostname, err)
var err error
// Retry up to RetryCount times to see if image exists
for i := 1; i <= RetryCount; i++ {
logrus.Infof("Removing container [%s] on host [%s], try #%d", containerName, hostname, i)
err = dClient.ContainerRemove(ctx, containerName, types.ContainerRemoveOptions{Force: true, RemoveVolumes: true})
if err != nil {
logrus.Warningf("Can't remove Docker container [%s] for host [%s]: %v", containerName, hostname, err)
continue
}
return nil
}
return nil
return err
}
func RestartContainer(ctx context.Context, dClient *client.Client, hostname, containerName string) error {
if dClient == nil {
return fmt.Errorf("Failed to restart container: docker client is nil for container [%s] on host [%s]", containerName, hostname)
}
var err error
restartTimeout := RestartTimeout * time.Second
err := dClient.ContainerRestart(ctx, containerName, &restartTimeout)
if err != nil {
return fmt.Errorf("Can't restart Docker container [%s] for host [%s]: %v", containerName, hostname, err)
// Retry up to RetryCount times to see if image exists
for i := 1; i <= RetryCount; i++ {
logrus.Infof("Restarting container [%s] on host [%s], try #%d", containerName, hostname, i)
err = dClient.ContainerRestart(ctx, containerName, &restartTimeout)
if err != nil {
logrus.Warningf("Can't restart Docker container [%s] for host [%s]: %v", containerName, hostname, err)
continue
}
return nil
}
return nil
return err
}
func StopContainer(ctx context.Context, dClient *client.Client, hostname string, containerName string) error {
if dClient == nil {
return fmt.Errorf("Failed to stop container: docker client is nil for container [%s] on host [%s]", containerName, hostname)
}
var err error
// define the stop timeout
stopTimeoutDuration := StopTimeout * time.Second
logrus.Debugf("Stopping container [%s] on host [%s] with stopTimeoutDuration [%s]", containerName, hostname, stopTimeoutDuration)
err := dClient.ContainerStop(ctx, containerName, &stopTimeoutDuration)
if err != nil {
return fmt.Errorf("Can't stop Docker container [%s] for host [%s]: %v", containerName, hostname, err)
// Retry up to RetryCount times to see if image exists
for i := 1; i <= RetryCount; i++ {
logrus.Infof("Stopping container [%s] on host [%s] with stopTimeoutDuration [%s], try #%d", containerName, hostname, stopTimeoutDuration, i)
err := dClient.ContainerStop(ctx, containerName, &stopTimeoutDuration)
if err != nil {
logrus.Warningf("Can't stop Docker container [%s] for host [%s]: %v", containerName, hostname, err)
continue
}
return nil
}
return nil
return err
}
func RenameContainer(ctx context.Context, dClient *client.Client, hostname string, oldContainerName string, newContainerName string) error {
if dClient == nil {
return fmt.Errorf("Failed to rename container: docker client is nil for container [%s] on host [%s]", oldContainerName, hostname)
}
logrus.Debugf("Renaming container [%s] to [%s] on host [%s]", oldContainerName, newContainerName, hostname)
err := dClient.ContainerRename(ctx, oldContainerName, newContainerName)
if err != nil {
return fmt.Errorf("Can't rename Docker container [%s] for host [%s]: %v", oldContainerName, hostname, err)
var err error
// Retry up to RetryCount times to see if image exists
for i := 1; i <= RetryCount; i++ {
logrus.Infof("Renaming container [%s] to [%s] on host [%s], try #%d", oldContainerName, newContainerName, hostname, i)
err = dClient.ContainerRename(ctx, oldContainerName, newContainerName)
if err != nil {
logrus.Warningf("Can't rename Docker container [%s] to [%s] for host [%s]: %v", oldContainerName, newContainerName, hostname, err)
continue
}
return nil
}
return nil
return err
}
func StartContainer(ctx context.Context, dClient *client.Client, hostname string, containerName string) error {
if dClient == nil {
return fmt.Errorf("Failed to start container: docker client is nil for container [%s] on host [%s]", containerName, hostname)
}
if err := dClient.ContainerStart(ctx, containerName, types.ContainerStartOptions{}); err != nil {
return fmt.Errorf("Failed to start [%s] container on host [%s]: %v", containerName, hostname, err)
var err error
// Retry up to RetryCount times to see if image exists
for i := 1; i <= RetryCount; i++ {
logrus.Infof("Starting container [%s] on host [%s], try #%d", containerName, hostname, i)
err = dClient.ContainerStart(ctx, containerName, types.ContainerStartOptions{})
if err != nil {
logrus.Warningf("Can't start Docker container [%s] on host [%s]: %v", containerName, hostname, err)
continue
}
return nil
}
return nil
return err
}
func CreateContainer(ctx context.Context, dClient *client.Client, hostname string, containerName string, imageCfg *container.Config, hostCfg *container.HostConfig) (container.ContainerCreateCreatedBody, error) {
if dClient == nil {
return container.ContainerCreateCreatedBody{}, fmt.Errorf("Failed to create container: docker client is nil for container [%s] on host [%s]", containerName, hostname)
}
created, err := dClient.ContainerCreate(ctx, imageCfg, hostCfg, nil, containerName)
if err != nil {
return container.ContainerCreateCreatedBody{}, fmt.Errorf("Failed to create [%s] container on host [%s]: %v", containerName, hostname, err)
var err error
// Retry up to RetryCount times to see if image exists
for i := 1; i <= RetryCount; i++ {
created, err := dClient.ContainerCreate(ctx, imageCfg, hostCfg, nil, containerName)
if err != nil {
logrus.Warningf("Failed to create Docker container [%s] on host [%s]: %v", containerName, hostname, err)
continue
}
return created, nil
}
return created, nil
return container.ContainerCreateCreatedBody{}, fmt.Errorf("Failed to create Docker container [%s] on host [%s]: %v", containerName, hostname, err)
}
func InspectContainer(ctx context.Context, dClient *client.Client, hostname string, containerName string) (types.ContainerJSON, error) {
if dClient == nil {
return types.ContainerJSON{}, fmt.Errorf("Failed to inspect container: docker client is nil for container [%s] on host [%s]", containerName, hostname)
}
inspection, err := dClient.ContainerInspect(ctx, containerName)
if err != nil {
return types.ContainerJSON{}, fmt.Errorf("Failed to inspect [%s] container on host [%s]: %v", containerName, hostname, err)
var err error
// Retry up to RetryCount times to see if image exists
for i := 1; i <= RetryCount; i++ {
inspection, err := dClient.ContainerInspect(ctx, containerName)
if err != nil {
if client.IsErrNotFound(err) {
return types.ContainerJSON{}, err
}
logrus.Warningf("Failed to inspect Docker container [%s] on host [%s]: %v", containerName, hostname, err)
continue
}
return inspection, nil
}
return inspection, nil
return types.ContainerJSON{}, fmt.Errorf("Failed to inspect Docker container [%s] on host [%s]: %v", containerName, hostname, err)
}
func StopRenameContainer(ctx context.Context, dClient *client.Client, hostname string, oldContainerName string, newContainerName string) error {
@ -390,7 +477,7 @@ func WaitForContainer(ctx context.Context, dClient *client.Client, hostname stri
// 5 minutes timeout, especially for transferring snapshots
for retries := 0; retries < 300; retries++ {
log.Infof(ctx, "Waiting for [%s] container to exit on host [%s]", containerName, hostname)
container, err := dClient.ContainerInspect(ctx, containerName)
container, err := InspectContainer(ctx, dClient, hostname, containerName)
if err != nil {
return 1, fmt.Errorf("Could not inspect container [%s] on host [%s]: %s", containerName, hostname, err)
}
@ -483,7 +570,16 @@ func ReadContainerLogs(ctx context.Context, dClient *client.Client, containerNam
if dClient == nil {
return nil, fmt.Errorf("Failed reading container logs: docker client is nil for container [%s]", containerName)
}
return dClient.ContainerLogs(ctx, containerName, types.ContainerLogsOptions{Follow: follow, ShowStdout: true, ShowStderr: true, Timestamps: false, Tail: tail})
var err error
for i := 1; i <= RetryCount; i++ {
logs, err := dClient.ContainerLogs(ctx, containerName, types.ContainerLogsOptions{Follow: follow, ShowStdout: true, ShowStderr: true, Timestamps: false, Tail: tail})
if err != nil {
logrus.Warnf("Can't read container logs for container [%s]: %v", containerName, err)
continue
}
return logs, nil
}
return nil, err
}
func GetContainerLogsStdoutStderr(ctx context.Context, dClient *client.Client, containerName, tail string, follow bool) (string, string, error) {
@ -571,8 +667,7 @@ func DoRestartContainer(ctx context.Context, dClient *client.Client, containerNa
return fmt.Errorf("Failed to restart container: docker client is nil for container [%s] on host [%s]", containerName, hostname)
}
logrus.Debugf("[restart/%s] Checking if container is running on host [%s]", containerName, hostname)
// not using the wrapper to check if the error is a NotFound error
_, err := dClient.ContainerInspect(ctx, containerName)
_, err := InspectContainer(ctx, dClient, hostname, containerName)
if err != nil {
if client.IsErrNotFound(err) {
logrus.Debugf("[restart/%s] Container doesn't exist on host [%s]", containerName, hostname)

View File

@ -10,7 +10,6 @@ import (
"strings"
"time"
"github.com/docker/docker/api/types"
"github.com/docker/docker/api/types/container"
"github.com/rancher/rke/docker"
"github.com/rancher/rke/hosts"
@ -94,15 +93,15 @@ func doRunDeployer(ctx context.Context, host *hosts.Host, containerEnv []string,
},
Privileged: true,
}
resp, err := host.DClient.ContainerCreate(ctx, imageCfg, hostCfg, nil, CrtDownloaderContainer)
_, err = docker.CreateContainer(ctx, host.DClient, host.Address, CrtDownloaderContainer, imageCfg, hostCfg)
if err != nil {
return fmt.Errorf("Failed to create Certificates deployer container on host [%s]: %v", host.Address, err)
}
if err := host.DClient.ContainerStart(ctx, resp.ID, types.ContainerStartOptions{}); err != nil {
if err := docker.StartContainer(ctx, host.DClient, host.Address, CrtDownloaderContainer); err != nil {
return fmt.Errorf("Failed to start Certificates deployer container on host [%s]: %v", host.Address, err)
}
logrus.Debugf("[certificates] Successfully started Certificate deployer container: %s", resp.ID)
logrus.Debugf("[certificates] Successfully started Certificate deployer container: %s", CrtDownloaderContainer)
for {
isDeployerRunning, err := docker.IsContainerRunning(ctx, host.DClient, host.Address, CrtDownloaderContainer, false)
if err != nil {
@ -112,7 +111,7 @@ func doRunDeployer(ctx context.Context, host *hosts.Host, containerEnv []string,
time.Sleep(5 * time.Second)
continue
}
if err := host.DClient.ContainerRemove(ctx, resp.ID, types.ContainerRemoveOptions{RemoveVolumes: true}); err != nil {
if err := docker.RemoveContainer(ctx, host.DClient, host.Address, CrtDownloaderContainer); err != nil {
return fmt.Errorf("Failed to delete Certificates deployer container on host [%s]: %v", host.Address, err)
}
return nil

View File

@ -271,7 +271,6 @@ func IsEtcdMember(ctx context.Context, etcdHost *hosts.Host, etcdHosts []*hosts.
}
func RunEtcdSnapshotSave(ctx context.Context, etcdHost *hosts.Host, prsMap map[string]v3.PrivateRegistry, etcdSnapshotImage string, name string, once bool, es v3.ETCDService) error {
log.Infof(ctx, "[etcd] Saving snapshot [%s] on host [%s]", name, etcdHost.Address)
backupCmd := "etcd-backup"
restartPolicy := "always"
imageCfg := &container.Config{
@ -288,9 +287,11 @@ func RunEtcdSnapshotSave(ctx context.Context, etcdHost *hosts.Host, prsMap map[s
Image: etcdSnapshotImage,
}
if once {
log.Infof(ctx, "[etcd] Running snapshot save once on host [%s]", etcdHost.Address)
imageCfg.Cmd = append(imageCfg.Cmd, "--once")
restartPolicy = "no"
} else if es.BackupConfig == nil {
log.Infof(ctx, "[etcd] Running snapshot container [%s] on host [%s]", EtcdSnapshotOnceContainerName, etcdHost.Address)
imageCfg.Cmd = append(imageCfg.Cmd, "--retention="+es.Retention)
imageCfg.Cmd = append(imageCfg.Cmd, "--creation="+es.Creation)
}