Merge pull request #11791 from mbforbes/restartLogging

Improve logging and retries for a gcloud command
This commit is contained in:
Jerzy Szczepkowski 2015-08-21 10:29:36 +02:00
commit 958191f72a

View File

@ -302,19 +302,13 @@ var _ = Describe("Skipped", func() {
Logf("Node template %s is still in use; not cleaning up", tmplBefore) Logf("Node template %s is still in use; not cleaning up", tmplBefore)
return return
} }
// TODO(mbforbes): Distinguish between transient failures
// and "cannot delete--in use" errors and retry on the
// former.
// TODO(mbforbes): Call this with retryCmd().
Logf("Deleting node template %s", tmplBefore) Logf("Deleting node template %s", tmplBefore)
o, err := exec.Command("gcloud", "compute", "instance-templates", if _, _, err := retryCmd("gcloud", "compute", "instance-templates",
fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID), fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
"delete", "delete",
tmplBefore).CombinedOutput() tmplBefore); err != nil {
if err != nil { Logf("gcloud compute instance-templates delete %s call failed with err: %v", tmplBefore, err)
Logf("gcloud compute instance-templates delete %s call failed with err: %v, output: %s", Logf("May have leaked instance template %q", tmplBefore)
tmplBefore, err, string(o))
Logf("May have leaked %s", tmplBefore)
} }
} }
}) })
@ -522,19 +516,19 @@ func migTemplate() (string, error) {
var errLast error var errLast error
var templ string var templ string
key := "instanceTemplate" key := "instanceTemplate"
// TODO(mbforbes): Refactor this to use cluster_upgrade.go:retryCmd(...)
if wait.Poll(poll, singleCallTimeout, func() (bool, error) { if wait.Poll(poll, singleCallTimeout, func() (bool, error) {
// TODO(mbforbes): make this hit the compute API directly instead of // TODO(mbforbes): make this hit the compute API directly instead of
// shelling out to gcloud. // shelling out to gcloud.
o, err := exec.Command("gcloud", "compute", "instance-groups", "managed", // An `instance-groups managed describe` call outputs what we want to stdout.
"describe", testContext.CloudConfig.NodeInstanceGroup, output, _, err := retryCmd("gcloud", "compute", "instance-groups", "managed",
fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID), fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone)).CombinedOutput() fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
"describe",
testContext.CloudConfig.NodeInstanceGroup)
if err != nil { if err != nil {
errLast = fmt.Errorf("gcloud compute instance-groups managed describe call failed with err: %v", err) errLast = fmt.Errorf("gcloud compute instance-groups managed describe call failed with err: %v", err)
return false, nil return false, nil
} }
output := string(o)
// The 'describe' call probably succeeded; parse the output and try to // The 'describe' call probably succeeded; parse the output and try to
// find the line that looks like "instanceTemplate: url/to/<templ>" and // find the line that looks like "instanceTemplate: url/to/<templ>" and
@ -560,13 +554,13 @@ func migRollingUpdateStart(templ string, nt time.Duration) (string, error) {
var errLast error var errLast error
var id string var id string
prefix, suffix := "Started [", "]." prefix, suffix := "Started [", "]."
// TODO(mbforbes): Refactor this to use cluster_upgrade.go:retryCmd(...)
if err := wait.Poll(poll, singleCallTimeout, func() (bool, error) { if err := wait.Poll(poll, singleCallTimeout, func() (bool, error) {
// TODO(mbforbes): make this hit the compute API directly instead of // TODO(mbforbes): make this hit the compute API directly instead of
// shelling out to gcloud. // shelling out to gcloud.
// NOTE(mbforbes): If you are changing this gcloud command, update // NOTE(mbforbes): If you are changing this gcloud command, update
// cluster/gce/upgrade.sh to match this EXACTLY. // cluster/gce/upgrade.sh to match this EXACTLY.
o, err := exec.Command("gcloud", append(migUdpateCmdBase(), // A `rolling-updates start` call outputs what we want to stderr.
_, output, err := retryCmd("gcloud", append(migUdpateCmdBase(),
"rolling-updates", "rolling-updates",
fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID), fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone), fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
@ -580,12 +574,11 @@ func migRollingUpdateStart(templ string, nt time.Duration) (string, error) {
// --max-num-concurrent-instances. // --max-num-concurrent-instances.
fmt.Sprintf("--max-num-concurrent-instances=%d", 1), fmt.Sprintf("--max-num-concurrent-instances=%d", 1),
fmt.Sprintf("--max-num-failed-instances=%d", 0), fmt.Sprintf("--max-num-failed-instances=%d", 0),
fmt.Sprintf("--min-instance-update-time=%ds", 0))...).CombinedOutput() fmt.Sprintf("--min-instance-update-time=%ds", 0))...)
if err != nil { if err != nil {
errLast = fmt.Errorf("rolling-updates call failed with err: %v", err) errLast = fmt.Errorf("rolling-updates call failed with err: %v", err)
return false, nil return false, nil
} }
output := string(o)
// The 'start' call probably succeeded; parse the output and try to find // The 'start' call probably succeeded; parse the output and try to find
// the line that looks like "Started [url/to/<id>]." and return <id>. // the line that looks like "Started [url/to/<id>]." and return <id>.
@ -637,20 +630,19 @@ func migRollingUpdatePoll(id string, nt time.Duration) error {
start, timeout := time.Now(), nt*time.Duration(testContext.CloudConfig.NumNodes) start, timeout := time.Now(), nt*time.Duration(testContext.CloudConfig.NumNodes)
var errLast error var errLast error
Logf("Waiting up to %v for MIG rolling update to complete.", timeout) Logf("Waiting up to %v for MIG rolling update to complete.", timeout)
// TODO(mbforbes): Refactor this to use cluster_upgrade.go:retryCmd(...)
if wait.Poll(restartPoll, timeout, func() (bool, error) { if wait.Poll(restartPoll, timeout, func() (bool, error) {
o, err := exec.Command("gcloud", append(migUdpateCmdBase(), // A `rolling-updates describe` call outputs what we want to stdout.
output, _, err := retryCmd("gcloud", append(migUdpateCmdBase(),
"rolling-updates", "rolling-updates",
fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID), fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone), fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
"describe", "describe",
id)...).CombinedOutput() id)...)
if err != nil { if err != nil {
errLast = fmt.Errorf("Error calling rolling-updates describe %s: %v", id, err) errLast = fmt.Errorf("Error calling rolling-updates describe %s: %v", id, err)
Logf("%v", errLast) Logf("%v", errLast)
return false, nil return false, nil
} }
output := string(o)
// The 'describe' call probably succeeded; parse the output and try to // The 'describe' call probably succeeded; parse the output and try to
// find the line that looks like "status: <status>" and see whether it's // find the line that looks like "status: <status>" and see whether it's