diff --git a/test/e2e/util.go b/test/e2e/util.go
index d74490da938..0e3559d737d 100644
--- a/test/e2e/util.go
+++ b/test/e2e/util.go
@@ -50,6 +50,9 @@ const (
 	// Initial pod start can be delayed O(minutes) by slow docker pulls
 	// TODO: Make this 30 seconds once #4566 is resolved.
 	podStartTimeout = 5 * time.Minute
+
+	// String used to mark pod deletion
+	nonExist = "NonExist"
 )
 
 type TestContextType struct {
@@ -409,6 +412,71 @@ func testContainerOutputInNamespace(scenarioName string, c *client.Client, pod *
 	}
 }
 
+// podInfo contains pod information useful for debugging e2e tests.
+type podInfo struct {
+	oldHostname string
+	oldPhase    string
+	hostname    string
+	phase       string
+}
+
+// PodDiff is a map of pod name to podInfos
+type PodDiff map[string]*podInfo
+
+// Print formats and prints the given PodDiff.
+func (p PodDiff) Print(ignorePhases util.StringSet) {
+	for name, info := range p {
+		if ignorePhases.Has(info.phase) {
+			continue
+		}
+		if info.phase == nonExist {
+			Logf("Pod %v was deleted, had phase %v and host %v", name, info.oldPhase, info.oldHostname)
+			continue
+		}
+		phaseChange, hostChange := false, false
+		msg := fmt.Sprintf("Pod %v ", name)
+		if info.oldPhase != info.phase {
+			phaseChange = true
+			if info.oldPhase == nonExist {
+				msg += fmt.Sprintf("in phase %v ", info.phase)
+			} else {
+				msg += fmt.Sprintf("went from phase: %v -> %v ", info.oldPhase, info.phase)
+			}
+		}
+		if info.oldHostname != info.hostname {
+			hostChange = true
+			if info.oldHostname == nonExist || info.oldHostname == "" {
+				msg += fmt.Sprintf("assigned host %v ", info.hostname)
+			} else {
+				msg += fmt.Sprintf("went from host: %v -> %v ", info.oldHostname, info.hostname)
+			}
+		}
+		if phaseChange || hostChange {
+			Logf(msg)
+		}
+	}
+}
+
+// Diff computes a PodDiff given 2 lists of pods.
+func Diff(oldPods *api.PodList, curPods *api.PodList) PodDiff {
+	podInfoMap := PodDiff{}
+
+	// New pods will show up in the curPods list but not in oldPods. They have oldHostname/oldPhase == nonExist.
+	for _, pod := range curPods.Items {
+		podInfoMap[pod.Name] = &podInfo{hostname: pod.Spec.Host, phase: string(pod.Status.Phase), oldHostname: nonExist, oldPhase: nonExist}
+	}
+
+	// Deleted pods will show up in the oldPods list but not in curPods. They have hostname/phase == nonExist.
+	for _, pod := range oldPods.Items {
+		if info, ok := podInfoMap[pod.Name]; ok {
+			info.oldHostname, info.oldPhase = pod.Spec.Host, string(pod.Status.Phase)
+		} else {
+			podInfoMap[pod.Name] = &podInfo{hostname: nonExist, phase: nonExist, oldHostname: pod.Spec.Host, oldPhase: string(pod.Status.Phase)}
+		}
+	}
+	return podInfoMap
+}
+
 // RunRC Launches (and verifies correctness) of a Replication Controller
 // It will waits for all pods it spawns to become "Running".
 // It's the caller's responsibility to clean up externally (i.e. use the
@@ -420,8 +488,8 @@ func RunRC(c *client.Client, name string, ns, image string, replicas int) error
 	current := 0
 	same := 0
 
-	By(fmt.Sprintf("Creating replication controller %s in namespace %s", name, ns))
-	_, err := c.ReplicationControllers(ns).Create(&api.ReplicationController{
+	By(fmt.Sprintf("Creating replication controller %s", name))
+	rc := &api.ReplicationController{
 		ObjectMeta: api.ObjectMeta{
 			Name: name,
 		},
@@ -445,10 +513,12 @@ func RunRC(c *client.Client, name string, ns, image string, replicas int) error
 				},
 			},
 		},
-	})
+	}
+	_, err := c.ReplicationControllers(ns).Create(rc)
 	if err != nil {
 		return fmt.Errorf("Error creating replication controller: %v", err)
 	}
+	Logf("Created replication controller with name: %v, namespace: %v, replica count: %v", rc.Name, ns, rc.Spec.Replicas)
 
 	By(fmt.Sprintf("Making sure all %d replicas of rc %s in namespace %s exist", replicas, name, ns))
 	label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
@@ -483,27 +553,28 @@ func RunRC(c *client.Client, name string, ns, image string, replicas int) error
 	if current != replicas {
 		return fmt.Errorf("Controller %s: Only found %d replicas out of %d", name, current, replicas)
 	}
-	Logf("Controller %s: Found %d pods out of %d", name, current, replicas)
+	Logf("Controller %s in ns %s: Found %d pods out of %d", name, ns, current, replicas)
 
 	By(fmt.Sprintf("Waiting for all %d replicas to be running with a max container failures of %d", replicas, maxContainerFailures))
 	same = 0
 	last = 0
 	failCount = 10
 	current = 0
+	oldPods := &api.PodList{}
 	for same < failCount && current < replicas {
 		current = 0
 		waiting := 0
 		pending := 0
 		unknown := 0
+		inactive := 0
 		time.Sleep(10 * time.Second)
 
+		// TODO: Use a reflector both to put less strain on the cluster and
+		// for more clarity.
 		currentPods, err := listPods(c, ns, label, fields.Everything())
 		if err != nil {
 			return fmt.Errorf("Error listing pods: %v", err)
 		}
-		if len(currentPods.Items) != len(pods.Items) {
-			return fmt.Errorf("Number of reported pods changed: %d vs %d", len(currentPods.Items), len(pods.Items))
-		}
 		for _, p := range currentPods.Items {
 			if p.Status.Phase == api.PodRunning {
 				current++
@@ -516,22 +587,49 @@ func RunRC(c *client.Client, name string, ns, image string, replicas int) error
 				} else {
 					pending++
 				}
+			} else if p.Status.Phase == api.PodSucceeded || p.Status.Phase == api.PodFailed {
+				inactive++
 			} else if p.Status.Phase == api.PodUnknown {
 				unknown++
 			}
 		}
-		Logf("Pod States: %d running, %d pending, %d waiting, %d unknown ", current, pending, waiting, unknown)
+		Logf("Pod States: %d running, %d pending, %d waiting, %d inactive, %d unknown ", current, pending, waiting, inactive, unknown)
+
+		if len(currentPods.Items) != len(pods.Items) {
+
+			// This failure mode includes:
+			// kubelet is dead, so node controller deleted pods and rc creates more
+			// - diagnose by noting the pod diff below.
+			// pod is unhealthy, so replication controller creates another to take its place
+			// - diagnose by comparing the previous two "Pod States" lines for inactive pods
+			errorStr := fmt.Sprintf("Number of reported pods changed: %d vs %d", len(currentPods.Items), len(pods.Items))
+			Logf("%v, pods that changed since the last iteration:", errorStr)
+			Diff(oldPods, currentPods).Print(util.NewStringSet())
+			return fmt.Errorf(errorStr)
+		}
 		if last < current {
 			same = 0
 		} else if last == current {
 			same++
 		} else if current < last {
-			return fmt.Errorf("Number of running pods dropped from %d to %d", last, current)
+
+			// The pod failed or succeeded, or was somehow pushed out of running by the kubelet.
+			errorStr := fmt.Sprintf("Number of running pods dropped from %d to %d", last, current)
+			Logf("%v, pods that changed since the last iteration:", errorStr)
+			Diff(oldPods, currentPods).Print(util.NewStringSet())
+			return fmt.Errorf(errorStr)
 		}
 		if same >= failCount {
-			return fmt.Errorf("No pods started for the last %d checks", failCount)
+
+			// Most times this happens because a few nodes have kubelet problems, and their pods are
+			// stuck in pending.
+			errorStr := fmt.Sprintf("No pods started for the last %d checks", failCount)
+			Logf("%v, pods currently in pending:", errorStr)
+			Diff(currentPods, &api.PodList{}).Print(util.NewStringSet(string(api.PodRunning)))
+			return fmt.Errorf(errorStr)
 		}
 		last = current
+		oldPods = currentPods
 	}
 	if current != replicas {
 		return fmt.Errorf("Only %d pods started out of %d", current, replicas)
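
Reviewer note: the snippet below is a minimal, hypothetical sketch (not part of the patch) of how the new Diff and PodDiff.Print helpers are meant to compose inside a polling loop, mirroring the RunRC changes above. The function name pollAndLogPodChanges is made up for illustration; it assumes it lives in the same test/e2e package, so Logf, listPods, and the existing imports of util.go (api, client, labels, fields, util, fmt, time) are available.

	// pollAndLogPodChanges is a hypothetical illustration, not part of this patch.
	// It lists pods a few times and logs the pods whose phase or host changed
	// between iterations (including deleted pods), skipping pods whose current
	// phase is Running.
	func pollAndLogPodChanges(c *client.Client, ns string, label labels.Selector) error {
		oldPods := &api.PodList{}
		for i := 0; i < 3; i++ {
			currentPods, err := listPods(c, ns, label, fields.Everything())
			if err != nil {
				return fmt.Errorf("Error listing pods: %v", err)
			}
			// Diff pairs each pod's previous phase/host with its current ones;
			// Print logs only the entries that changed, ignoring the given phases.
			Diff(oldPods, currentPods).Print(util.NewStringSet(string(api.PodRunning)))
			oldPods = currentPods
			time.Sleep(10 * time.Second)
		}
		return nil
	}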