diff --git a/test/e2e/density.go b/test/e2e/density.go
index 8a219ddf0e8..433e892dfbf 100644
--- a/test/e2e/density.go
+++ b/test/e2e/density.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"math"
 	"os"
+	"os/exec"
 	"sort"
 	"strconv"
 	"sync"
@@ -66,6 +67,19 @@ func printLatencies(latencies []podLatencyData, header string) {
 	Logf("perc50: %v, perc90: %v, perc99: %v", perc50, perc90, perc99)
 }
 
+// List nodes via gcloud. We don't rely on the apiserver because we really want the node IPs,
+// and the node controller is sometimes slow to populate them.
+func gcloudListNodes() {
+	Logf("Listing nodes via gcloud:")
+	output, err := exec.Command("gcloud", "compute", "instances", "list",
+		"--project="+testContext.CloudConfig.ProjectID, "--zone="+testContext.CloudConfig.Zone).CombinedOutput()
+	if err != nil {
+		Logf("Failed to list nodes: %v", err)
+		return
+	}
+	Logf("%s", output)
+}
+
 // This test suite can take a long time to run, so by default it is added to
 // the ginkgo.skip list (see driver.go).
 // To run this suite you must explicitly ask for it by setting the
@@ -101,6 +116,7 @@ var _ = Describe("Density", func() {
 		expectNoError(resetMetrics(c))
 		expectNoError(os.Mkdir(fmt.Sprintf(testContext.OutputDir+"/%s", uuid), 0777))
 		expectNoError(writePerfData(c, fmt.Sprintf(testContext.OutputDir+"/%s", uuid), "before"))
+		gcloudListNodes()
 	})
 
 	AfterEach(func() {
diff --git a/test/e2e/util.go b/test/e2e/util.go
index d4ac486dc96..c217d82c243 100644
--- a/test/e2e/util.go
+++ b/test/e2e/util.go
@@ -1046,7 +1046,7 @@ func RunRC(config RCConfig) error {
 	oldPods := make([]*api.Pod, 0)
 	oldRunning := 0
 	lastChange := time.Now()
-	for oldRunning != config.Replicas && time.Since(lastChange) < timeout {
+	for oldRunning != config.Replicas {
 		time.Sleep(interval)
 
 		running := 0
@@ -1104,6 +1104,11 @@
 		}
 		oldPods = pods
 		oldRunning = running
+
+		if time.Since(lastChange) > timeout {
+			dumpPodDebugInfo(config.Client, pods)
+			break
+		}
 	}
 
 	if oldRunning != config.Replicas {
@@ -1112,6 +1117,41 @@
 	return nil
 }
 
+func dumpPodDebugInfo(c *client.Client, pods []*api.Pod) {
+	badNodes := util.NewStringSet()
+	for _, p := range pods {
+		if p.Status.Phase != api.PodRunning {
+			if p.Spec.NodeName != "" {
+				Logf("Pod %v assigned to host %v (IP: %v) in phase %v", p.Name, p.Spec.NodeName, p.Status.HostIP, p.Status.Phase)
+				badNodes.Insert(p.Spec.NodeName)
+			} else {
+				Logf("Pod %v still unassigned", p.Name)
+			}
+		}
+	}
+	dumpNodeDebugInfo(c, badNodes.List())
+}
+
+func dumpNodeDebugInfo(c *client.Client, nodeNames []string) {
+	for _, n := range nodeNames {
+		Logf("\nLogging pods the kubelet thinks are on node %v", n)
+		podList, err := GetKubeletPods(c, n)
+		if err != nil {
+			Logf("Unable to retrieve kubelet pods for node %v", n)
+			continue
+		}
+		for _, p := range podList.Items {
+			Logf("%v started at %v (%d container statuses recorded)", p.Name, p.Status.StartTime, len(p.Status.ContainerStatuses))
+			for _, c := range p.Status.ContainerStatuses {
+				Logf("\tContainer %v ready: %v, restart count %v",
+					c.Name, c.Ready, c.RestartCount)
+			}
+		}
+		HighLatencyKubeletOperations(c, 10*time.Second, n)
+		// TODO: Log node resource info
+	}
+}
+
 func ScaleRC(c *client.Client, ns, name string, size uint) error {
 	By(fmt.Sprintf("%v Scaling replication controller %s in namespace %s to %d", time.Now(), name, ns, size))
 	scaler, err := kubectl.ScalerFor("ReplicationController", kubectl.NewScalerClient(c))
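
For context, a standalone sketch of the pattern gcloudListNodes relies on: shell out with exec.Command, capture combined stdout/stderr via CombinedOutput, and log rather than fail, since the node listing is purely diagnostic. The project and zone flags below are placeholder values, not the test framework's real configuration.

package main

import (
	"log"
	"os/exec"
)

func main() {
	// Placeholder project/zone; the e2e suite fills these from testContext.
	out, err := exec.Command("gcloud", "compute", "instances", "list",
		"--project=my-project", "--zone=us-central1-a").CombinedOutput()
	if err != nil {
		// CombinedOutput still returns whatever the command printed,
		// which usually explains the failure, so log both.
		log.Printf("gcloud failed: %v\n%s", err, out)
		return
	}
	log.Printf("%s", out)
}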
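The util.go change moves the deadline check out of RunRC's loop condition and into the loop body, so the test can dump debug state before giving up rather than exiting silently on timeout. Below is a minimal, self-contained sketch of that pattern; the names and values (replicas, interval, timeout, the simulated stall) are illustrative, not the e2e framework's.

package main

import (
	"fmt"
	"time"
)

func main() {
	const (
		replicas = 5
		interval = 100 * time.Millisecond
		timeout  = 1 * time.Second
	)
	lastChange := time.Now()
	running := 0
	for running != replicas {
		time.Sleep(interval)

		// Simulate progress that stalls at 3 of 5 replicas.
		if running < 3 {
			running++
			lastChange = time.Now() // progress resets the deadline
		}

		// Checking the deadline here, instead of in the loop condition,
		// gives us a hook to dump diagnostics before breaking out.
		if time.Since(lastChange) > timeout {
			fmt.Printf("timed out at %d/%d running; dumping debug info\n", running, replicas)
			break
		}
	}
	if running != replicas {
		fmt.Printf("only %d pods started out of %d\n", running, replicas)
	}
}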