diff --git a/test/e2e/density.go b/test/e2e/density.go
index b76a86b18da..291d0e42f80 100644
--- a/test/e2e/density.go
+++ b/test/e2e/density.go
@@ -84,6 +84,26 @@ func density30AddonResourceVerifier() map[string]framework.ResourceConstraint {
 	return constraints
 }
 
+func logPodStartupStatus(c *client.Client, expectedPods int, ns string, observedLabels map[string]string, period time.Duration, stopCh chan struct{}) {
+	label := labels.SelectorFromSet(labels.Set(observedLabels))
+	podStore := framework.NewPodStore(c, ns, label, fields.Everything())
+	defer podStore.Stop()
+	ticker := time.NewTicker(period)
+	for {
+		select {
+		case <-ticker.C:
+			pods := podStore.List()
+			startupStatus := framework.ComputeRCStartupStatus(pods, expectedPods)
+			startupStatus.Print("Density")
+		case <-stopCh:
+			pods := podStore.List()
+			startupStatus := framework.ComputeRCStartupStatus(pods, expectedPods)
+			startupStatus.Print("Density")
+			return
+		}
+	}
+}
+
 // This test suite can take a long time to run, and can affect or be affected by other tests.
 // So by default it is added to the ginkgo.skip list (see driver.go).
 // To run this suite you must explicitly ask for it by setting the
@@ -185,7 +205,7 @@ var _ = framework.KubeDescribe("Density", func() {
 		{podsPerNode: 30, runLatencyTest: true, interval: 10 * time.Second},
 		{podsPerNode: 50, runLatencyTest: false, interval: 10 * time.Second},
 		{podsPerNode: 95, runLatencyTest: true, interval: 10 * time.Second},
-		{podsPerNode: 100, runLatencyTest: false, interval: 1 * time.Second},
+		{podsPerNode: 100, runLatencyTest: false, interval: 10 * time.Second},
 	}
 
 	for _, testArg := range densityTests {
@@ -201,22 +221,29 @@ var _ = framework.KubeDescribe("Density", func() {
 		}
 		itArg := testArg
 		It(name, func() {
-			podsPerNode := itArg.podsPerNode
-			totalPods = podsPerNode * nodeCount
-			RCName = "density" + strconv.Itoa(totalPods) + "-" + uuid
 			fileHndl, err := os.Create(fmt.Sprintf(framework.TestContext.OutputDir+"/%s/pod_states.csv", uuid))
 			framework.ExpectNoError(err)
 			defer fileHndl.Close()
-			config := framework.RCConfig{Client: c,
-				Image:                "gcr.io/google_containers/pause:2.0",
-				Name:                 RCName,
-				Namespace:            ns,
-				PollInterval:         itArg.interval,
-				PodStatusFile:        fileHndl,
-				Replicas:             totalPods,
-				CpuRequest:           nodeCpuCapacity / 100,
-				MemRequest:           nodeMemCapacity / 100,
-				MaxContainerFailures: &MaxContainerFailures,
+			podsPerNode := itArg.podsPerNode
+			totalPods = podsPerNode * nodeCount
+			// TODO: loop to podsPerNode instead of 1 when we're ready.
+			numberOfRCs := 1
+			RCConfigs := make([]framework.RCConfig, numberOfRCs)
+			for i := 0; i < numberOfRCs; i++ {
+				RCName = "density" + strconv.Itoa(totalPods) + "-" + strconv.Itoa(i) + "-" + uuid
+				RCConfigs[i] = framework.RCConfig{Client: c,
+					Image:                "gcr.io/google_containers/pause:2.0",
+					Name:                 RCName,
+					Namespace:            ns,
+					Labels:               map[string]string{"type": "densityPod"},
+					PollInterval:         itArg.interval,
+					PodStatusFile:        fileHndl,
+					Replicas:             (totalPods + numberOfRCs - 1) / numberOfRCs,
+					CpuRequest:           nodeCpuCapacity / 100,
+					MemRequest:           nodeMemCapacity / 100,
+					MaxContainerFailures: &MaxContainerFailures,
+					Silent:               true,
+				}
 			}
 
 			// Create a listener for events.
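As a sanity check on the Replicas arithmetic above, a minimal self-contained sketch (the values 3000 and 7 are hypothetical, not from this change) showing that the ceiling division never undershoots totalPods once the TODO above raises the RC count:

	package main

	import "fmt"

	func main() {
		// Assumed example values; the test currently uses numberOfRCs = 1.
		totalPods, numberOfRCs := 3000, 7
		perRC := (totalPods + numberOfRCs - 1) / numberOfRCs // ceil(3000/7) = 429
		fmt.Println(perRC*numberOfRCs >= totalPods)          // true: 7*429 = 3003 >= 3000
	}

With numberOfRCs = 1 the division is a no-op, so each run behaves exactly as before: a single RC with totalPods replicas.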
@@ -249,7 +276,7 @@ var _ = framework.KubeDescribe("Density", func() {
 			// uLock is a lock protects the updateCount
 			var uLock sync.Mutex
 			updateCount := 0
-			label := labels.SelectorFromSet(labels.Set(map[string]string{"name": RCName}))
+			label := labels.SelectorFromSet(labels.Set(map[string]string{"type": "densityPod"}))
 			_, updateController := controllerframework.NewInformer(
 				&cache.ListWatch{
 					ListFunc: func(options api.ListOptions) (runtime.Object, error) {
@@ -273,10 +300,22 @@ var _ = framework.KubeDescribe("Density", func() {
 			)
 			go updateController.Run(stop)
 
-			// Start the replication controller.
+			// Start all replication controllers.
 			startTime := time.Now()
-			framework.ExpectNoError(framework.RunRC(config))
+			wg := sync.WaitGroup{}
+			wg.Add(len(RCConfigs))
+			for i := range RCConfigs {
+				rcConfig := RCConfigs[i]
+				go func() {
+					framework.ExpectNoError(framework.RunRC(rcConfig))
+					wg.Done()
+				}()
+			}
+			logStopCh := make(chan struct{})
+			go logPodStartupStatus(c, totalPods, ns, map[string]string{"type": "densityPod"}, itArg.interval, logStopCh)
+			wg.Wait()
 			e2eStartupTime = time.Now().Sub(startTime)
+			close(logStopCh)
 			framework.Logf("E2E startup time for %d pods: %v", totalPods, e2eStartupTime)
 			framework.Logf("Throughput (pods/s) during cluster saturation phase: %v", float32(totalPods)/float32(e2eStartupTime/time.Second))
 
@@ -506,11 +545,14 @@ var _ = framework.KubeDescribe("Density", func() {
 
 			By("Deleting ReplicationController")
 			// We explicitly delete all pods to have API calls necessary for deletion accounted in metrics.
-			rc, err := c.ReplicationControllers(ns).Get(RCName)
-			if err == nil && rc.Spec.Replicas != 0 {
-				By("Cleaning up the replication controller")
-				err := framework.DeleteRC(c, ns, RCName)
-				framework.ExpectNoError(err)
+			for i := range RCConfigs {
+				rcName := RCConfigs[i].Name
+				rc, err := c.ReplicationControllers(ns).Get(rcName)
+				if err == nil && rc.Spec.Replicas != 0 {
+					By("Cleaning up the replication controller")
+					err := framework.DeleteRC(c, ns, rcName)
+					framework.ExpectNoError(err)
+				}
 			}
 
 			By("Removing additional replication controllers if any")
diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go
index 72670c13463..5c47c92db68 100644
--- a/test/e2e/framework/util.go
+++ b/test/e2e/framework/util.go
@@ -242,6 +242,9 @@ type RCConfig struct {
 	// Maximum allowable container failures. If exceeded, RunRC returns an error.
 	// Defaults to replicas*0.1 if unspecified.
 	MaxContainerFailures *int
+
+	// If set to false, starting the RC will print progress; otherwise only errors will be printed.
+	Silent bool
 }
 
 type DeploymentConfig struct {
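The Silent field pairs with an aggregate logger such as logPodStartupStatus above. A hedged caller-side sketch (the RC name and replica count are invented; RCConfig, RunRC, and ExpectNoError are the framework helpers shown in this diff):

	// Each RC runs quietly; progress is reported once, cluster-wide, by a separate logger.
	config := framework.RCConfig{
		Client:    c,
		Image:     "gcr.io/google_containers/pause:2.0",
		Name:      "density-example-rc", // hypothetical name
		Namespace: ns,
		Replicas:  100,
		Silent:    true, // RunRC prints errors only, no per-poll status line
	}
	framework.ExpectNoError(framework.RunRC(config))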
@@ -1934,6 +1937,70 @@ func (config *RCConfig) applyTo(template *api.PodTemplateSpec) {
 	}
 }
 
+type RCStartupStatus struct {
+	Expected              int
+	Terminating           int
+	Running               int
+	RunningButNotReady    int
+	Waiting               int
+	Pending               int
+	Unknown               int
+	Inactive              int
+	FailedContainers      int
+	Created               []*api.Pod
+	ContainerRestartNodes sets.String
+}
+
+func (s *RCStartupStatus) Print(name string) {
+	Logf("%v Pods: %d out of %d created, %d running, %d pending, %d waiting, %d inactive, %d terminating, %d unknown, %d runningButNotReady ",
+		name, len(s.Created), s.Expected, s.Running, s.Pending, s.Waiting, s.Inactive, s.Terminating, s.Unknown, s.RunningButNotReady)
+}
+
+func ComputeRCStartupStatus(pods []*api.Pod, expected int) RCStartupStatus {
+	startupStatus := RCStartupStatus{
+		Expected:              expected,
+		Created:               make([]*api.Pod, 0, expected),
+		ContainerRestartNodes: sets.NewString(),
+	}
+	for _, p := range pods {
+		if p.DeletionTimestamp != nil {
+			startupStatus.Terminating++
+			continue
+		}
+		startupStatus.Created = append(startupStatus.Created, p)
+		if p.Status.Phase == api.PodRunning {
+			ready := false
+			for _, c := range p.Status.Conditions {
+				if c.Type == api.PodReady && c.Status == api.ConditionTrue {
+					ready = true
+					break
+				}
+			}
+			if ready {
+				// Only count a pod as running when it is also ready.
+				startupStatus.Running++
+			} else {
+				startupStatus.RunningButNotReady++
+			}
+			for _, v := range FailedContainers(p) {
+				startupStatus.FailedContainers = startupStatus.FailedContainers + v.Restarts
+				startupStatus.ContainerRestartNodes.Insert(p.Spec.NodeName)
+			}
+		} else if p.Status.Phase == api.PodPending {
+			if p.Spec.NodeName == "" {
+				startupStatus.Waiting++
+			} else {
+				startupStatus.Pending++
+			}
+		} else if p.Status.Phase == api.PodSucceeded || p.Status.Phase == api.PodFailed {
+			startupStatus.Inactive++
+		} else if p.Status.Phase == api.PodUnknown {
+			startupStatus.Unknown++
+		}
+	}
+	return startupStatus
+}
+
 func (config *RCConfig) start() error {
 	// Don't force tests to fail if they don't care about containers restarting.
 	var maxContainerFailures int
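A short usage sketch of the extracted helper (the pod literals are invented; the api types are the same ones used above): a pending pod with no assigned node counts as Waiting, while a scheduled pending pod counts as Pending.

	pods := []*api.Pod{
		{Status: api.PodStatus{Phase: api.PodPending}},                                        // unscheduled -> Waiting
		{Spec: api.PodSpec{NodeName: "node-1"}, Status: api.PodStatus{Phase: api.PodPending}}, // scheduled -> Pending
	}
	status := ComputeRCStartupStatus(pods, 2)
	// status.Waiting == 1, status.Pending == 1, len(status.Created) == 2
	status.Print("example")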
@@ -1962,74 +2029,28 @@ func (config *RCConfig) start() error {
 	for oldRunning != config.Replicas {
 		time.Sleep(interval)
 
-		terminating := 0
-
-		running := 0
-		runningButNotReady := 0
-		waiting := 0
-		pending := 0
-		unknown := 0
-		inactive := 0
-		failedContainers := 0
-		containerRestartNodes := sets.NewString()
-
 		pods := PodStore.List()
-		created := []*api.Pod{}
-		for _, p := range pods {
-			if p.DeletionTimestamp != nil {
-				terminating++
-				continue
-			}
-			created = append(created, p)
-			if p.Status.Phase == api.PodRunning {
-				ready := false
-				for _, c := range p.Status.Conditions {
-					if c.Type == api.PodReady && c.Status == api.ConditionTrue {
-						ready = true
-						break
-					}
-				}
-				if ready {
-					// Only count a pod is running when it is also ready.
-					running++
-				} else {
-					runningButNotReady++
-				}
-				for _, v := range FailedContainers(p) {
-					failedContainers = failedContainers + v.Restarts
-					containerRestartNodes.Insert(p.Spec.NodeName)
-				}
-			} else if p.Status.Phase == api.PodPending {
-				if p.Spec.NodeName == "" {
-					waiting++
-				} else {
-					pending++
-				}
-			} else if p.Status.Phase == api.PodSucceeded || p.Status.Phase == api.PodFailed {
-				inactive++
-			} else if p.Status.Phase == api.PodUnknown {
-				unknown++
-			}
-		}
-		pods = created
+		startupStatus := ComputeRCStartupStatus(pods, config.Replicas)
+
+		pods = startupStatus.Created
 		if config.CreatedPods != nil {
 			*config.CreatedPods = pods
 		}
-
-		Logf("%v Pods: %d out of %d created, %d running, %d pending, %d waiting, %d inactive, %d terminating, %d unknown, %d runningButNotReady ",
-			config.Name, len(pods), config.Replicas, running, pending, waiting, inactive, terminating, unknown, runningButNotReady)
-
-		promPushRunningPending(running, pending)
-
-		if config.PodStatusFile != nil {
-			fmt.Fprintf(config.PodStatusFile, "%d, running, %d, pending, %d, waiting, %d, inactive, %d, unknown, %d, runningButNotReady\n", running, pending, waiting, inactive, unknown, runningButNotReady)
+		if !config.Silent {
+			startupStatus.Print(config.Name)
 		}
 
-		if failedContainers > maxContainerFailures {
-			DumpNodeDebugInfo(config.Client, containerRestartNodes.List())
+		promPushRunningPending(startupStatus.Running, startupStatus.Pending)
+
+		if config.PodStatusFile != nil {
+			fmt.Fprintf(config.PodStatusFile, "%d, running, %d, pending, %d, waiting, %d, inactive, %d, unknown, %d, runningButNotReady\n", startupStatus.Running, startupStatus.Pending, startupStatus.Waiting, startupStatus.Inactive, startupStatus.Unknown, startupStatus.RunningButNotReady)
+		}
+
+		if startupStatus.FailedContainers > maxContainerFailures {
+			DumpNodeDebugInfo(config.Client, startupStatus.ContainerRestartNodes.List())
 			// Get the logs from the failed containers to help diagnose what caused them to fail
 			LogFailedContainers(config.Namespace)
-			return fmt.Errorf("%d containers failed which is more than allowed %d", failedContainers, maxContainerFailures)
+			return fmt.Errorf("%d containers failed which is more than allowed %d", startupStatus.FailedContainers, maxContainerFailures)
 		}
 		if len(pods) < len(oldPods) || len(pods) > config.Replicas {
 			// This failure mode includes:
@@ -2043,11 +2064,11 @@ func (config *RCConfig) start() error {
 			return fmt.Errorf(errorStr)
 		}
 
-		if len(pods) > len(oldPods) || running > oldRunning {
+		if len(pods) > len(oldPods) || startupStatus.Running > oldRunning {
 			lastChange = time.Now()
 		}
 		oldPods = pods
-		oldRunning = running
+		oldRunning = startupStatus.Running
 
 		if time.Since(lastChange) > timeout {
 			dumpPodDebugInfo(config.Client, pods)