Mirror of https://github.com/k3s-io/kubernetes.git

commit c2c1045e09
parent 43889c612c

Add startup latency e2e "test"
@@ -1263,9 +1263,12 @@ func (dm *DockerManager) createPodInfraContainer(pod *api.Pod) (kubeletTypes.Doc
             }
             return "", err
         }
+        if ref != nil {
+            dm.recorder.Eventf(ref, "pulled", "Successfully pulled Pod container image %q", container.Image)
+        }
     }
-    if ref != nil {
-        dm.recorder.Eventf(ref, "pulled", "Successfully pulled image %q", container.Image)
+    if ok && ref != nil {
+        dm.recorder.Eventf(ref, "pulled", "Pod container image %q already present on machine", container.Image)
     }
 
     id, err := dm.runContainerInPod(pod, container, netNamespace, "")

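Aside (not part of the commit): the net effect of this hunk is that a "pulled" event is recorded on both paths, with the wording depending on whether the pod infra container image actually had to be pulled. A minimal standalone sketch of that branching, where recordEvent, reportImage and alreadyPresent are invented stand-ins for the manager's event recorder and image-presence check:

    package main

    import "fmt"

    // Sketch only (not kubelet code): recordEvent stands in for dm.recorder.Eventf,
    // and alreadyPresent for the manager's image-presence check.
    func recordEvent(reason, message string) { fmt.Println(reason + ": " + message) }

    func reportImage(image string, alreadyPresent bool) {
        if !alreadyPresent {
            // ...the actual pull would happen here...
            recordEvent("pulled", fmt.Sprintf("Successfully pulled Pod container image %q", image))
            return
        }
        recordEvent("pulled", fmt.Sprintf("Pod container image %q already present on machine", image))
    }

    func main() {
        reportImage("gcr.io/google_containers/pause:go", false)
        reportImage("gcr.io/google_containers/pause:go", true)
    }
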
@@ -1494,7 +1497,7 @@ func (dm *DockerManager) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, pod
     // Start everything
     for idx := range containerChanges.ContainersToStart {
         container := &pod.Spec.Containers[idx]
-        glog.V(4).Infof("Creating container %+v", container)
+        glog.V(4).Infof("Creating container %+v in pod %v", container, podFullName)
         err := dm.pullImage(pod, container, pullSecrets)
         dm.updateReasonCache(pod, container, err)
         if err != nil {

@@ -20,7 +20,9 @@ import (
     "fmt"
     "math"
     "os"
+    "sort"
     "strconv"
+    "sync"
     "time"
 
     "github.com/GoogleCloudPlatform/kubernetes/pkg/api"

@@ -37,6 +39,25 @@ import (
     . "github.com/onsi/gomega"
 )
 
+type podLatencyData struct {
+    Name    string
+    Latency time.Duration
+}
+
+type latencySlice []podLatencyData
+
+func (a latencySlice) Len() int           { return len(a) }
+func (a latencySlice) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
+func (a latencySlice) Less(i, j int) bool { return a[i].Latency < a[j].Latency }
+
+func printLatencies(latencies []podLatencyData, header string) {
+    perc50 := latencies[len(latencies)/2].Latency
+    perc90 := latencies[(len(latencies)*9)/10].Latency
+    perc99 := latencies[(len(latencies)*99)/100].Latency
+    Logf("10%% %s: %v", header, latencies[(len(latencies)*9)/10:len(latencies)])
+    Logf("perc50: %v, perc90: %v, perc99: %v", perc50, perc90, perc99)
+}
+
 // This test suite can take a long time to run, so by default it is added to
 // the ginkgo.skip list (see driver.go).
 // To run this suite you must explicitly ask for it by setting the

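Aside (not part of the commit): printLatencies assumes its input is already sorted, which is why the test below sorts each series with sort.Sort(latencySlice(...)) before printing. A self-contained sketch of the same index arithmetic on made-up data, with the types copied from the hunk above:

    package main

    import (
        "fmt"
        "sort"
        "time"
    )

    type podLatencyData struct {
        Name    string
        Latency time.Duration
    }

    type latencySlice []podLatencyData

    func (a latencySlice) Len() int           { return len(a) }
    func (a latencySlice) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
    func (a latencySlice) Less(i, j int) bool { return a[i].Latency < a[j].Latency }

    func main() {
        // Ten made-up pod startup latencies, unsorted.
        latencies := latencySlice{}
        for i, d := range []time.Duration{4, 9, 2, 7, 1, 8, 3, 10, 6, 5} {
            latencies = append(latencies, podLatencyData{fmt.Sprintf("pod-%d", i), d * time.Second})
        }
        sort.Sort(latencies)

        // Same index math as printLatencies: len/2 approximates the 50th
        // percentile, (len*9)/10 the 90th, (len*99)/100 the 99th.
        fmt.Println("perc50:", latencies[len(latencies)/2].Latency)       // 6s
        fmt.Println("perc90:", latencies[(len(latencies)*9)/10].Latency)  // 10s
        fmt.Println("perc99:", latencies[(len(latencies)*99)/100].Latency) // 10s
    }
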
@@ -45,6 +66,7 @@ var _ = Describe("Density", func() {
     var c *client.Client
     var minionCount int
     var RCName string
+    var additionalRCName string
     var ns string
     var uuid string
 

@@ -82,6 +104,13 @@ var _ = Describe("Density", func() {
             expectNoError(err)
         }
 
+        rc, err = c.ReplicationControllers(ns).Get(additionalRCName)
+        if err == nil && rc.Spec.Replicas != 0 {
+            By("Cleaning up the replication controller")
+            err := DeleteRC(c, ns, additionalRCName)
+            expectNoError(err)
+        }
+
         By(fmt.Sprintf("Destroying namespace for this suite %v", ns))
         if err := c.Namespaces().Delete(ns); err != nil {
             Failf("Couldn't delete ns %s", err)

@@ -99,9 +128,11 @@ var _ = Describe("Density", func() {
     // Tests with "Skipped" substring in their name will be skipped when running
     // e2e test suite without --ginkgo.focus & --ginkgo.skip flags.
     type Density struct {
         skip bool
-        podsPerMinion int
-        /* Controls how often the apiserver is polled for pods */
+        // Controls if e2e latency tests should be run (they are slow)
+        runLatencyTest bool
+        podsPerMinion  int
+        // Controls how often the apiserver is polled for pods
         interval time.Duration
     }
 

@@ -109,12 +140,13 @@ var _ = Describe("Density", func() {
         // This test should not be run in a regular jenkins run, because it is not isolated enough
         // (metrics from other tests affects this one).
         // TODO: Reenable once we can measure latency only from a single test.
-        {podsPerMinion: 3, skip: true, interval: 10 * time.Second},
-        {podsPerMinion: 30, skip: true, interval: 10 * time.Second},
+        // TODO: Expose runLatencyTest as ginkgo flag.
+        {podsPerMinion: 3, skip: true, runLatencyTest: false, interval: 10 * time.Second},
+        {podsPerMinion: 30, skip: true, runLatencyTest: false, interval: 10 * time.Second},
         // More than 30 pods per node is outside our v1.0 goals.
         // We might want to enable those tests in the future.
-        {podsPerMinion: 50, skip: true, interval: 10 * time.Second},
-        {podsPerMinion: 100, skip: true, interval: 1 * time.Second},
+        {podsPerMinion: 50, skip: true, runLatencyTest: false, interval: 10 * time.Second},
+        {podsPerMinion: 100, skip: true, runLatencyTest: false, interval: 1 * time.Second},
     }
 
     for _, testArg := range densityTests {

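Aside (not part of the commit): until the TODO above lands and runLatencyTest is exposed as a ginkgo flag, turning the measurement on means editing this table by hand. A hypothetical enabled entry (values chosen only for illustration) would look like:

    {podsPerMinion: 30, skip: false, runLatencyTest: true, interval: 10 * time.Second},
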
@@ -188,6 +220,138 @@ var _ = Describe("Density", func() {
             // Tune the threshold for allowed failures.
             badEvents := BadEvents(events)
             Expect(badEvents).NotTo(BeNumerically(">", int(math.Floor(0.01*float64(totalPods)))))
+
+            if itArg.runLatencyTest {
+                Logf("Schedling additional Pods to measure startup latencies")
+
+                createTimes := make(map[string]util.Time, 0)
+                scheduleTimes := make(map[string]util.Time, 0)
+                runTimes := make(map[string]util.Time, 0)
+                watchTimes := make(map[string]util.Time, 0)
+
+                var mutex sync.Mutex
+                checkPod := func(p *api.Pod) {
+                    mutex.Lock()
+                    defer mutex.Unlock()
+                    defer GinkgoRecover()
+
+                    if p.Status.Phase == api.PodRunning {
+                        if _, found := watchTimes[p.Name]; !found {
+                            watchTimes[p.Name] = util.Now()
+                            createTimes[p.Name] = p.CreationTimestamp
+                            var startTime util.Time
+                            for _, cs := range p.Status.ContainerStatuses {
+                                if cs.State.Running != nil {
+                                    if startTime.Before(cs.State.Running.StartedAt) {
+                                        startTime = cs.State.Running.StartedAt
+                                    }
+                                }
+                            }
+                            if startTime != util.NewTime(time.Time{}) {
+                                runTimes[p.Name] = startTime
+                            } else {
+                                Failf("Pod %v is reported to be running, but none of its containers is", p.Name)
+                            }
+                        }
+                    }
+                }
+
+                additionalNameStr := strconv.Itoa(minionCount) + "-" + string(util.NewUUID())
+                additionalRCName = "my-hostname-latency" + additionalNameStr
+                _, controller := framework.NewInformer(
+                    &cache.ListWatch{
+                        ListFunc: func() (runtime.Object, error) {
+                            return c.Pods(ns).List(labels.SelectorFromSet(labels.Set{"name": additionalRCName}), fields.Everything())
+                        },
+                        WatchFunc: func(rv string) (watch.Interface, error) {
+                            return c.Pods(ns).Watch(labels.SelectorFromSet(labels.Set{"name": additionalRCName}), fields.Everything(), rv)
+                        },
+                    },
+                    &api.Pod{},
+                    time.Minute*5,
+                    framework.ResourceEventHandlerFuncs{
+                        AddFunc: func(obj interface{}) {
+                            p, ok := obj.(*api.Pod)
+                            Expect(ok).To(Equal(true))
+                            go checkPod(p)
+                        },
+                        UpdateFunc: func(oldObj, newObj interface{}) {
+                            p, ok := newObj.(*api.Pod)
+                            Expect(ok).To(Equal(true))
+                            go checkPod(p)
+                        },
+                    },
+                )
+
+                stopCh := make(chan struct{})
+                go controller.Run(stopCh)
+
+                config = RCConfig{Client: c,
+                    Image:        "gcr.io/google_containers/pause:go",
+                    Name:         additionalRCName,
+                    Namespace:    ns,
+                    PollInterval: itArg.interval,
+                    Replicas:     minionCount,
+                }
+                expectNoError(RunRC(config))
+
+                Logf("Waiting for all Pods begin observed by the watch...")
+                for start := time.Now(); len(watchTimes) < minionCount && time.Since(start) < timeout; time.Sleep(10 * time.Second) {
+                }
+                close(stopCh)
+
+                schedEvents, err := c.Events(ns).List(
+                    labels.Everything(),
+                    fields.Set{
+                        "involvedObject.kind":      "Pod",
+                        "involvedObject.namespace": ns,
+                        "source":                   "scheduler",
+                    }.AsSelector())
+                expectNoError(err)
+                for k := range createTimes {
+                    for _, event := range schedEvents.Items {
+                        if event.InvolvedObject.Name == k {
+                            scheduleTimes[k] = event.FirstTimestamp
+                            break
+                        }
+                    }
+                }
+
+                scheduleLag := make([]podLatencyData, 0)
+                startupLag := make([]podLatencyData, 0)
+                watchLag := make([]podLatencyData, 0)
+                schedToWatchLag := make([]podLatencyData, 0)
+                e2eLag := make([]podLatencyData, 0)
+
+                for name, create := range createTimes {
+                    sched, ok := scheduleTimes[name]
+                    Expect(ok).To(Equal(true))
+                    run, ok := runTimes[name]
+                    Expect(ok).To(Equal(true))
+                    watch, ok := watchTimes[name]
+                    Expect(ok).To(Equal(true))
+                    scheduleLag = append(scheduleLag, podLatencyData{name, sched.Time.Sub(create.Time)})
+                    startupLag = append(startupLag, podLatencyData{name, run.Time.Sub(sched.Time)})
+                    watchLag = append(watchLag, podLatencyData{name, watch.Time.Sub(run.Time)})
+                    schedToWatchLag = append(schedToWatchLag, podLatencyData{name, watch.Time.Sub(sched.Time)})
+                    e2eLag = append(e2eLag, podLatencyData{name, watch.Time.Sub(create.Time)})
+                }
+
+                sort.Sort(latencySlice(scheduleLag))
+                sort.Sort(latencySlice(startupLag))
+                sort.Sort(latencySlice(watchLag))
+                sort.Sort(latencySlice(schedToWatchLag))
+                sort.Sort(latencySlice(e2eLag))
+
+                printLatencies(scheduleLag, "worst schedule latencies")
+                printLatencies(startupLag, "worst run-after-schedule latencies")
+                printLatencies(watchLag, "worst watch latencies")
+                printLatencies(schedToWatchLag, "worst scheduled-to-end total latencies")
+                printLatencies(e2eLag, "worst e2e total latencies")
+
+                Logf("Approx throughput: %v pods/min",
+                    float64(minionCount)/(e2eLag[len(e2eLag)-1].Latency.Minutes()))
+            }
         })
     }
 })

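Aside (not part of the commit): the five series printed above are all differences of four per-pod timestamps: creation (the pod's CreationTimestamp), scheduling (the scheduler event's FirstTimestamp), running (the latest container StartedAt), and the moment the watch observed the Running phase. A small standalone sketch with invented times:

    package main

    import (
        "fmt"
        "time"
    )

    func main() {
        // Hypothetical timestamps for one pod (not from a real run).
        created := time.Date(2015, 5, 20, 10, 0, 0, 0, time.UTC)
        scheduled := created.Add(2 * time.Second) // scheduler event FirstTimestamp
        running := created.Add(5 * time.Second)   // latest container StartedAt
        watched := created.Add(6 * time.Second)   // watch saw phase == Running

        fmt.Println("schedule lag:      ", scheduled.Sub(created)) // 2s
        fmt.Println("run-after-schedule:", running.Sub(scheduled)) // 3s
        fmt.Println("watch lag:         ", watched.Sub(running))   // 1s
        fmt.Println("scheduled-to-end:  ", watched.Sub(scheduled)) // 4s
        fmt.Println("e2e total:         ", watched.Sub(created))   // 6s
    }
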