Add startup latency e2e "test"

gmarek 2015-06-03 16:56:16 +02:00
parent 43889c612c
commit c2c1045e09
2 changed files with 177 additions and 10 deletions

@@ -1263,9 +1263,12 @@ func (dm *DockerManager) createPodInfraContainer(pod *api.Pod) (kubeletTypes.Doc
 			}
 			return "", err
 		}
+		if ref != nil {
+			dm.recorder.Eventf(ref, "pulled", "Successfully pulled Pod container image %q", container.Image)
+		}
 	}
-	if ref != nil {
-		dm.recorder.Eventf(ref, "pulled", "Successfully pulled image %q", container.Image)
+	if ok && ref != nil {
+		dm.recorder.Eventf(ref, "pulled", "Pod container image %q already present on machine", container.Image)
 	}
 
 	id, err := dm.runContainerInPod(pod, container, netNamespace, "")
@@ -1494,7 +1497,7 @@ func (dm *DockerManager) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, pod
 	// Start everything
 	for idx := range containerChanges.ContainersToStart {
 		container := &pod.Spec.Containers[idx]
-		glog.V(4).Infof("Creating container %+v", container)
+		glog.V(4).Infof("Creating container %+v in pod %v", container, podFullName)
 		err := dm.pullImage(pod, container, pullSecrets)
 		dm.updateReasonCache(pod, container, err)
 		if err != nil {

@@ -20,7 +20,9 @@ import (
 	"fmt"
 	"math"
 	"os"
+	"sort"
 	"strconv"
+	"sync"
 	"time"
 
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
@@ -37,6 +39,25 @@ import (
 	. "github.com/onsi/gomega"
 )
 
+type podLatencyData struct {
+	Name    string
+	Latency time.Duration
+}
+
+type latencySlice []podLatencyData
+
+func (a latencySlice) Len() int           { return len(a) }
+func (a latencySlice) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
+func (a latencySlice) Less(i, j int) bool { return a[i].Latency < a[j].Latency }
+
+func printLatencies(latencies []podLatencyData, header string) {
+	perc50 := latencies[len(latencies)/2].Latency
+	perc90 := latencies[(len(latencies)*9)/10].Latency
+	perc99 := latencies[(len(latencies)*99)/100].Latency
+	Logf("10%% %s: %v", header, latencies[(len(latencies)*9)/10:len(latencies)])
+	Logf("perc50: %v, perc90: %v, perc99: %v", perc50, perc90, perc99)
+}
+
 // This test suite can take a long time to run, so by default it is added to
 // the ginkgo.skip list (see driver.go).
 // To run this suite you must explicitly ask for it by setting the
@@ -45,6 +66,7 @@ var _ = Describe("Density", func() {
 	var c *client.Client
 	var minionCount int
 	var RCName string
+	var additionalRCName string
 	var ns string
 	var uuid string
 
@@ -82,6 +104,13 @@ var _ = Describe("Density", func() {
 			expectNoError(err)
 		}
 
+		rc, err = c.ReplicationControllers(ns).Get(additionalRCName)
+		if err == nil && rc.Spec.Replicas != 0 {
+			By("Cleaning up the replication controller")
+			err := DeleteRC(c, ns, additionalRCName)
+			expectNoError(err)
+		}
+
 		By(fmt.Sprintf("Destroying namespace for this suite %v", ns))
 		if err := c.Namespaces().Delete(ns); err != nil {
 			Failf("Couldn't delete ns %s", err)
@@ -99,9 +128,11 @@ var _ = Describe("Density", func() {
 	// Tests with "Skipped" substring in their name will be skipped when running
 	// e2e test suite without --ginkgo.focus & --ginkgo.skip flags.
 	type Density struct {
-		skip          bool
-		podsPerMinion int
-		/* Controls how often the apiserver is polled for pods */
+		skip bool
+		// Controls if e2e latency tests should be run (they are slow)
+		runLatencyTest bool
+		podsPerMinion  int
+		// Controls how often the apiserver is polled for pods
 		interval time.Duration
 	}
 
@@ -109,12 +140,13 @@ var _ = Describe("Density", func() {
 		// This test should not be run in a regular jenkins run, because it is not isolated enough
 		// (metrics from other tests affects this one).
 		// TODO: Reenable once we can measure latency only from a single test.
-		{podsPerMinion: 3, skip: true, interval: 10 * time.Second},
-		{podsPerMinion: 30, skip: true, interval: 10 * time.Second},
+		// TODO: Expose runLatencyTest as ginkgo flag.
+		{podsPerMinion: 3, skip: true, runLatencyTest: false, interval: 10 * time.Second},
+		{podsPerMinion: 30, skip: true, runLatencyTest: false, interval: 10 * time.Second},
 		// More than 30 pods per node is outside our v1.0 goals.
 		// We might want to enable those tests in the future.
-		{podsPerMinion: 50, skip: true, interval: 10 * time.Second},
-		{podsPerMinion: 100, skip: true, interval: 1 * time.Second},
+		{podsPerMinion: 50, skip: true, runLatencyTest: false, interval: 10 * time.Second},
+		{podsPerMinion: 100, skip: true, runLatencyTest: false, interval: 1 * time.Second},
 	}
 
 	for _, testArg := range densityTests {
@@ -188,6 +220,138 @@
 			// Tune the threshold for allowed failures.
 			badEvents := BadEvents(events)
 			Expect(badEvents).NotTo(BeNumerically(">", int(math.Floor(0.01*float64(totalPods)))))
+			if itArg.runLatencyTest {
+				Logf("Scheduling additional Pods to measure startup latencies")
+
+				createTimes := make(map[string]util.Time, 0)
+				scheduleTimes := make(map[string]util.Time, 0)
+				runTimes := make(map[string]util.Time, 0)
+				watchTimes := make(map[string]util.Time, 0)
+
+				var mutex sync.Mutex
+				checkPod := func(p *api.Pod) {
+					mutex.Lock()
+					defer mutex.Unlock()
+					defer GinkgoRecover()
+
+					if p.Status.Phase == api.PodRunning {
+						if _, found := watchTimes[p.Name]; !found {
+							watchTimes[p.Name] = util.Now()
+							createTimes[p.Name] = p.CreationTimestamp
+							var startTime util.Time
+							for _, cs := range p.Status.ContainerStatuses {
+								if cs.State.Running != nil {
+									if startTime.Before(cs.State.Running.StartedAt) {
+										startTime = cs.State.Running.StartedAt
+									}
+								}
+							}
+							if startTime != util.NewTime(time.Time{}) {
+								runTimes[p.Name] = startTime
+							} else {
+								Failf("Pod %v is reported to be running, but none of its containers is", p.Name)
+							}
+						}
+					}
+				}
+
+				additionalNameStr := strconv.Itoa(minionCount) + "-" + string(util.NewUUID())
+				additionalRCName = "my-hostname-latency" + additionalNameStr
+
+				_, controller := framework.NewInformer(
+					&cache.ListWatch{
+						ListFunc: func() (runtime.Object, error) {
+							return c.Pods(ns).List(labels.SelectorFromSet(labels.Set{"name": additionalRCName}), fields.Everything())
+						},
+						WatchFunc: func(rv string) (watch.Interface, error) {
+							return c.Pods(ns).Watch(labels.SelectorFromSet(labels.Set{"name": additionalRCName}), fields.Everything(), rv)
+						},
+					},
+					&api.Pod{},
+					time.Minute*5,
+					framework.ResourceEventHandlerFuncs{
+						AddFunc: func(obj interface{}) {
+							p, ok := obj.(*api.Pod)
+							Expect(ok).To(Equal(true))
+							go checkPod(p)
+						},
+						UpdateFunc: func(oldObj, newObj interface{}) {
+							p, ok := newObj.(*api.Pod)
+							Expect(ok).To(Equal(true))
+							go checkPod(p)
+						},
+					},
+				)
+
+				stopCh := make(chan struct{})
+				go controller.Run(stopCh)
+
+				config = RCConfig{Client: c,
+					Image:        "gcr.io/google_containers/pause:go",
+					Name:         additionalRCName,
+					Namespace:    ns,
+					PollInterval: itArg.interval,
+					Replicas:     minionCount,
+				}
+				expectNoError(RunRC(config))
+
+				Logf("Waiting for all Pods to be observed by the watch...")
+				for start := time.Now(); len(watchTimes) < minionCount && time.Since(start) < timeout; time.Sleep(10 * time.Second) {
+				}
+				close(stopCh)
+
+				schedEvents, err := c.Events(ns).List(
+					labels.Everything(),
+					fields.Set{
+						"involvedObject.kind":      "Pod",
+						"involvedObject.namespace": ns,
+						"source":                   "scheduler",
+					}.AsSelector())
+				expectNoError(err)
+				for k := range createTimes {
+					for _, event := range schedEvents.Items {
+						if event.InvolvedObject.Name == k {
+							scheduleTimes[k] = event.FirstTimestamp
+							break
+						}
+					}
+				}
+
+				scheduleLag := make([]podLatencyData, 0)
+				startupLag := make([]podLatencyData, 0)
+				watchLag := make([]podLatencyData, 0)
+				schedToWatchLag := make([]podLatencyData, 0)
+				e2eLag := make([]podLatencyData, 0)
+
+				for name, create := range createTimes {
+					sched, ok := scheduleTimes[name]
+					Expect(ok).To(Equal(true))
+					run, ok := runTimes[name]
+					Expect(ok).To(Equal(true))
+					watch, ok := watchTimes[name]
+					Expect(ok).To(Equal(true))
+					scheduleLag = append(scheduleLag, podLatencyData{name, sched.Time.Sub(create.Time)})
+					startupLag = append(startupLag, podLatencyData{name, run.Time.Sub(sched.Time)})
+					watchLag = append(watchLag, podLatencyData{name, watch.Time.Sub(run.Time)})
+					schedToWatchLag = append(schedToWatchLag, podLatencyData{name, watch.Time.Sub(sched.Time)})
+					e2eLag = append(e2eLag, podLatencyData{name, watch.Time.Sub(create.Time)})
+				}
+
+				sort.Sort(latencySlice(scheduleLag))
+				sort.Sort(latencySlice(startupLag))
+				sort.Sort(latencySlice(watchLag))
+				sort.Sort(latencySlice(schedToWatchLag))
+				sort.Sort(latencySlice(e2eLag))
+
+				printLatencies(scheduleLag, "worst schedule latencies")
+				printLatencies(startupLag, "worst run-after-schedule latencies")
+				printLatencies(watchLag, "worst watch latencies")
+				printLatencies(schedToWatchLag, "worst scheduled-to-end total latencies")
+				printLatencies(e2eLag, "worst e2e total latencies")
+
+				Logf("Approx throughput: %v pods/min",
+					float64(minionCount)/(e2eLag[len(e2eLag)-1].Latency.Minutes()))
+			}
 		})
 	}
 })
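
To make the percentile reporting above easier to follow outside the e2e framework, here is a minimal, self-contained Go sketch of the same idea: collect one duration per pod, sort ascending with latencySlice, and read order statistics by index from the sorted slice. The type and function names mirror the helpers added in density.go, but the pod names and durations below are fabricated and fmt.Printf stands in for the framework's Logf; this is an illustration of the technique, not part of the commit.

package main

import (
	"fmt"
	"sort"
	"time"
)

// Mirrors the helper types added in density.go: one latency sample per pod.
type podLatencyData struct {
	Name    string
	Latency time.Duration
}

type latencySlice []podLatencyData

func (a latencySlice) Len() int           { return len(a) }
func (a latencySlice) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a latencySlice) Less(i, j int) bool { return a[i].Latency < a[j].Latency }

// printLatencies reports the worst 10% of samples plus the 50th/90th/99th
// percentiles, read by index from the already-sorted slice
// (fmt.Printf stands in for the e2e framework's Logf).
func printLatencies(latencies []podLatencyData, header string) {
	perc50 := latencies[len(latencies)/2].Latency
	perc90 := latencies[(len(latencies)*9)/10].Latency
	perc99 := latencies[(len(latencies)*99)/100].Latency
	fmt.Printf("10%% %s: %v\n", header, latencies[(len(latencies)*9)/10:])
	fmt.Printf("perc50: %v, perc90: %v, perc99: %v\n", perc50, perc90, perc99)
}

func main() {
	// Fabricated end-to-end startup latencies for ten hypothetical pods.
	samples := latencySlice{}
	for i := 0; i < 10; i++ {
		samples = append(samples, podLatencyData{
			Name:    fmt.Sprintf("pod-%d", i),
			Latency: time.Duration(i+1) * 500 * time.Millisecond,
		})
	}
	// Sorting ascending by latency is what makes the percentile indexing valid.
	sort.Sort(samples)
	printLatencies(samples, "worst e2e total latencies")
}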