Merge pull request #24536 from jayunit100/LoggingSoak

Automatic merge from submit-queue

Logging soak

Implements #24427 

Needs 

- #24471 so that it doesn't clog test outputs at scale
- builds on the utility function added in support of #22869

cc @timothysc @kubernetes/sig-testing
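
For reviewers who want to try the new knobs locally: they are viper-only, mirroring the LoggingSoak struct added to TestContextType below. A minimal sketch of an e2e.json (values here are illustrative):

{
  "Cadvisor": {
    "MaxRetries": 6,
    "SleepDurationMS": 10000
  },
  "LoggingSoak": {
    "Scale": 2,
    "MilliSecondsBetweenWaves": 5000
  }
}

If no such file is present, the test falls back to its in-code defaults (scale 1, 5000ms between waves).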
This commit is contained in:
Kubernetes Submit Queue 2016-09-23 16:06:47 -07:00 committed by GitHub
commit 5374e48e60
3 changed files with 161 additions and 19 deletions


@@ -27,24 +27,6 @@ import (
	. "github.com/onsi/ginkgo"
)

// returns maxRetries, sleepDuration
func readConfig() (int, time.Duration) {
	// Read in configuration settings, reasonable defaults.
	retry := framework.TestContext.Cadvisor.MaxRetries
	if framework.TestContext.Cadvisor.MaxRetries == 0 {
		retry = 6
		framework.Logf("Overriding default retry value of zero to %d", retry)
	}
	sleepDurationMS := framework.TestContext.Cadvisor.SleepDurationMS
	if sleepDurationMS == 0 {
		sleepDurationMS = 10000
		framework.Logf("Overriding default milliseconds value of zero to %d", sleepDurationMS)
	}
	return retry, time.Duration(sleepDurationMS) * time.Millisecond
}

var _ = framework.KubeDescribe("Cadvisor", func() {
	f := framework.NewDefaultFramework("cadvisor")
@@ -60,6 +42,25 @@ func CheckCadvisorHealthOnAllNodes(c *client.Client, timeout time.Duration) {
	nodeList, err := c.Nodes().List(api.ListOptions{})
	framework.ExpectNoError(err)
	var errors []error

	// returns maxRetries, sleepDuration
	readConfig := func() (int, time.Duration) {
		// Read in configuration settings, reasonable defaults.
		retry := framework.TestContext.Cadvisor.MaxRetries
		if framework.TestContext.Cadvisor.MaxRetries == 0 {
			retry = 6
			framework.Logf("Overriding default retry value of zero to %d", retry)
		}
		sleepDurationMS := framework.TestContext.Cadvisor.SleepDurationMS
		if sleepDurationMS == 0 {
			sleepDurationMS = 10000
			framework.Logf("Overriding default milliseconds value of zero to %d", sleepDurationMS)
		}
		return retry, time.Duration(sleepDurationMS) * time.Millisecond
	}
	maxRetries, sleepDuration := readConfig()
	for {
		errors = []error{}


@@ -79,6 +79,7 @@ type TestContextType struct {
	NodeTestContextType

	// Viper-only parameters. These will in time replace all flags.
	// Example: Create a file 'e2e.json' with the following:
	// 	"Cadvisor":{
	// 		"MaxRetries":"6"
@@ -89,6 +90,11 @@ type TestContextType struct {
		MaxRetries      int
		SleepDurationMS int
	}
	LoggingSoak struct {
		Scale                    int
		MilliSecondsBetweenWaves int
	}
}
// NodeTestContextType is part of TestContextType; it is shared by all node e2e tests.
@@ -205,7 +211,6 @@ func RegisterNodeFlags() {
// Enable viper configuration management of flags.
func ViperizeFlags() {
	// TODO @jayunit100: Maybe a more elegant viper-flag integration for the future?
	// For now, we layer it on top, because 'flag' deps of 'go test' make pflag wrappers
	// fragile, seeming to force 'flag' to have deep awareness of pflag params.
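
The hunk ends before the function body, so for context here is a plausible sketch of the layering the comment describes, assuming spf13/viper and the flag-registration helpers visible elsewhere in this file (an illustration, not the exact code merged):

func ViperizeFlags() {
	// Parse the ordinary flags first, as 'go test' expects.
	RegisterCommonFlags()
	RegisterClusterFlags()
	flag.Parse()

	// Then layer viper on top: read e2e.json from the working directory
	// and unmarshal it over TestContext, so viper-only parameters like
	// Cadvisor and LoggingSoak get populated.
	viper.SetConfigName("e2e")
	viper.AddConfigPath(".")
	viper.ReadInConfig()
	viper.Unmarshal(&TestContext)
}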

test/e2e/logging_soak.go (new file, 136 lines)

@@ -0,0 +1,136 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e

import (
	"fmt"
	"strconv"
	"strings"
	"sync"
	"time"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"

	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/test/e2e/framework"
)
var _ = framework.KubeDescribe("Logging soak [Performance] [Slow] [Disruptive]", func() {
f := framework.NewDefaultFramework("logging-soak")
// Not a global constant (irrelevant outside this test), also not a parameter (if you want more logs, use --scale=).
kbRateInSeconds := 1 * time.Second
totalLogTime := 2 * time.Minute
// This test is designed to run and confirm that logs are being generated at a large scale, and that they can be grabbed by the kubelet.
// By running it repeatedly in the background, you can simulate large collections of chatty containers.
// This can expose problems in your docker configuration (logging), log searching infrastructure, to tune deployments to match high load
// scenarios. TODO jayunit100 add this to the kube CI in a follow on infra patch.
// Returns scale (how many waves of pods).
// Returns wave interval (how many seconds to wait before dumping the next wave of pods).
readConfig := func() (int, time.Duration) {
// Read in configuration settings, reasonable defaults.
scale := framework.TestContext.LoggingSoak.Scale
if framework.TestContext.LoggingSoak.Scale == 0 {
scale = 1
framework.Logf("Overriding default scale value of zero to %d", scale)
}
milliSecondsBetweenWaves := framework.TestContext.LoggingSoak.MilliSecondsBetweenWaves
if milliSecondsBetweenWaves == 0 {
milliSecondsBetweenWaves = 5000
framework.Logf("Overriding default milliseconds value of zero to %d", milliSecondsBetweenWaves)
}
return scale, time.Duration(milliSecondsBetweenWaves) * time.Millisecond
}
scale, millisecondsBetweenWaves := readConfig()
It(fmt.Sprintf("should survive logging 1KB every %v seconds, for a duration of %v, scaling up to %v pods per node", kbRateInSeconds, totalLogTime, scale), func() {
defer GinkgoRecover()
var wg sync.WaitGroup
wg.Add(scale)
for i := 0; i < scale; i++ {
go func() {
wave := fmt.Sprintf("wave%v", strconv.Itoa(i))
framework.Logf("Starting logging soak, wave = %v", wave)
RunLogPodsWithSleepOf(f, kbRateInSeconds, wave, totalLogTime)
framework.Logf("Completed logging soak, wave %v", i)
wg.Done()
}()
// Niceness.
time.Sleep(millisecondsBetweenWaves)
}
framework.Logf("Waiting on all %v logging soak waves to complete", scale)
wg.Wait()
})
})
// RunLogPodsWithSleepOf creates a pod on every node which logs continuously (with "sleep"
// pauses between writes), and verifies that the log string was produced in each and every
// pod at least once. The final arg is the timeout for the test to verify that all the
// pods got logs.
func RunLogPodsWithSleepOf(f *framework.Framework, sleep time.Duration, podname string, timeout time.Duration) {

	nodes := framework.GetReadySchedulableNodesOrDie(f.Client)
	totalPods := len(nodes.Items)
	Expect(totalPods).NotTo(Equal(0))

	kilobyte := strings.Repeat("logs-123", 128) // "logs-123" is 8 bytes; 8*128 = 1024 = 1KB of text.

	appName := "logging-soak" + podname
	podLabels := f.CreatePodsPerNodeForSimpleApp(
		appName,
		func(n api.Node) api.PodSpec {
			return api.PodSpec{
				Containers: []api.Container{{
					Name:  "logging-soak",
					Image: "gcr.io/google_containers/busybox:1.24",
					Args: []string{
						"/bin/sh",
						"-c",
						fmt.Sprintf("while true ; do echo %v ; sleep %v ; done", kilobyte, sleep.Seconds()),
					},
				}},
				NodeName:      n.Name,
				RestartPolicy: api.RestartPolicyAlways,
			}
		},
		totalPods,
	)

	logSoakVerification := f.NewClusterVerification(
		framework.PodStateVerification{
			Selectors:   podLabels,
			ValidPhases: []api.PodPhase{api.PodRunning, api.PodSucceeded},
			// We don't validate total log data, since there is no guarantee that all
			// logs will be stored forever. Instead, we just validate that some logs are
			// being created on stdout.
			Verify: func(p api.Pod) (bool, error) {
				s, err := framework.LookForStringInLog(f.Namespace.Name, p.Name, "logging-soak", "logs-123", 1*time.Second)
				return s != "", err
			},
		},
	)

	largeClusterForgiveness := time.Duration(len(nodes.Items)/5) * time.Second // i.e. a 100 node cluster gets an extra 20 seconds to complete.
	pods, err := logSoakVerification.WaitFor(totalPods, timeout+largeClusterForgiveness)

	if err != nil {
		framework.Failf("Error in wait... %v", err)
	} else if len(pods) < totalPods {
		framework.Failf("Only got %v out of %v", len(pods), totalPods)
	}
}
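
For a rough sense of the load this generates: each pod echoes 1KB and then sleeps for kbRateInSeconds, so the aggregate rate is roughly nodes × scale × 1KB/s. A back-of-the-envelope sketch (the cluster numbers are made up for illustration):

package main

import "fmt"

func main() {
	// Hypothetical cluster parameters; in the test itself, scale comes from
	// viper and the pod count comes from the ready schedulable nodes.
	nodes := 100
	scale := 3 // waves; each wave runs one pod per node

	// Each pod writes 1KB and then sleeps ~1s (kbRateInSeconds above).
	const bytesPerPodPerSec = 1024

	total := nodes * scale * bytesPerPodPerSec
	fmt.Printf("~%d KB/s of logs across the cluster\n", total/1024) // ~300 KB/s
}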