Merge pull request #24536 from jayunit100/LoggingSoak

Automatic merge from submit-queue

Logging soak

Implements #24427 

Needs 

- #24471 so that it doesn't clog test outputs at scale
- builds on the utility function added in support of #22869

cc @timothysc @kubernetes/sig-testing
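
For reviewers who want to try the new knobs locally: they are viper-only, mirroring the LoggingSoak struct added to TestContextType below. A minimal sketch of an e2e.json (values here are illustrative):

{
  "Cadvisor": {
    "MaxRetries": 6,
    "SleepDurationMS": 10000
  },
  "LoggingSoak": {
    "Scale": 2,
    "MilliSecondsBetweenWaves": 5000
  }
}

If no such file is present, the test falls back to its in-code defaults (scale 1, 5000ms between waves).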
This commit is contained in:
Kubernetes Submit Queue 2016-09-23 16:06:47 -07:00 committed by GitHub
commit 5374e48e60
3 changed files with 161 additions and 19 deletions


@@ -27,24 +27,6 @@ import (
	. "github.com/onsi/ginkgo"
)

// returns maxRetries, sleepDuration
func readConfig() (int, time.Duration) {
	// Read in configuration settings, reasonable defaults.
	retry := framework.TestContext.Cadvisor.MaxRetries
	if framework.TestContext.Cadvisor.MaxRetries == 0 {
		retry = 6
		framework.Logf("Overriding default retry value of zero to %d", retry)
	}
	sleepDurationMS := framework.TestContext.Cadvisor.SleepDurationMS
	if sleepDurationMS == 0 {
		sleepDurationMS = 10000
		framework.Logf("Overriding default milliseconds value of zero to %d", sleepDurationMS)
	}
	return retry, time.Duration(sleepDurationMS) * time.Millisecond
}

var _ = framework.KubeDescribe("Cadvisor", func() {
	f := framework.NewDefaultFramework("cadvisor")
@@ -60,6 +42,25 @@ func CheckCadvisorHealthOnAllNodes(c *client.Client, timeout time.Duration) {
	nodeList, err := c.Nodes().List(api.ListOptions{})
	framework.ExpectNoError(err)
	var errors []error

	// returns maxRetries, sleepDuration
	readConfig := func() (int, time.Duration) {
		// Read in configuration settings, reasonable defaults.
		retry := framework.TestContext.Cadvisor.MaxRetries
		if framework.TestContext.Cadvisor.MaxRetries == 0 {
			retry = 6
			framework.Logf("Overriding default retry value of zero to %d", retry)
		}
		sleepDurationMS := framework.TestContext.Cadvisor.SleepDurationMS
		if sleepDurationMS == 0 {
			sleepDurationMS = 10000
			framework.Logf("Overriding default milliseconds value of zero to %d", sleepDurationMS)
		}
		return retry, time.Duration(sleepDurationMS) * time.Millisecond
	}
	maxRetries, sleepDuration := readConfig()
	for {
		errors = []error{}


@@ -79,6 +79,7 @@ type TestContextType struct {
	NodeTestContextType

	// Viper-only parameters. These will in time replace all flags.
	// Example: Create a file 'e2e.json' with the following:
	// 	"Cadvisor":{
	// 		"MaxRetries":"6"
@@ -89,6 +90,11 @@ type TestContextType struct {
		MaxRetries      int
		SleepDurationMS int
	}
	LoggingSoak struct {
		Scale                    int
		MilliSecondsBetweenWaves int
	}
}
// NodeTestContextType is part of TestContextType; it is shared by all node e2e tests.
@@ -205,7 +211,6 @@ func RegisterNodeFlags() {
// Enable viper configuration management of flags.
func ViperizeFlags() {
	// TODO @jayunit100: Maybe a more elegant viper-flag integration for the future?
	// For now, we layer it on top, because 'flag' deps of 'go test' make pflag wrappers
	// fragile, seeming to force 'flag' to have deep awareness of pflag params.
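
The hunk ends before the function body, so for context here is a plausible sketch of the layering the comment describes, assuming spf13/viper and the flag-registration helpers visible elsewhere in this file (an illustration, not the exact code merged):

func ViperizeFlags() {
	// Parse the ordinary flags first, as 'go test' expects.
	RegisterCommonFlags()
	RegisterClusterFlags()
	flag.Parse()

	// Then layer viper on top: read e2e.json from the working directory
	// and unmarshal it over TestContext, so viper-only parameters like
	// Cadvisor and LoggingSoak get populated.
	viper.SetConfigName("e2e")
	viper.AddConfigPath(".")
	viper.ReadInConfig()
	viper.Unmarshal(&TestContext)
}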

test/e2e/logging_soak.go (new file, 136 lines)

@@ -0,0 +1,136 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e

import (
	"fmt"
	"strconv"
	"strings"
	"sync"
	"time"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"

	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/test/e2e/framework"
)
var _ = framework.KubeDescribe("Logging soak [Performance] [Slow] [Disruptive]", func() {
f := framework.NewDefaultFramework("logging-soak")
// Not a global constant (irrelevant outside this test), also not a parameter (if you want more logs, use --scale=).
kbRateInSeconds := 1 * time.Second
totalLogTime := 2 * time.Minute
// This test is designed to run and confirm that logs are being generated at a large scale, and that they can be grabbed by the kubelet.
// By running it repeatedly in the background, you can simulate large collections of chatty containers.
// This can expose problems in your docker configuration (logging), log searching infrastructure, to tune deployments to match high load
// scenarios. TODO jayunit100 add this to the kube CI in a follow on infra patch.
// Returns scale (how many waves of pods).
// Returns wave interval (how many seconds to wait before dumping the next wave of pods).
readConfig := func() (int, time.Duration) {
// Read in configuration settings, reasonable defaults.
scale := framework.TestContext.LoggingSoak.Scale
if framework.TestContext.LoggingSoak.Scale == 0 {
scale = 1
framework.Logf("Overriding default scale value of zero to %d", scale)
}
milliSecondsBetweenWaves := framework.TestContext.LoggingSoak.MilliSecondsBetweenWaves
if milliSecondsBetweenWaves == 0 {
milliSecondsBetweenWaves = 5000
framework.Logf("Overriding default milliseconds value of zero to %d", milliSecondsBetweenWaves)
}
return scale, time.Duration(milliSecondsBetweenWaves) * time.Millisecond
}
scale, millisecondsBetweenWaves := readConfig()
It(fmt.Sprintf("should survive logging 1KB every %v seconds, for a duration of %v, scaling up to %v pods per node", kbRateInSeconds, totalLogTime, scale), func() {
defer GinkgoRecover()
var wg sync.WaitGroup
wg.Add(scale)
for i := 0; i < scale; i++ {
go func() {
wave := fmt.Sprintf("wave%v", strconv.Itoa(i))
framework.Logf("Starting logging soak, wave = %v", wave)
RunLogPodsWithSleepOf(f, kbRateInSeconds, wave, totalLogTime)
framework.Logf("Completed logging soak, wave %v", i)
wg.Done()
}()
// Niceness.
time.Sleep(millisecondsBetweenWaves)
}
framework.Logf("Waiting on all %v logging soak waves to complete", scale)
wg.Wait()
})
})
// RunLogPodsWithSleepOf creates a pod on every node which logs continuously (with "sleep"
// pauses between writes), and verifies that the log string was produced in each and every
// pod at least once. The final arg is the timeout for the test to verify that all the
// pods got logs.
func RunLogPodsWithSleepOf(f *framework.Framework, sleep time.Duration, podname string, timeout time.Duration) {

	nodes := framework.GetReadySchedulableNodesOrDie(f.Client)
	totalPods := len(nodes.Items)
	Expect(totalPods).NotTo(Equal(0))

	kilobyte := strings.Repeat("logs-123", 128) // "logs-123" is 8 bytes; 8*128 = 1024 = 1KB of text.

	appName := "logging-soak" + podname
	podLabels := f.CreatePodsPerNodeForSimpleApp(
		appName,
		func(n api.Node) api.PodSpec {
			return api.PodSpec{
				Containers: []api.Container{{
					Name:  "logging-soak",
					Image: "gcr.io/google_containers/busybox:1.24",
					Args: []string{
						"/bin/sh",
						"-c",
						fmt.Sprintf("while true ; do echo %v ; sleep %v ; done", kilobyte, sleep.Seconds()),
					},
				}},
				NodeName:      n.Name,
				RestartPolicy: api.RestartPolicyAlways,
			}
		},
		totalPods,
	)

	logSoakVerification := f.NewClusterVerification(
		framework.PodStateVerification{
			Selectors:   podLabels,
			ValidPhases: []api.PodPhase{api.PodRunning, api.PodSucceeded},
			// We don't validate total log data, since there is no guarantee that all
			// logs will be stored forever. Instead, we just validate that some logs are
			// being created on stdout.
			Verify: func(p api.Pod) (bool, error) {
				s, err := framework.LookForStringInLog(f.Namespace.Name, p.Name, "logging-soak", "logs-123", 1*time.Second)
				return s != "", err
			},
		},
	)

	largeClusterForgiveness := time.Duration(len(nodes.Items)/5) * time.Second // i.e. a 100 node cluster gets an extra 20 seconds to complete.
	pods, err := logSoakVerification.WaitFor(totalPods, timeout+largeClusterForgiveness)

	if err != nil {
		framework.Failf("Error in wait... %v", err)
	} else if len(pods) < totalPods {
		framework.Failf("Only got %v out of %v", len(pods), totalPods)
	}
}
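
For a rough sense of the load this generates: each pod echoes 1KB and then sleeps for kbRateInSeconds, so the aggregate rate is roughly nodes × scale × 1KB/s. A back-of-the-envelope sketch (the cluster numbers are made up for illustration):

package main

import "fmt"

func main() {
	// Hypothetical cluster parameters; in the test itself, scale comes from
	// viper and the pod count comes from the ready schedulable nodes.
	nodes := 100
	scale := 3 // waves; each wave runs one pod per node

	// Each pod writes 1KB and then sleeps ~1s (kbRateInSeconds above).
	const bytesPerPodPerSec = 1024

	total := nodes * scale * bytesPerPodPerSec
	fmt.Printf("~%d KB/s of logs across the cluster\n", total/1024) // ~300 KB/s
}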