scheduler_perf: only store log output after failures

Reconfiguring the logging infrastructure to write to a per-test output file
mimics the behavior of per-test output (log output is kept only for failures)
while still using the normal logging code, which is important for benchmarking.

To enable this behavior, the ARTIFACTS env variable must be set.
Patrick Ohly 2024-08-23 14:28:43 +02:00
parent 6cbdef850c
commit bf1188d292
3 changed files with 94 additions and 17 deletions
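The mechanism described in the commit message boils down to a small pattern around testing.TB: create a per-test log file when ARTIFACTS is set and keep it only if the test failed. The sketch below is a condensed, hypothetical illustration (the helper name redirectLogOutput is made up); the actual change, in setupTestCase in the last file of this diff, additionally sanitizes the test name and points the normal logging backends at the file via logsapi.ValidateAndApplyWithOptions.

```go
package example // illustrative sketch, not part of this commit

import (
	"os"
	"path"
	"testing"
)

// redirectLogOutput creates $ARTIFACTS/<test name>.log and registers a
// cleanup that removes the file again if the test passed.
func redirectLogOutput(t testing.TB) *os.File {
	artifacts, ok := os.LookupEnv("ARTIFACTS")
	if !ok {
		return nil // no ARTIFACTS directory -> keep logging to stderr
	}
	// Note: real test names contain '/', which the actual code replaces
	// with '_'; that sanitization is omitted here for brevity.
	logfileName := path.Join(artifacts, t.Name()+".log")
	out, err := os.Create(logfileName)
	if err != nil {
		t.Fatalf("Failed to create per-test log output file: %v", err)
	}
	t.Cleanup(func() {
		if err := out.Close(); err != nil {
			t.Errorf("Failed to close %s: %v", logfileName, err)
		}
		if !t.Failed() {
			// The test passed, so nobody needs its log output.
			if err := os.Remove(logfileName); err != nil {
				t.Errorf("Failed to remove %s: %v", logfileName, err)
			}
		}
	})
	return out
}
```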

View File

@@ -102,6 +102,15 @@ performance.
During interactive debugging sessions it is possible to enable per-test output
via -use-testing-log.
Log output can be quite large, in particular when running the large benchmarks
and when not using -use-testing-log. For benchmarks, we want to produce that
log output in a realistic way (i.e. write it to disk using the normal logging
backends) and capture the output of a specific test as part of the job results
only when that test failed. Therefore, if the ARTIFACTS env variable is set,
each test redirects its own output to a `$ARTIFACTS/<test name>.log` file and
removes that file only if the test passed.
### Integration tests
To run integration tests, use:

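To make the `$ARTIFACTS/<test name>.log` naming described above concrete: subtest names contain `/` and other characters that are awkward in file names, so they are replaced before the file gets created. This standalone snippet reuses the regexp that the last file of this diff introduces; the ARTIFACTS value and the test name are made-up examples.

```go
package main

import (
	"fmt"
	"path"
	"regexp"
)

// Same pattern as the specialFilenameChars variable added further down:
// everything that is not a letter, digit, '-' or '_' becomes '_'.
var specialFilenameChars = regexp.MustCompile(`[^a-zA-Z0-9-_]`)

func main() {
	artifacts := "/logs/artifacts" // example value of $ARTIFACTS
	testName := "BenchmarkPerfScheduling/SchedulingBasic/5000Nodes"
	logfileName := path.Join(artifacts, specialFilenameChars.ReplaceAllString(testName, "_")+".log")
	fmt.Println(logfileName)
	// Prints: /logs/artifacts/BenchmarkPerfScheduling_SchedulingBasic_5000Nodes.log
}
```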
View File

@@ -23,23 +23,15 @@ import (
"strings"
"testing"
- "k8s.io/apimachinery/pkg/util/runtime"
- "k8s.io/component-base/featuregate"
"k8s.io/component-base/logs"
logsapi "k8s.io/component-base/logs/api/v1"
_ "k8s.io/component-base/logs/json/register"
"k8s.io/kubernetes/test/utils/ktesting"
)
- // Run with -v=2, this is the default log level in production.
- //
- // In a PR this can be bumped up temporarily to run pull-kubernetes-scheduler-perf
- // with more log output.
- const defaultVerbosity = 2
func TestMain(m *testing.M) {
// Run with -v=2, this is the default log level in production.
- ktesting.SetDefaultVerbosity(defaultVerbosity)
+ ktesting.SetDefaultVerbosity(DefaultLoggingVerbosity)
// test/integration/framework/flags.go unconditionally initializes the
// logging flags. That's correct for most tests, but in the
@@ -56,21 +48,17 @@ func TestMain(m *testing.M) {
})
flag.CommandLine = &fs
- featureGate := featuregate.NewFeatureGate()
- runtime.Must(logsapi.AddFeatureGates(featureGate))
- flag.Var(featureGate, "feature-gate",
+ flag.Var(LoggingFeatureGate, "feature-gate",
"A set of key=value pairs that describe feature gates for alpha/experimental features. "+
- "Options are:\n"+strings.Join(featureGate.KnownFeatures(), "\n"))
- c := logsapi.NewLoggingConfiguration()
- c.Verbosity = defaultVerbosity
+ "Options are:\n"+strings.Join(LoggingFeatureGate.KnownFeatures(), "\n"))
// This would fail if we hadn't removed the logging flags above.
- logsapi.AddGoFlags(c, flag.CommandLine)
+ logsapi.AddGoFlags(LoggingConfig, flag.CommandLine)
flag.Parse()
logs.InitLogs()
- if err := logsapi.ValidateAndApply(c, featureGate); err != nil {
+ if err := logsapi.ValidateAndApply(LoggingConfig, LoggingFeatureGate); err != nil {
fmt.Fprintf(os.Stderr, "%v\n", err)
os.Exit(1)
}

View File

@@ -26,6 +26,7 @@ import (
"math"
"os"
"path"
"regexp"
"strings"
"sync"
"testing"
@@ -38,6 +39,7 @@ import (
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
utilfeature "k8s.io/apiserver/pkg/util/feature"
cacheddiscovery "k8s.io/client-go/discovery/cached/memory"
@@ -48,6 +50,7 @@ import (
"k8s.io/client-go/restmapper"
"k8s.io/component-base/featuregate"
featuregatetesting "k8s.io/component-base/featuregate/testing"
logsapi "k8s.io/component-base/logs/api/v1"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
@@ -94,6 +97,29 @@ const (
pluginLabelName = "plugin"
)
// Run with -v=2, this is the default log level in production.
//
// In a PR this can be bumped up temporarily to run pull-kubernetes-scheduler-perf
// with more log output.
const DefaultLoggingVerbosity = 2
var LoggingFeatureGate FeatureGateFlag
var LoggingConfig *logsapi.LoggingConfiguration
type FeatureGateFlag interface {
featuregate.FeatureGate
flag.Value
}
func init() {
f := featuregate.NewFeatureGate()
runtime.Must(logsapi.AddFeatureGates(f))
LoggingFeatureGate = f
LoggingConfig = logsapi.NewLoggingConfiguration()
LoggingConfig.Verbosity = DefaultLoggingVerbosity
}
var (
defaultMetricsCollectorConfig = metricsCollectorConfig{
Metrics: map[string][]*labelValues{
@@ -764,8 +790,62 @@ func initTestOutput(tb testing.TB) io.Writer {
var perfSchedulingLabelFilter = flag.String("perf-scheduling-label-filter", "performance", "comma-separated list of labels which a testcase must have (no prefix or +) or must not have (-), used by BenchmarkPerfScheduling")
var specialFilenameChars = regexp.MustCompile(`[^a-zA-Z0-9-_]`)
func setupTestCase(t testing.TB, tc *testCase, output io.Writer, outOfTreePluginRegistry frameworkruntime.Registry) (informers.SharedInformerFactory, ktesting.TContext) {
tCtx := ktesting.Init(t, initoption.PerTestOutput(*useTestingLog))
artifacts, doArtifacts := os.LookupEnv("ARTIFACTS")
if !*useTestingLog && doArtifacts {
// Reconfigure logging so that it goes to a separate file per
// test instead of stderr. If the test passes, the file gets
// deleted. The overall output can be very large (> 200 MB for
// ci-benchmark-scheduler-perf-master). With this approach, we
// have log output for failures without having to store large
// amounts of data that no-one is looking at. The performance
// is the same as writing to stderr.
if err := logsapi.ResetForTest(LoggingFeatureGate); err != nil {
t.Fatalf("Failed to reset the logging configuration: %v", err)
}
logfileName := path.Join(artifacts, specialFilenameChars.ReplaceAllString(t.Name(), "_")+".log")
out, err := os.Create(logfileName)
if err != nil {
t.Fatalf("Failed to create per-test log output file: %v", err)
}
t.Cleanup(func() {
// Everything should have stopped by now, checked below
// by GoleakCheck (which runs first during test
// shutdown!). Therefore we can clean up. Errors get logged
// and fail the test, but cleanup tries to continue.
//
// Note that the race detector will flag any goroutine
// as causing a race if there is no explicit wait for
// that goroutine to stop. We know that they must have
// stopped (GoLeakCheck!) but the race detector
// doesn't.
//
// This is a major issue because many Kubernetes goroutines get
// started without waiting for them to stop :-(
if err := logsapi.ResetForTest(LoggingFeatureGate); err != nil {
t.Errorf("Failed to reset the logging configuration: %v", err)
}
if err := out.Close(); err != nil {
t.Errorf("Failed to close the per-test log output file: %s: %v", logfileName, err)
}
if !t.Failed() {
if err := os.Remove(logfileName); err != nil {
t.Errorf("Failed to remove the per-test log output file: %v", err)
}
}
})
opts := &logsapi.LoggingOptions{
ErrorStream: out,
InfoStream: out,
}
if err := logsapi.ValidateAndApplyWithOptions(LoggingConfig, opts, LoggingFeatureGate); err != nil {
t.Fatalf("Failed to apply the per-test logging configuration: %v", err)
}
}
// Ensure that there are no leaked
// goroutines. They could influence