diff --git a/cmd/kube-apiserver/app/testing/testserver.go b/cmd/kube-apiserver/app/testing/testserver.go
index 6969cb70662..3c3b9e55ecd 100644
--- a/cmd/kube-apiserver/app/testing/testserver.go
+++ b/cmd/kube-apiserver/app/testing/testserver.go
@@ -150,7 +150,11 @@ func NewDefaultTestServerOptions() *TestServerInstanceOptions {
 // files that because Golang testing's call to os.Exit will not give a stop channel go routine
 // enough time to remove temporary files.
 func StartTestServer(t ktesting.TB, instanceOptions *TestServerInstanceOptions, customFlags []string, storageConfig *storagebackend.Config) (result TestServer, err error) {
-	tCtx := ktesting.Init(t)
+	// Some callers may have initialized ktesting already.
+	tCtx, ok := t.(ktesting.TContext)
+	if !ok {
+		tCtx = ktesting.Init(t)
+	}
 
 	if instanceOptions == nil {
 		instanceOptions = NewDefaultTestServerOptions()
diff --git a/test/integration/framework/etcd.go b/test/integration/framework/etcd.go
index d816f623dca..9f2f4b943c2 100644
--- a/test/integration/framework/etcd.go
+++ b/test/integration/framework/etcd.go
@@ -26,6 +26,7 @@ import (
 	"os/exec"
 	"strconv"
 	"strings"
+	"sync"
 	"syscall"
 	"testing"
 	"time"
@@ -146,7 +147,10 @@ func RunCustomEtcd(dataDir string, customFlags []string, output io.Writer) (url
 		// try to exit etcd gracefully
 		defer cancel()
 		cmd.Process.Signal(syscall.SIGTERM)
+		var wg sync.WaitGroup
+		wg.Add(1)
 		go func() {
+			defer wg.Done()
 			select {
 			case <-ctx.Done():
 				klog.Infof("etcd exited gracefully, context cancelled")
@@ -156,6 +160,7 @@ func RunCustomEtcd(dataDir string, customFlags []string, output io.Writer) (url
 			}
 		}()
 		err := cmd.Wait()
+		wg.Wait()
 		klog.Infof("etcd exit status: %v", err)
 		err = os.RemoveAll(etcdDataDir)
 		if err != nil {
diff --git a/test/integration/framework/perf_utils.go b/test/integration/framework/perf_utils.go
index bb1d2704b82..566308355fc 100644
--- a/test/integration/framework/perf_utils.go
+++ b/test/integration/framework/perf_utils.go
@@ -18,6 +18,7 @@ package framework
 
 import (
 	"context"
+	"fmt"
 
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
@@ -109,20 +110,19 @@ func (p *IntegrationTestNodePreparer) PrepareNodes(ctx context.Context, nextNode
 			}
 		}
 		if err != nil {
-			klog.Fatalf("Error creating node: %v", err)
+			return fmt.Errorf("creating node: %w", err)
 		}
 	}
 
 	nodes, err := waitListAllNodes(p.client)
 	if err != nil {
-		klog.Fatalf("Error listing nodes: %v", err)
+		return fmt.Errorf("listing nodes: %w", err)
 	}
 	index := nextNodeIndex
 	for _, v := range p.countToStrategy {
 		for i := 0; i < v.Count; i, index = i+1, index+1 {
 			if err := testutils.DoPrepareNode(ctx, p.client, &nodes.Items[index], v.Strategy); err != nil {
-				klog.Errorf("Aborting node preparation: %v", err)
-				return err
+				return fmt.Errorf("aborting node preparation: %w", err)
 			}
 		}
 	}
diff --git a/test/integration/scheduler_perf/README.md b/test/integration/scheduler_perf/README.md
index 46d7423cad3..682c7afd991 100644
--- a/test/integration/scheduler_perf/README.md
+++ b/test/integration/scheduler_perf/README.md
@@ -102,6 +102,15 @@ performance.
 During interactive debugging sessions it is possible to enable per-test output
 via -use-testing-log.
 
+Log output can be quite large, in particular when running the large benchmarks
+and when not using -use-testing-log. For benchmarks, we want to produce that
+log output in a realistic way (= write to disk using the normal logging
+backends) and only capture the output of a specific test as part of the job
+results when that test failed. Therefore, if the ARTIFACTS env variable is
+set, each test redirects its own output to a `$ARTIFACTS/<test name>.log`
+file and removes that file only if the test passed.
+
+
 ### Integration tests
 
 To run integration tests, use:
diff --git a/test/integration/scheduler_perf/main_test.go b/test/integration/scheduler_perf/main_test.go
index 86a4eaa337d..632693f8765 100644
--- a/test/integration/scheduler_perf/main_test.go
+++ b/test/integration/scheduler_perf/main_test.go
@@ -23,23 +23,15 @@ import (
 	"strings"
 	"testing"
 
-	"k8s.io/apimachinery/pkg/util/runtime"
-	"k8s.io/component-base/featuregate"
 	"k8s.io/component-base/logs"
 	logsapi "k8s.io/component-base/logs/api/v1"
 	_ "k8s.io/component-base/logs/json/register"
 	"k8s.io/kubernetes/test/utils/ktesting"
 )
 
-// Run with -v=2, this is the default log level in production.
-//
-// In a PR this can be bumped up temporarily to run pull-kubernetes-scheduler-perf
-// with more log output.
-const defaultVerbosity = 2
-
 func TestMain(m *testing.M) {
 	// Run with -v=2, this is the default log level in production.
-	ktesting.SetDefaultVerbosity(defaultVerbosity)
+	ktesting.SetDefaultVerbosity(DefaultLoggingVerbosity)
 
 	// test/integration/framework/flags.go unconditionally initializes the
 	// logging flags. That's correct for most tests, but in the
@@ -56,21 +48,17 @@ func TestMain(m *testing.M) {
 	})
 	flag.CommandLine = &fs
 
-	featureGate := featuregate.NewFeatureGate()
-	runtime.Must(logsapi.AddFeatureGates(featureGate))
-	flag.Var(featureGate, "feature-gate",
+	flag.Var(LoggingFeatureGate, "feature-gate",
 		"A set of key=value pairs that describe feature gates for alpha/experimental features. "+
-			"Options are:\n"+strings.Join(featureGate.KnownFeatures(), "\n"))
-	c := logsapi.NewLoggingConfiguration()
-	c.Verbosity = defaultVerbosity
+			"Options are:\n"+strings.Join(LoggingFeatureGate.KnownFeatures(), "\n"))
 
 	// This would fail if we hadn't removed the logging flags above.
-	logsapi.AddGoFlags(c, flag.CommandLine)
+	logsapi.AddGoFlags(LoggingConfig, flag.CommandLine)
 
 	flag.Parse()
 
 	logs.InitLogs()
-	if err := logsapi.ValidateAndApply(c, featureGate); err != nil {
+	if err := logsapi.ValidateAndApply(LoggingConfig, LoggingFeatureGate); err != nil {
 		fmt.Fprintf(os.Stderr, "%v\n", err)
 		os.Exit(1)
 	}
diff --git a/test/integration/scheduler_perf/scheduler_perf.go b/test/integration/scheduler_perf/scheduler_perf.go
index af571b21b0f..d08c643b172 100644
--- a/test/integration/scheduler_perf/scheduler_perf.go
+++ b/test/integration/scheduler_perf/scheduler_perf.go
@@ -26,6 +26,7 @@ import (
 	"math"
 	"os"
 	"path"
+	"regexp"
 	"strings"
 	"sync"
 	"testing"
@@ -38,6 +39,7 @@ import (
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/apimachinery/pkg/util/wait"
 	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	cacheddiscovery "k8s.io/client-go/discovery/cached/memory"
@@ -48,6 +50,7 @@ import (
 	"k8s.io/client-go/restmapper"
 	"k8s.io/component-base/featuregate"
 	featuregatetesting "k8s.io/component-base/featuregate/testing"
+	logsapi "k8s.io/component-base/logs/api/v1"
 	"k8s.io/component-base/metrics/legacyregistry"
 	"k8s.io/klog/v2"
 	"k8s.io/kubernetes/pkg/scheduler/apis/config"
@@ -94,6 +97,29 @@ const (
 	pluginLabelName = "plugin"
 )
 
+// Run with -v=2; this is the default log level in production.
+//
+// In a PR this can be bumped up temporarily to run pull-kubernetes-scheduler-perf
+// with more log output.
+const DefaultLoggingVerbosity = 2
+
+var LoggingFeatureGate FeatureGateFlag
+var LoggingConfig *logsapi.LoggingConfiguration
+
+type FeatureGateFlag interface {
+	featuregate.FeatureGate
+	flag.Value
+}
+
+func init() {
+	f := featuregate.NewFeatureGate()
+	runtime.Must(logsapi.AddFeatureGates(f))
+	LoggingFeatureGate = f
+
+	LoggingConfig = logsapi.NewLoggingConfiguration()
+	LoggingConfig.Verbosity = DefaultLoggingVerbosity
+}
+
 var (
 	defaultMetricsCollectorConfig = metricsCollectorConfig{
 		Metrics: map[string][]*labelValues{
@@ -760,8 +786,62 @@ func initTestOutput(tb testing.TB) io.Writer {
 
 var perfSchedulingLabelFilter = flag.String("perf-scheduling-label-filter", "performance", "comma-separated list of labels which a testcase must have (no prefix or +) or must not have (-), used by BenchmarkPerfScheduling")
 
+var specialFilenameChars = regexp.MustCompile(`[^a-zA-Z0-9-_]`)
+
 func setupTestCase(t testing.TB, tc *testCase, output io.Writer, outOfTreePluginRegistry frameworkruntime.Registry) (informers.SharedInformerFactory, ktesting.TContext) {
 	tCtx := ktesting.Init(t, initoption.PerTestOutput(*useTestingLog))
+	artifacts, doArtifacts := os.LookupEnv("ARTIFACTS")
+	if !*useTestingLog && doArtifacts {
+		// Reconfigure logging so that it goes to a separate file per
+		// test instead of stderr. If the test passes, the file gets
+		// deleted. The overall output can be very large (> 200 MB for
+		// ci-benchmark-scheduler-perf-master). With this approach, we
+		// have log output for failures without having to store large
+		// amounts of data that no-one is looking at. The performance
+		// is the same as writing to stderr.
+		if err := logsapi.ResetForTest(LoggingFeatureGate); err != nil {
+			t.Fatalf("Failed to reset the logging configuration: %v", err)
+		}
+		logfileName := path.Join(artifacts, specialFilenameChars.ReplaceAllString(t.Name(), "_")+".log")
+		out, err := os.Create(logfileName)
+		if err != nil {
+			t.Fatalf("Failed to create per-test log output file: %v", err)
+		}
+		t.Cleanup(func() {
+			// Everything should have stopped by now; this is checked
+			// below by GoleakCheck (which runs first during test
+			// shutdown!). Therefore we can clean up. Errors get logged
+			// and fail the test, but cleanup tries to continue.
+			//
+			// Note that the race detector will flag any goroutine
+			// as causing a race if there is no explicit wait for
+			// that goroutine to stop. We know that they must have
+			// stopped (GoleakCheck!) but the race detector
+			// doesn't.
+			//
+			// This is a major issue because many Kubernetes goroutines get
+			// started without waiting for them to stop :-(
+			if err := logsapi.ResetForTest(LoggingFeatureGate); err != nil {
+				t.Errorf("Failed to reset the logging configuration: %v", err)
+			}
+			if err := out.Close(); err != nil {
+				t.Errorf("Failed to close the per-test log output file: %s: %v", logfileName, err)
+			}
+			if !t.Failed() {
+				if err := os.Remove(logfileName); err != nil {
+					t.Errorf("Failed to remove the per-test log output file: %v", err)
+				}
+			}
+		})
+		opts := &logsapi.LoggingOptions{
+			ErrorStream: out,
+			InfoStream:  out,
+		}
+		if err := logsapi.ValidateAndApplyWithOptions(LoggingConfig, opts, LoggingFeatureGate); err != nil {
+			t.Fatalf("Failed to apply the per-test logging configuration: %v", err)
+		}
+
+	}
 
 	// Ensure that there are no leaked
 	// goroutines. They could influence
 	// performance of the next benchmark.
diff --git a/test/utils/ktesting/contexthelper.go b/test/utils/ktesting/contexthelper.go
index 28d2b245934..bcfdd4d284e 100644
--- a/test/utils/ktesting/contexthelper.go
+++ b/test/utils/ktesting/contexthelper.go
@@ -59,6 +59,14 @@ func withTimeout(ctx context.Context, tb TB, timeout time.Duration, timeoutCause
 			// No need to set a cause here. The cause or error of
 			// the parent context will be used.
 		case <-after.C:
+			// Code using this tCtx may or may not log the
+			// timeout cause when it runs into the
+			// cancellation. It's better if we do it, just to be on
+			// the safe side.
+			//
+			// It would be nice to log this with the source code location
+			// of our caller, but testing.Logf does not support that.
+			tb.Logf("\nWARNING: %s\n", timeoutCause)
 			cancel(canceledError(timeoutCause))
 		}
 	}()
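
Two of the changes above may be easier to follow outside the diff. First, the per-test log redirection added to `setupTestCase`: the sketch below shows the same pattern in isolation. The helper name `redirectTestOutput` and the `applyOutput`/`resetOutput` callbacks are hypothetical stand-ins for `logsapi.ValidateAndApplyWithOptions` and `logsapi.ResetForTest`; only the ARTIFACTS lookup, the filename sanitization, and the keep-on-failure cleanup mirror the actual change.

```go
package example

import (
	"os"
	"path/filepath"
	"regexp"
	"testing"
)

var specialFilenameChars = regexp.MustCompile(`[^a-zA-Z0-9-_]`)

// redirectTestOutput writes a test's log output to $ARTIFACTS/<test name>.log
// and removes the file again if the test passes. applyOutput and resetOutput
// are hypothetical hooks standing in for the logsapi calls in the diff above.
func redirectTestOutput(t *testing.T, applyOutput func(*os.File) error, resetOutput func() error) {
	artifacts, ok := os.LookupEnv("ARTIFACTS")
	if !ok {
		return // no artifacts directory configured, keep the default output
	}
	logfileName := filepath.Join(artifacts, specialFilenameChars.ReplaceAllString(t.Name(), "_")+".log")
	out, err := os.Create(logfileName)
	if err != nil {
		t.Fatalf("Failed to create per-test log output file: %v", err)
	}
	t.Cleanup(func() {
		// Restore the default logging configuration before touching
		// the file, then keep the file only when the test failed.
		if err := resetOutput(); err != nil {
			t.Errorf("Failed to reset the logging configuration: %v", err)
		}
		if err := out.Close(); err != nil {
			t.Errorf("Failed to close %s: %v", logfileName, err)
		}
		if !t.Failed() {
			if err := os.Remove(logfileName); err != nil {
				t.Errorf("Failed to remove %s: %v", logfileName, err)
			}
		}
	})
	if err := applyOutput(out); err != nil {
		t.Fatalf("Failed to apply the per-test logging configuration: %v", err)
	}
}
```

Keeping the file only on failure is what lets CI archive useful logs without storing hundreds of megabytes for passing runs.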
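Second, the `sync.WaitGroup` added in `etcd.go` follows a general rule: any goroutine started during shutdown must be waited for explicitly, otherwise the race detector cannot prove that it stopped before cleanup touches shared state. A simplified sketch of that pattern, assuming the process was already started; the function name, the `gracePeriod` parameter, and the `done` channel (used here instead of the context from the diff) are illustrative only.

```go
package example

import (
	"os/exec"
	"sync"
	"syscall"
	"time"
)

// stopWithWatchdog asks a started process to exit gracefully and kills it
// if it does not do so within gracePeriod. Without the wg.Wait, the watchdog
// goroutine may still be running when the caller starts cleaning up (e.g.
// removing a data directory), which the race detector reports as a race.
func stopWithWatchdog(cmd *exec.Cmd, gracePeriod time.Duration) error {
	_ = cmd.Process.Signal(syscall.SIGTERM)

	done := make(chan struct{})
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		select {
		case <-done:
			// Process exited in time, nothing to do.
		case <-time.After(gracePeriod):
			// Grace period expired, force a shutdown.
			_ = cmd.Process.Kill()
		}
	}()

	err := cmd.Wait()
	close(done) // let the watchdog exit
	wg.Wait()   // explicit synchronization point for the race detector
	return err
}
```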