From 97a8d72a678c75a5050f3f2a61e33ac04a1074ca Mon Sep 17 00:00:00 2001
From: Patrick Ohly
Date: Wed, 21 Dec 2022 11:51:11 +0100
Subject: [PATCH] test/integration/logs: update benchmark support

When trying again with recent log files from the CI job, it was found
that some JSON messages get split across multiple lines, both in
container logs and in the systemd journal:

    2022-12-21T07:09:47.914739996Z stderr F {"ts":1671606587914.691,"caller":"rest/request.go:1169","msg":"Response ...
    2022-12-21T07:09:47.914984628Z stderr F 70 72 6f 78 79 10 01 1a 13 53 ... \".|\n","v":8}

Note the different time stamp on the second line. That first line is
long (17384 bytes). This seems to happen because the data must pass
through a stream-oriented pipe and thus may get split up by the Linux
kernel.

The implication is that lines must get merged whenever the JSON decoder
encounters an incomplete line. The benchmark loader now supports that.

To simplify this, stripping the non-JSON line prefixes must be done
before using a log as test data. The updated README explains how to do
that when downloading a CI job result.

The amount of manual work gets reduced by the new get-logs.sh script:
it downloads the log files of a CI job, strips the line prefixes, and
copies the results to the expected locations under data, where Git
ignores them. Support for symlinks gets removed and path/filepath is
used instead of path because it has better Windows support.
---
 test/integration/logs/benchmark/.gitignore    | 10 +++
 test/integration/logs/benchmark/README.md     | 24 +++++-
 .../logs/benchmark/benchmark_test.go          | 24 +++---
 .../logs/benchmark/data/container.log         |  2 +-
 .../integration/logs/benchmark/data/split.log |  3 +
 test/integration/logs/benchmark/get-logs.sh   | 73 +++++++++++++++++++
 test/integration/logs/benchmark/load.go       | 37 ++++++----
 test/integration/logs/benchmark/load_test.go  | 25 +++++++
 8 files changed, 170 insertions(+), 28 deletions(-)
 create mode 100644 test/integration/logs/benchmark/.gitignore
 create mode 100644 test/integration/logs/benchmark/data/split.log
 create mode 100755 test/integration/logs/benchmark/get-logs.sh

diff --git a/test/integration/logs/benchmark/.gitignore b/test/integration/logs/benchmark/.gitignore
new file mode 100644
index 00000000000..16ae08cff0b
--- /dev/null
+++ b/test/integration/logs/benchmark/.gitignore
@@ -0,0 +1,10 @@
+# Created by get-logs.sh:
+/ci-kubernetes-kind-e2e-json-logging
+/data/kind-worker-kubelet.log
+/data/kube-apiserver.log
+/data/kube-controller-manager.log
+/data/kube-scheduler.log
+/data/v3/kind-worker-kubelet.log
+/data/v3/kube-apiserver.log
+/data/v3/kube-controller-manager.log
+/data/v3/kube-scheduler.log
diff --git a/test/integration/logs/benchmark/README.md b/test/integration/logs/benchmark/README.md
index d68a9a8deda..faa63d054e8 100644
--- a/test/integration/logs/benchmark/README.md
+++ b/test/integration/logs/benchmark/README.md
@@ -28,9 +28,29 @@ Prow job:
 - `artifacts/logs/kind-control-plane/containers`
 - `artifacts/logs/kind-*/kubelet.log`
 
-With sufficient credentials, `gsutil` can be used to download everything for a job with:
+With sufficient credentials, `gsutil` can be used to download everything for a job directly
+into a directory that will then be used by the benchmarks automatically:
+
 ```
-gsutil -m cp -R gs://kubernetes-jenkins/logs/ci-kubernetes-kind-e2e-json-logging/ .
+kubernetes$ test/integration/logs/benchmark/get-logs.sh
+++ dirname test/integration/logs/benchmark/get-logs.sh
++ cd test/integration/logs/benchmark
+++ latest_job
+++ gsutil cat gs://kubernetes-jenkins/logs/ci-kubernetes-kind-e2e-json-logging/latest-build.txt
++ job=1618864842834186240
++ rm -rf ci-kubernetes-kind-e2e-json-logging
++ mkdir ci-kubernetes-kind-e2e-json-logging
+...
+```
+
+This sets up the `data` directory so that additional test cases are available
+(`BenchmarkEncoding/v3/kind-worker-kubelet/`,
+`BenchmarkEncoding/kube-scheduler/`, etc.).
+
+
+To clean up, use
+```
+git clean -fx test/integration/logs/benchmark
 ```
 
 ## Analyzing log data
diff --git a/test/integration/logs/benchmark/benchmark_test.go b/test/integration/logs/benchmark/benchmark_test.go
index 6c071ce38cd..e45eed423f8 100644
--- a/test/integration/logs/benchmark/benchmark_test.go
+++ b/test/integration/logs/benchmark/benchmark_test.go
@@ -23,7 +23,6 @@ import (
 	"io"
 	"io/fs"
 	"os"
-	"path"
 	"path/filepath"
 	"regexp"
 	"strconv"
@@ -38,10 +37,11 @@ import (
 )
 
 func BenchmarkEncoding(b *testing.B) {
+	seen := map[string]bool{}
+
 	// Each "data/(v[0-9]/)?*.log" file is expected to contain JSON log
 	// messages. We generate one sub-benchmark for each file where logging
-	// is tested with the log level from the directory. Symlinks can be
-	// used to test the same file with different levels.
+	// is tested with the log level from the directory.
 	if err := filepath.Walk("data", func(path string, info fs.FileInfo, err error) error {
 		if err != nil {
 			return err
@@ -53,8 +53,14 @@ func BenchmarkEncoding(b *testing.B) {
 		if err != nil {
 			return err
 		}
-		if info.Mode()&fs.ModeSymlink == 0 {
-			b.Log(path + "\n" + stats.String())
+		// Only print unique file statistics. They get shown for the
+		// first file where new statistics are encountered. The
+		// assumption here is that there are no files with different
+		// content and exactly the same statistics.
+		statsStr := stats.String()
+		if !seen[statsStr] {
+			b.Log(path + "\n" + statsStr)
+			seen[statsStr] = true
 		}
 		b.Run(strings.TrimSuffix(strings.TrimPrefix(path, "data/"), ".log"), func(b *testing.B) {
 			// Take verbosity threshold from directory, if present.
@@ -108,7 +114,7 @@ func BenchmarkEncoding(b *testing.B) {
 				fileSizes["JSON"] = int(output) / b.N
 			})
 
-			b.Log(fmt.Sprintf("file sizes: %v\n", fileSizes))
+			b.Log(fmt.Sprintf("%s: file sizes: %v\n", path, fileSizes))
 		})
 		return nil
 	}); err != nil {
@@ -164,7 +170,7 @@ func benchmarkOutputFormats(b *testing.B, config loadGeneratorConfig, discard bo
 	var out *os.File
 	if !discard {
 		var err error
-		out, err = os.Create(path.Join(tmpDir, "all.log"))
+		out, err = os.Create(filepath.Join(tmpDir, "all.log"))
 		if err != nil {
 			b.Fatal(err)
 		}
@@ -177,12 +183,12 @@ func benchmarkOutputFormats(b *testing.B, config loadGeneratorConfig, discard bo
 	var out1, out2 *os.File
 	if !discard {
 		var err error
-		out1, err = os.Create(path.Join(tmpDir, "stream-1.log"))
+		out1, err = os.Create(filepath.Join(tmpDir, "stream-1.log"))
 		if err != nil {
 			b.Fatal(err)
 		}
 		defer out1.Close()
-		out2, err = os.Create(path.Join(tmpDir, "stream-2.log"))
+		out2, err = os.Create(filepath.Join(tmpDir, "stream-2.log"))
 		if err != nil {
 			b.Fatal(err)
 		}
diff --git a/test/integration/logs/benchmark/data/container.log b/test/integration/logs/benchmark/data/container.log
index 27ea85e898d..9c4afa10f66 100644
--- a/test/integration/logs/benchmark/data/container.log
+++ b/test/integration/logs/benchmark/data/container.log
@@ -1,2 +1,2 @@
 # This is a manually created message. See https://github.com/kubernetes/kubernetes/issues/106652 for the real one.
-Nov 19 02:13:48 kind-worker2 kubelet[250]: {"ts":1637288028968.0125,"caller":"kuberuntime/kuberuntime_manager.go:902","msg":"Creating container in pod","v":0,"container":{"Name":"terminate-cmd-rpn","Image":"registry.k8s.io/e2e-test-images/busybox:1.29-2","Command":["sh -c \nf=/restart-count/restartCount\ncount=$(echo 'hello' >> $f ; wc -l $f | awk {'print $1'})\nif [ $count -eq 1 ]; then\n\texit 1\nfi\nif [ $count -eq 2 ]; then\n\texit 0\nfi\nwhile true; do sleep 1; done\n"],"TerminationMessagePath":"/dev/termination-log"}}
+{"ts":1637288028968.0125,"caller":"kuberuntime/kuberuntime_manager.go:902","msg":"Creating container in pod","v":0,"container":{"Name":"terminate-cmd-rpn","Image":"registry.k8s.io/e2e-test-images/busybox:1.29-2","Command":["sh -c \nf=/restart-count/restartCount\ncount=$(echo 'hello' >> $f ; wc -l $f | awk {'print $1'})\nif [ $count -eq 1 ]; then\n\texit 1\nfi\nif [ $count -eq 2 ]; then\n\texit 0\nfi\nwhile true; do sleep 1; done\n"],"TerminationMessagePath":"/dev/termination-log"}}
diff --git a/test/integration/logs/benchmark/data/split.log b/test/integration/logs/benchmark/data/split.log
new file mode 100644
index 00000000000..1ca6885952f
--- /dev/null
+++ b/test/integration/logs/benchmark/data/split.log
@@ -0,0 +1,3 @@
+{"v": 0,
+"msg": "Pod status updated"}
+{"v": 0, "msg": "Pod status updated again"}
diff --git a/test/integration/logs/benchmark/get-logs.sh b/test/integration/logs/benchmark/get-logs.sh
new file mode 100755
index 00000000000..ec6c823a568
--- /dev/null
+++ b/test/integration/logs/benchmark/get-logs.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+
+# Copyright 2018 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Usage: get-logs.sh [<job ID>]
+#
+# Downloads the latest job output or the one with the specified ID
+# and prepares the data for running benchmarks against it.
+
+set -o pipefail
+set -o errexit
+set -x
+
+cd "$(dirname "$0")"
+
+latest_job () {
+    gsutil cat gs://kubernetes-jenkins/logs/ci-kubernetes-kind-e2e-json-logging/latest-build.txt
+}
+
+job=${1:-$(latest_job)}
+
+rm -rf ci-kubernetes-kind-e2e-json-logging
+mkdir ci-kubernetes-kind-e2e-json-logging
+gsutil -m cp -R "gs://kubernetes-jenkins/logs/ci-kubernetes-kind-e2e-json-logging/${job}/*" ci-kubernetes-kind-e2e-json-logging/
+
+for i in kube-apiserver kube-controller-manager kube-scheduler; do
+    # Before (container runtime log dump (?)):
+    # 2023-03-07T07:30:52.193301924Z stderr F {"ts":1678174252192.0676,"caller":"scheduler/schedule_one.go:81","msg":"Attempting to schedule pod","v":3,"pod":{"name":"simpletest.rc-zgd47","namespace":"gc-5422"}}
+    # After:
+    # {"ts":1678174252192.0676,"caller":"scheduler/schedule_one.go:81","msg":"Attempting to schedule pod","v":3,"pod":{"name":"simpletest.rc-zgd47","namespace":"gc-5422"}}
+    sed -e 's/^20[^ ]* stderr . //' \
+        ci-kubernetes-kind-e2e-json-logging/artifacts/kind-control-plane/containers/$i-*.log \
+        > ci-kubernetes-kind-e2e-json-logging/$i.log;
+done
+
+# Before (systemd format):
+# Mar 07 07:22:05 kind-control-plane kubelet[288]: {"ts":1678173725722.4487,"caller":"flag/flags.go:64","msg":"FLAG: --address=\"0.0.0.0\"\n","v":1}
+# After:
+# {"ts":1678173725722.4487,"caller":"flag/flags.go:64","msg":"FLAG: --address=\"0.0.0.0\"\n","v":1}
+grep 'kind-worker kubelet' ci-kubernetes-kind-e2e-json-logging/artifacts/kind-worker/kubelet.log | \
+    sed -e 's;^.* kind-worker kubelet[^ ]*: ;;' > ci-kubernetes-kind-e2e-json-logging/kind-worker-kubelet.log
+
+# Create copies of the actual files, whether they already exist or not. To
+# clean up disk space, use "git clean -fx test/integration/logs/benchmark".
+copy () {
+    from="$1"
+    to="$2"
+
+    mkdir -p "$(dirname "$to")"
+    rm -f "$to"
+    cp "$from" "$to"
+}
+copy ci-kubernetes-kind-e2e-json-logging/kind-worker-kubelet.log data/kind-worker-kubelet.log
+copy ci-kubernetes-kind-e2e-json-logging/kube-apiserver.log data/kube-apiserver.log
+copy ci-kubernetes-kind-e2e-json-logging/kube-controller-manager.log data/kube-controller-manager.log
+copy ci-kubernetes-kind-e2e-json-logging/kube-scheduler.log data/kube-scheduler.log
+
+copy ci-kubernetes-kind-e2e-json-logging/kind-worker-kubelet.log data/v3/kind-worker-kubelet.log
+copy ci-kubernetes-kind-e2e-json-logging/kube-apiserver.log data/v3/kube-apiserver.log
+copy ci-kubernetes-kind-e2e-json-logging/kube-controller-manager.log data/v3/kube-controller-manager.log
+copy ci-kubernetes-kind-e2e-json-logging/kube-scheduler.log data/v3/kube-scheduler.log
diff --git a/test/integration/logs/benchmark/load.go b/test/integration/logs/benchmark/load.go
index ccb5727b146..f6afa20604a 100644
--- a/test/integration/logs/benchmark/load.go
+++ b/test/integration/logs/benchmark/load.go
@@ -29,7 +29,7 @@ import (
 	"strings"
 	"text/template"
 
-	"k8s.io/api/core/v1"
+	v1 "k8s.io/api/core/v1"
 	"k8s.io/klog/v2"
 )
 
@@ -52,7 +52,7 @@ const (
 )
 
 type logStats struct {
-	TotalLines, JsonLines, ErrorMessages int
+	TotalLines, JsonLines, SplitLines, ErrorMessages int
 
 	ArgCounts map[string]int
 	OtherLines []string
@@ -73,10 +73,11 @@ var (
 			return x - y
 		},
 	}).Parse(`Total number of lines: {{.TotalLines}}
+JSON line continuation: {{.SplitLines}}
 Valid JSON messages: {{.JsonLines}} ({{percent .JsonLines .TotalLines}} of total lines)
 Error messages: {{.ErrorMessages}} ({{percent .ErrorMessages .JsonLines}} of valid JSON messages)
-Unrecognized lines: {{sub .TotalLines .JsonLines}}
-{{range .OtherLines}} {{.}}
+Unrecognized lines: {{sub (sub .TotalLines .JsonLines) .SplitLines}}
+{{range .OtherLines}} {{if gt (len .) 80}}{{slice . 0 80}}{{else}}{{.}}{{end}}
 {{end}}
 Args:
   total: {{if .ArgCounts.total}}{{.ArgCounts.total}}{{else}}0{{end}}{{if .ArgCounts.string}}
@@ -119,14 +120,28 @@ func loadLog(path string) (messages []logMessage, stats logStats, err error) {
 	stats.ArgCounts = map[string]int{}
 
 	scanner := bufio.NewScanner(file)
+	var buffer bytes.Buffer
 	for lineNo := 0; scanner.Scan(); lineNo++ {
+		stats.TotalLines++
 		line := scanner.Bytes()
-		msg, err := parseLine(line, &stats)
+		buffer.Write(line)
+		msg, err := parseLine(buffer.Bytes(), &stats)
 		if err != nil {
+			// JSON might have been split across multiple lines.
+			var jsonErr *json.SyntaxError
+			if errors.As(err, &jsonErr) && jsonErr.Offset > 1 {
+				// The start of the buffer was okay. Keep the
+				// data and add the next line to it.
+				stats.SplitLines++
+				continue
+			}
 			stats.OtherLines = append(stats.OtherLines, fmt.Sprintf("%d: %s", lineNo, string(line)))
+			buffer.Reset()
 			continue
 		}
+		stats.JsonLines++
 		messages = append(messages, msg)
+		buffer.Reset()
 	}
 
 	if err := scanner.Err(); err != nil {
@@ -136,26 +151,16 @@ func loadLog(path string) (messages []logMessage, stats logStats, err error) {
 	return
 }
 
-// systemd prefix:
-// Nov 19 02:08:51 kind-worker2 kubelet[250]: {"ts":1637287731687.8315,...
-//
-// kubectl (?) prefix:
-// 2021-11-19T02:08:28.475825534Z stderr F {"ts": ...
-var prefixRE = regexp.MustCompile(`^\w+ \d+ \S+ \S+ \S+: |\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z stderr . `)
-
 // String format for API structs from generated.pb.go.
 // &Container{...}
 var objectRE = regexp.MustCompile(`^&([a-zA-Z]*)\{`)
 
 func parseLine(line []byte, stats *logStats) (item logMessage, err error) {
-	stats.TotalLines++
-	line = prefixRE.ReplaceAll(line, nil)
 	content := map[string]interface{}{}
 	if err := json.Unmarshal(line, &content); err != nil {
-		return logMessage{}, fmt.Errorf("JSON parsing failed: %v", err)
+		return logMessage{}, fmt.Errorf("JSON parsing failed: %w", err)
 	}
-	stats.JsonLines++
 
 	kvs := map[string]interface{}{}
 	item.isError = true
diff --git a/test/integration/logs/benchmark/load_test.go b/test/integration/logs/benchmark/load_test.go
index 5ccd2e0b1c8..bc95b5e8eb0 100644
--- a/test/integration/logs/benchmark/load_test.go
+++ b/test/integration/logs/benchmark/load_test.go
@@ -58,6 +58,31 @@ func TestData(t *testing.T) {
 				ArgCounts: map[string]int{},
 			},
 		},
+		"data/split.log": {
+			messages: []logMessage{
+				{
+					msg: "Pod status updated",
+				},
+				{
+					msg: "Pod status updated again",
+				},
+			},
+			printf: `Pod status updated: []
+Pod status updated again: []
+`,
+			structured: `"Pod status updated"
+"Pod status updated again"
+`,
+			json: `{"msg":"Pod status updated","v":0}
+{"msg":"Pod status updated again","v":0}
+`,
+			stats: logStats{
+				TotalLines: 3,
+				SplitLines: 1,
+				JsonLines:  2,
+				ArgCounts:  map[string]int{},
+			},
+		},
 		"data/error.log": {
 			messages: []logMessage{
 				{
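
Reviewer note (not part of the patch): the line-merging heuristic is easy to try in
isolation. The following self-contained sketch mirrors the new loadLog logic: it
accumulates physical lines in a buffer and treats a json.SyntaxError past the first
byte as "JSON started but is incomplete", so the next line gets appended. The main()
wrapper and the hard-coded input are made up for this illustration.

package main

import (
	"bufio"
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"strings"
)

func main() {
	// Same shape of input as data/split.log: the first message is
	// split across two physical lines, the second one is complete.
	input := `{"v": 0,
"msg": "Pod status updated"}
{"v": 0, "msg": "Pod status updated again"}
`
	scanner := bufio.NewScanner(strings.NewReader(input))
	var buffer bytes.Buffer
	for scanner.Scan() {
		buffer.Write(scanner.Bytes())
		var content map[string]interface{}
		if err := json.Unmarshal(buffer.Bytes(), &content); err != nil {
			var jsonErr *json.SyntaxError
			if errors.As(err, &jsonErr) && jsonErr.Offset > 1 {
				// The buffer starts like JSON but is incomplete
				// ("unexpected end of JSON input" is reported as
				// a SyntaxError past the first byte): keep the
				// data and append the next line to it.
				continue
			}
			// Not JSON at all: discard the buffer and start over.
			buffer.Reset()
			continue
		}
		fmt.Println("message:", content["msg"])
		buffer.Reset()
	}
	// Output:
	// message: Pod status updated
	// message: Pod status updated again
}

Running it prints both messages even though the first one arrives on two lines,
which matches the expectations of the data/split.log test case above.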