// restartCountLogFileRegex matches container log file names of the form
// "<restartCount>.log", optionally followed by a dot-separated suffix (for
// example "3.log.rotated"). The pattern is anchored at both ends and the dot
// before "log" is escaped, so names that merely contain digits followed by
// "log" (such as "notes-15.log" or "7xlog") are not treated as container logs.
var restartCountLogFileRegex = regexp.MustCompile(`^(\d+)\.log(\..*)?$`)

// calcRestartCountByLogDir derives the next restart count for a container from
// the log files found directly inside path.
//
// Every regular entry named "<N>.log" or "<N>.log.<suffix>" contributes the
// candidate N+1; the largest candidate is returned. Sub-directories and files
// whose names do not match are ignored.
//
// It returns (0, nil) when path does not exist or contains no matching file.
// Any other failure to read the directory is returned as an error together
// with a count of 0.
func calcRestartCountByLogDir(path string) (int, error) {
	files, err := ioutil.ReadDir(path)
	if err != nil {
		// A missing log directory simply means the container has never
		// run before, which is not an error.
		if os.IsNotExist(err) {
			return 0, nil
		}
		return 0, err
	}

	restartCount := 0
	for _, file := range files {
		if file.IsDir() {
			continue
		}
		matches := restartCountLogFileRegex.FindStringSubmatch(file.Name())
		if matches == nil {
			continue
		}
		count, err := strconv.Atoi(matches[1])
		if err != nil {
			// The capture group holds only digits, so the sole failure
			// mode is a value that does not fit in an int. Such a name is
			// not a plausible restart count; skip it rather than abort the
			// scan with a partial result.
			continue
		}
		if next := count + 1; next > restartCount {
			restartCount = next
		}
	}
	return restartCount, nil
}
// It starts the container through the following steps: // * pull the image @@ -150,6 +187,22 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb containerStatus := podStatus.FindContainerStatusByName(container.Name) if containerStatus != nil { restartCount = containerStatus.RestartCount + 1 + } else { + // The container runtime keeps state on container statuses and + // what the container restart count is. When nodes are rebooted + // some container runtimes clear their state which causes the + // restartCount to be reset to 0. This causes the logfile to + // start at 0.log, which either overwrites or appends to the + // already existing log. + // + // We are checking to see if the log directory exists, and find + // the latest restartCount by checking the log name - + // {restartCount}.log - and adding 1 to it. + logDir := BuildContainerLogsDirectory(pod.Namespace, pod.Name, pod.UID, container.Name) + restartCount, err = calcRestartCountByLogDir(logDir) + if err != nil { + klog.InfoS("Log directory exists but could not calculate restartCount", "logDir", logDir, "err", err) + } } target, err := spec.getTargetID(podStatus) diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_test.go b/pkg/kubelet/kuberuntime/kuberuntime_container_test.go index 51ad8c73c9a..1a1ae2b65e4 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_test.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_test.go @@ -18,6 +18,8 @@ package kuberuntime import ( "fmt" + "io/ioutil" + "os" "path/filepath" "regexp" "strings" @@ -422,3 +424,45 @@ func TestStartSpec(t *testing.T) { }) } } + +func TestRestartCountByLogDir(t *testing.T) { + for _, tc := range []struct { + filenames []string + restartCount int + }{ + { + filenames: []string{"0.log.rotated-log"}, + restartCount: 1, + }, + { + filenames: []string{"0.log"}, + restartCount: 1, + }, + { + filenames: []string{"0.log", "1.log", "2.log"}, + restartCount: 3, + }, + { + filenames: 
[]string{"0.log.rotated", "1.log", "2.log"}, + restartCount: 3, + }, + { + filenames: []string{"5.log.rotated", "6.log.rotated"}, + restartCount: 7, + }, + { + filenames: []string{"5.log.rotated", "6.log", "7.log"}, + restartCount: 8, + }, + } { + tempDirPath, err := ioutil.TempDir("", "test-restart-count-") + assert.NoError(t, err, "create tempdir error") + defer os.RemoveAll(tempDirPath) + for _, filename := range tc.filenames { + err = ioutil.WriteFile(filepath.Join(tempDirPath, filename), []byte("a log line"), 0600) + assert.NoError(t, err, "could not write log file") + } + count, _ := calcRestartCountByLogDir(tempDirPath) + assert.Equal(t, count, tc.restartCount, "count %v should equal restartCount %v", count, tc.restartCount) + } +}