test: Stop kubelet systemd service after node e2e

Currently, when running node e2e it's not possible to use the ginkgo `--repeat`
flag to run the test suite multiple times. This is useful when debugging tests
and ensuring they are not flaky by re-running them several times. If the
`--repeat` ginkgo flag is used, the 2nd run of the test will fail because the
kubelet does not start, with a message like:

```
Failed to start transient service unit: Unit kubelet-20221020T040841.service already exists.
```

This is because during the test startup, kubelet is started as a transient unit
file via `systemd-run`. The unit is started with the `--remain-after-exit` flag
to ensure that the unit will remain even if the kubelet is restarted. The test
suite currently uses the `systemctl kill` command to stop kubelet. This works fine for
stopping the kubelet, but on the second run, when `systemd-run` is used to start
the systemd unit again, it will fail because the unit already exists. This is because
`systemctl kill` will not delete the systemd unit; it only sends a SIGTERM signal to it.

To fix this, add `unitName` as a field to the `server` struct. When
kubelet server is constructed, set the unit name. As part of e2e test
termination, in `E2EServices.Stop()`, stop the kubelet systemd unit. By
stopping the kubelet systemd unit, systemd will delete the systemd
transient unit, allowing it to be created and started again in a
subsequent e2e run.

Signed-off-by: David Porter <david@porter.me>
This commit is contained in:
David Porter 2022-10-19 22:50:41 -07:00
parent f14ebac384
commit 048ed7ddc0
3 changed files with 25 additions and 5 deletions

View File

@ -198,6 +198,7 @@ func (e *E2EServices) startKubelet(featureGates map[string]bool) (*server, error
var killCommand, restartCommand *exec.Cmd
var isSystemd bool
var unitName string
// Apply default kubelet flags.
cmdArgs := []string{}
if systemdRun, err := exec.LookPath("systemd-run"); err == nil {
@ -226,7 +227,7 @@ func (e *E2EServices) startKubelet(featureGates map[string]bool) (*server, error
cwd, _ := os.Getwd()
// Use the timestamp from the current directory to name the systemd unit.
unitTimestamp := remote.GetTimestampFromWorkspaceDir(cwd)
unitName := fmt.Sprintf("kubelet-%s.service", unitTimestamp)
unitName = fmt.Sprintf("kubelet-%s.service", unitTimestamp)
cmdArgs = append(cmdArgs,
systemdRun,
"-p", "Delegate=true",
@ -299,7 +300,8 @@ func (e *E2EServices) startKubelet(featureGates map[string]bool) (*server, error
[]string{kubeletHealthCheckURL},
"kubelet.log",
e.monitorParent,
restartOnExit)
restartOnExit,
unitName)
return server, server.start()
}

View File

@ -64,11 +64,13 @@ type server struct {
stopRestartingCh chan<- bool
// Read from this to confirm that the restart loop has stopped.
ackStopRestartingCh <-chan bool
// The systemd unit name for the service if it exists. If server is not managed by systemd, field is empty.
systemdUnitName string
}
// newServer returns a new server with the given name, commands, health check
// URLs, etc.
func newServer(name string, start, kill, restart *exec.Cmd, urls []string, outputFileName string, monitorParent, restartOnExit bool) *server {
func newServer(name string, start, kill, restart *exec.Cmd, urls []string, outputFileName string, monitorParent, restartOnExit bool, systemdUnitName string) *server {
return &server{
name: name,
startCommand: start,
@ -78,6 +80,7 @@ func newServer(name string, start, kill, restart *exec.Cmd, urls []string, outpu
outFilename: outputFileName,
monitorParent: monitorParent,
restartOnExit: restartOnExit,
systemdUnitName: systemdUnitName,
}
}
@ -313,3 +316,14 @@ func (s *server) kill() error {
return fmt.Errorf("unable to stop %q", name)
}
// stopUnit stops the systemd unit backing this server via `systemctl stop`.
//
// Killing the unit's processes does not remove a transient unit, so a later
// attempt to start a unit with the same name fails with "already exists".
// Stopping the unit lets systemd delete it so it can be created again.
//
// It is a no-op returning nil when the server is not managed by systemd
// (systemdUnitName is empty). Otherwise it returns an error wrapping the
// systemctl failure together with the command's combined output.
func (s *server) stopUnit() error {
	if s.systemdUnitName == "" {
		// Not a systemd-managed server: nothing to stop, and nothing worth logging.
		return nil
	}

	klog.Infof("Stopping systemd unit for server %q with unit name: %q", s.name, s.systemdUnitName)
	// Capture the command output: the exit status alone rarely explains
	// why systemctl refused to stop the unit.
	if out, err := exec.Command("sudo", "systemctl", "stop", s.systemdUnitName).CombinedOutput(); err != nil {
		return fmt.Errorf("stopping systemd unit %q: %w (output: %q)", s.systemdUnitName, err, out)
	}
	return nil
}

View File

@ -95,7 +95,11 @@ func (e *E2EServices) Stop() {
}
if e.kubelet != nil {
if err := e.kubelet.kill(); err != nil {
klog.Errorf("Failed to stop kubelet: %v", err)
klog.Errorf("Failed to kill kubelet: %v", err)
}
// Stop the kubelet systemd unit which will delete the kubelet transient unit.
if err := e.kubelet.stopUnit(); err != nil {
klog.Errorf("Failed to stop kubelet systemd unit: %v", err)
}
}
for _, d := range e.rmDirs {
@ -134,7 +138,7 @@ func (e *E2EServices) startInternalServices() (*server, error) {
[]string{"--run-services-mode", fmt.Sprintf("--bearer-token=%s", framework.TestContext.BearerToken)},
os.Args[1:]...,
)...)
server := newServer("services", startCmd, nil, nil, getServicesHealthCheckURLs(), servicesLogFile, e.monitorParent, false)
server := newServer("services", startCmd, nil, nil, getServicesHealthCheckURLs(), servicesLogFile, e.monitorParent, false, "")
return server, server.start()
}