mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-21 10:51:29 +00:00
test: Stop kubelet systemd service after node e2e
Currently, when running node e2e it's not possible to use the ginkgo `--repeat` flag to run the test suite multiple times. This is useful when debugging tests and ensuring they are not flaky by re-running them several times.

When the `--repeat` ginkgo flag is used, the second run of the test suite fails because the kubelet does not start, with a message like:

```
Failed to start transient service unit: Unit kubelet-20221020T040841.service already exists.
```

This is because during the test startup, kubelet is started as a transient systemd unit via `systemd-run`. The unit is started with the `--remain-after-exit` flag to ensure that the unit will remain even if the kubelet is restarted. The test suite currently uses the `systemctl kill` command to stop kubelet. This works fine for stopping the kubelet, but on the second run, when `systemd-run` is used to start the systemd unit again, it fails because the unit already exists. This is because `systemctl kill` does not delete the systemd unit; it only sends the SIGTERM signal to it.

To fix this, add a `systemdUnitName` field to the `server` struct. When the kubelet server is constructed, set the unit name. As part of e2e test termination, in `E2EServices.Stop()`, stop the kubelet systemd unit. By stopping the kubelet systemd unit, systemd will delete the transient unit, allowing it to be created and started again in a subsequent e2e run.

Signed-off-by: David Porter <david@porter.me>
This commit is contained in:
parent
f14ebac384
commit
048ed7ddc0
@ -198,6 +198,7 @@ func (e *E2EServices) startKubelet(featureGates map[string]bool) (*server, error
|
||||
|
||||
var killCommand, restartCommand *exec.Cmd
|
||||
var isSystemd bool
|
||||
var unitName string
|
||||
// Apply default kubelet flags.
|
||||
cmdArgs := []string{}
|
||||
if systemdRun, err := exec.LookPath("systemd-run"); err == nil {
|
||||
@ -226,7 +227,7 @@ func (e *E2EServices) startKubelet(featureGates map[string]bool) (*server, error
|
||||
cwd, _ := os.Getwd()
|
||||
// Use the timestamp from the current directory to name the systemd unit.
|
||||
unitTimestamp := remote.GetTimestampFromWorkspaceDir(cwd)
|
||||
unitName := fmt.Sprintf("kubelet-%s.service", unitTimestamp)
|
||||
unitName = fmt.Sprintf("kubelet-%s.service", unitTimestamp)
|
||||
cmdArgs = append(cmdArgs,
|
||||
systemdRun,
|
||||
"-p", "Delegate=true",
|
||||
@ -299,7 +300,8 @@ func (e *E2EServices) startKubelet(featureGates map[string]bool) (*server, error
|
||||
[]string{kubeletHealthCheckURL},
|
||||
"kubelet.log",
|
||||
e.monitorParent,
|
||||
restartOnExit)
|
||||
restartOnExit,
|
||||
unitName)
|
||||
return server, server.start()
|
||||
}
|
||||
|
||||
|
@ -64,11 +64,13 @@ type server struct {
|
||||
stopRestartingCh chan<- bool
|
||||
// Read from this to confirm that the restart loop has stopped.
|
||||
ackStopRestartingCh <-chan bool
|
||||
// The systemd unit name for the service if it exists. If server is not managed by systemd, field is empty.
|
||||
systemdUnitName string
|
||||
}
|
||||
|
||||
// newServer returns a new server with the given name, commands, health check
|
||||
// URLs, etc.
|
||||
func newServer(name string, start, kill, restart *exec.Cmd, urls []string, outputFileName string, monitorParent, restartOnExit bool) *server {
|
||||
func newServer(name string, start, kill, restart *exec.Cmd, urls []string, outputFileName string, monitorParent, restartOnExit bool, systemdUnitName string) *server {
|
||||
return &server{
|
||||
name: name,
|
||||
startCommand: start,
|
||||
@ -78,6 +80,7 @@ func newServer(name string, start, kill, restart *exec.Cmd, urls []string, outpu
|
||||
outFilename: outputFileName,
|
||||
monitorParent: monitorParent,
|
||||
restartOnExit: restartOnExit,
|
||||
systemdUnitName: systemdUnitName,
|
||||
}
|
||||
}
|
||||
|
||||
@ -313,3 +316,14 @@ func (s *server) kill() error {
|
||||
|
||||
return fmt.Errorf("unable to stop %q", name)
|
||||
}
|
||||
|
||||
func (s *server) stopUnit() error {
|
||||
klog.Infof("Stopping systemd unit for server %q with unit name: %q", s.name, s.systemdUnitName)
|
||||
if s.systemdUnitName != "" {
|
||||
err := exec.Command("sudo", "systemctl", "stop", s.systemdUnitName).Run()
|
||||
if err != nil {
|
||||
return fmt.Errorf("Failed to stop systemd unit name: %q: %v", s.systemdUnitName, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
@ -95,7 +95,11 @@ func (e *E2EServices) Stop() {
|
||||
}
|
||||
if e.kubelet != nil {
|
||||
if err := e.kubelet.kill(); err != nil {
|
||||
klog.Errorf("Failed to stop kubelet: %v", err)
|
||||
klog.Errorf("Failed to kill kubelet: %v", err)
|
||||
}
|
||||
// Stop the kubelet systemd unit which will delete the kubelet transient unit.
|
||||
if err := e.kubelet.stopUnit(); err != nil {
|
||||
klog.Errorf("Failed to stop kubelet systemd unit: %v", err)
|
||||
}
|
||||
}
|
||||
for _, d := range e.rmDirs {
|
||||
@ -134,7 +138,7 @@ func (e *E2EServices) startInternalServices() (*server, error) {
|
||||
[]string{"--run-services-mode", fmt.Sprintf("--bearer-token=%s", framework.TestContext.BearerToken)},
|
||||
os.Args[1:]...,
|
||||
)...)
|
||||
server := newServer("services", startCmd, nil, nil, getServicesHealthCheckURLs(), servicesLogFile, e.monitorParent, false)
|
||||
server := newServer("services", startCmd, nil, nil, getServicesHealthCheckURLs(), servicesLogFile, e.monitorParent, false, "")
|
||||
return server, server.start()
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user