Merge pull request #31093 from Random-Liu/containerize-node-e2e-test

Automatic merge from submit-queue

Node Conformance Test: Containerize the node e2e test

For #30122, #30174.
Based on #32427, #32454.

**Please only review the last 3 commits.**

This PR packages the node e2e test into a Docker image:
- 1st commit: Add a `NodeConformance` flag to the node e2e framework to avoid starting kubelet and collecting system logs. We do this because:
  - There are all kinds of ways to manage kubelet and system logs; for each setup we would need to mount different things into the container and run different commands. It is hard and unnecessary to handle that complexity inside the test suite.
- 2nd commit: Remove all `sudo` calls from the test so that it can run inside the container. We do this because:
  - Most container images do not ship a `sudo` command, and there is no need to use `sudo` inside the container.
  - Using `sudo` inside the test introduces extra complexity (https://github.com/kubernetes/kubernetes/issues/29211, https://github.com/kubernetes/kubernetes/issues/26748). In fact we only need to run the test suite itself with `sudo`.
- 3rd commit: Package the test into a Docker image with a corresponding `Makefile` and `Dockerfile`. We also added a `run_test.sh` script that starts kubelet and runs the test container; a usage sketch follows this list. The script is only for demonstration purposes, and we will also use it in our node e2e framework. In the future, we should update the script to start kubelet in a production-like way (perhaps with `systemd` or `supervisord`).
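
For reference, here is a minimal sketch of how the released test container is meant to be invoked, following the mount conventions used by `run_test.sh` in this PR. The manifest and result paths below are illustrative examples, not values shipped by the PR; they must match the paths your kubelet actually uses:

```sh
# Assumes kubelet is already running on the node.
# Mounts expected by the test container:
#   /             -> /rootfs        (host root filesystem, used by system validation)
#   manifest dir  -> /etc/manifest  (kubelet static pod manifest path)
#   results dir   -> /var/result    (test reports and logs)
sudo docker run -it --rm --privileged=true --net=host \
  -v /:/rootfs \
  -v /etc/kubernetes/manifests:/etc/manifest \
  -v /tmp/node-test-results:/var/result \
  gcr.io/google_containers/node-test-amd64:0.1
```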

@dchen1107 @vishh 
/cc @kubernetes/sig-node @kubernetes/sig-testing



**Release note**:


``` release-note
Release alpha version of the node test container gcr.io/google_containers/node-test-ARCH:0.1 for users to verify their node setup.
```
This commit is contained in:
Kubernetes Submit Queue 2016-11-07 23:41:25 -08:00 committed by GitHub
commit 0df6384770
12 changed files with 390 additions and 88 deletions

View File

@ -102,6 +102,8 @@ type TestContextType struct {
type NodeTestContextType struct {
// Name of the node to run tests on (node e2e suite only).
NodeName string
// NodeConformance indicates whether the test is running in node conformance mode.
NodeConformance bool
// DisableKubenet disables kubenet when starting kubelet.
DisableKubenet bool
// Whether to enable the QoS Cgroup Hierarchy or not
@ -209,6 +211,13 @@ func RegisterClusterFlags() {
// Register flags specific to the node e2e test suite.
func RegisterNodeFlags() {
flag.StringVar(&TestContext.NodeName, "node-name", "", "Name of the node to run tests on (node e2e suite only).")
// TODO(random-liu): Move kubelet start logic out of the test.
// TODO(random-liu): Move log fetch logic out of the test.
// There are different ways to start kubelet (systemd, initd, docker, rkt, manually started etc.)
// and manage logs (journald, upstart etc.).
// For different situations we need to mount different things into the container and run different commands.
// It is hard and unnecessary to deal with the complexity inside the test suite.
flag.BoolVar(&TestContext.NodeConformance, "conformance", false, "If true, the test suite will not start kubelet and will not fetch system logs (kernel, docker, kubelet logs, etc.) to the report directory.")
// TODO(random-liu): Remove kubelet related flags when we move the kubelet start logic out of the test.
// TODO(random-liu): Find some way to get the kubelet configuration, and automatically configure and filter tests based on it.
flag.BoolVar(&TestContext.DisableKubenet, "disable-kubenet", false, "If true, start kubelet without kubenet. (default false)")

View File

@ -119,6 +119,7 @@ func loadTestProfiles() error {
return fmt.Errorf("failed to write profiles to file: %v", err)
}
// TODO(random-liu): The test is run as root now, no need to use sudo here.
cmd := exec.Command("sudo", "apparmor_parser", "-r", "-W", f.Name())
stderr := &bytes.Buffer{}
cmd.Stderr = stderr

View File

@ -0,0 +1,44 @@
# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM BASEIMAGE
COPY ginkgo /usr/local/bin/
COPY e2e_node.test /usr/local/bin
# The following environment variables can be overridden when starting the container.
# FOCUS is the regex matching the tests to run. By default, run all conformance tests.
# SKIP is the regex matching the tests to skip. By default, skip flaky and serial tests.
# PARALLELISM is the number of processes the test will run in parallel.
# REPORT_PATH is the path in the container to save test results and logs.
# MANIFEST_PATH is the kubelet manifest path in the container.
# FLAKE_ATTEMPTS is the number of times to retry a failed test. By default 2.
# TEST_ARGS is the extra test arguments passed into the test.
ENV FOCUS="\[Conformance\]" \
SKIP="\[Flaky\]|\[Serial\]" \
PARALLELISM=8 \
REPORT_PATH="/var/result" \
MANIFEST_PATH="/etc/manifest" \
FLAKE_ATTEMPTS=2 \
TEST_ARGS=""
ENTRYPOINT ginkgo --focus="$FOCUS" \
--skip="$SKIP" \
--nodes=$PARALLELISM \
--flakeAttempts=$FLAKE_ATTEMPTS \
/usr/local/bin/e2e_node.test \
-- --conformance=true \
--prepull-images=false \
--manifest-path="$MANIFEST_PATH" \
--report-dir="$REPORT_PATH" \
$TEST_ARGS
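
With the default environment values above (and TEST_ARGS empty), the ENTRYPOINT expands roughly to the following command. This is only a sketch showing how the variables feed into ginkgo; exact quoting is handled by the shell form of ENTRYPOINT:

```sh
ginkgo --focus="\[Conformance\]" \
  --skip="\[Flaky\]|\[Serial\]" \
  --nodes=8 \
  --flakeAttempts=2 \
  /usr/local/bin/e2e_node.test \
  -- --conformance=true \
  --prepull-images=false \
  --manifest-path="/etc/manifest" \
  --report-dir="/var/result"
```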

View File

@ -0,0 +1,60 @@
# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Build the node-test image.
#
# Usage:
# [ARCH=amd64] [REGISTRY="gcr.io/google_containers"] [BIN_DIR="../../../../_output/bin"] make (build|push) VERSION={some_version_number e.g. 0.1}
# TODO(random-liu): Add this into the release process.
REGISTRY?=gcr.io/google_containers
ARCH?=amd64
# BIN_DIR is the directory to find binaries, overwrite with ../../../../_output/bin
# for local development.
BIN_DIR?=../../../../_output/dockerized/bin/linux/${ARCH}
TEMP_DIR:=$(shell mktemp -d)
BASEIMAGE_amd64=debian:jessie
BASEIMAGE_arm=armel/debian:jessie
BASEIMAGE_arm64=aarch64/debian:jessie
BASEIMAGE_ppc64le=ppc64le/debian:jessie
BASEIMAGE?=${BASEIMAGE_${ARCH}}
all: build
build:
ifndef VERSION
$(error VERSION is undefined)
endif
cp -r ./* ${TEMP_DIR}
cp ${BIN_DIR}/ginkgo ${TEMP_DIR}
cp ${BIN_DIR}/e2e_node.test ${TEMP_DIR}
cd ${TEMP_DIR} && sed -i.back "s|BASEIMAGE|${BASEIMAGE}|g" Dockerfile
# Make scripts executable before they are copied into the Docker image. If we make them executable later, in another layer
# they'll take up twice the space because the new executable binary differs from the old one, but everything is cached in layers.
cd ${TEMP_DIR} && chmod a+rx \
e2e_node.test \
ginkgo
docker build -t ${REGISTRY}/node-test-${ARCH}:${VERSION} ${TEMP_DIR}
push: build
gcloud docker push ${REGISTRY}/node-test-${ARCH}:${VERSION}
.PHONY: all
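
Usage sketch, assuming the defaults above (the custom registry name is only an example):

```sh
# Build the amd64 image from locally built binaries.
ARCH=amd64 BIN_DIR=../../../../_output/bin make build VERSION=0.1

# Build and push to a custom registry.
REGISTRY=gcr.io/my-project make push VERSION=0.1
```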

View File

@ -0,0 +1,174 @@
#!/bin/bash
# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script is only for demonstrating how to use the node test container. In a
# production environment, kubelet bootstrap will be more complicated, and users
# should configure the node test container accordingly.
# In addition, this script will also be used in the node e2e test to let it use
# the containerized test suite.
# TODO(random-liu): Use a standard installer to install kubelet.
# TODO(random-liu): Use a standard tool to start kubelet in a production way (such
# as systemd, supervisord etc.)
# TODO(random-liu): Initialize kubelet with a standard configmap after dynamic
# configuration lands, so that all tests can get the current kubelet
# configuration and react accordingly.
# Refresh sudo credentials if not running on GCE.
if ! ping -c 1 -q metadata.google.internal &> /dev/null; then
sudo -v || exit 1
fi
# FOCUS is the ginkgo focus used to select which tests to run. By default, FOCUS
# is initialized as "\[Conformance\]" in the test container to run all
# conformance tests.
FOCUS=${FOCUS:-""}
# SKIP is the ginkgo skip used to select which tests to skip. By default, SKIP is
# initialized as "\[Flaky\]|\[Serial\]" in the test container, skipping all
# flaky and serial tests.
SKIP=${SKIP:-""}
# REGISTRY is the image registry for node test image.
REGISTRY=${REGISTRY:-"gcr.io/google_containers"}
# ARCH is the architecture of current machine, the script will use this to
# select corresponding test container image.
ARCH=${ARCH:-"amd64"}
# VERSION is the version of the test container image.
VERSION=${VERSION:-"0.1"}
# KUBELET_BIN is the kubelet binary name. If it is not specified, use the
# default binary name "kubelet".
KUBELET_BIN=${KUBELET_BIN:-"kubelet"}
# KUBELET is the kubelet binary path. If it is not specified, assume kubelet is
# in PATH.
KUBELET=${KUBELET:-"`which $KUBELET_BIN`"}
# LOG_DIR is the absolute path of the directory where the test will collect all
# logs to. By default, use the current directory.
LOG_DIR=${LOG_DIR:-`pwd`}
mkdir -p $LOG_DIR
# NETWORK_PLUGIN is the network plugin used by kubelet. Do not use network
# plugin by default.
NETWORK_PLUGIN=${NETWORK_PLUGIN:-""}
# NETWORK_PLUGIN_PATH is the path to network plugin binary.
NETWORK_PLUGIN_PATH=${NETWORK_PLUGIN_PATH:-""}
# start_kubelet starts kubelet and redirects the kubelet log to $LOG_DIR/kubelet.log.
kubelet_log=kubelet.log
start_kubelet() {
echo "Starting kubelet..."
sudo -b $KUBELET $@ &>$LOG_DIR/$kubelet_log
if [ $? -ne 0 ]; then
echo "Failed to start kubelet"
exit 1
fi
}
# wait_kubelet retries up to 10 times for kubelet to become ready by checking http://127.0.0.1:10255/healthz.
wait_kubelet() {
echo "Health checking kubelet..."
healthCheckURL=http://127.0.0.1:10255/healthz
local maxRetry=10
local cur=1
while [ $cur -le $maxRetry ]; do
curl -s $healthCheckURL > /dev/null
if [ $? -eq 0 ]; then
echo "Kubelet is ready"
break
fi
if [ $cur -eq $maxRetry ]; then
echo "Health check exceeds max retry"
exit 1
fi
echo "Kubelet is not ready"
sleep 1
((cur++))
done
}
# kill_kubelet kills kubelet.
kill_kubelet() {
echo "Stopping kubelet..."
sudo pkill $KUBELET_BIN
if [ $? -ne 0 ]; then
echo "Failed to stop kubelet."
exit 1
fi
}
# run_test runs the node test container.
run_test() {
env=""
if [ ! -z "$FOCUS" ]; then
env="$env -e FOCUS=$FOCUS"
fi
if [ ! -z "$SKIP" ]; then
env="$env -e SKIP=$SKIP"
fi
# The test assumes that inside the container:
# * kubelet manifest path is mounted to /etc/manifest;
# * log collect directory is mounted to /var/result;
# * root file system is mounted to /rootfs.
sudo docker run -it --rm --privileged=true --net=host -v /:/rootfs \
-v $config_dir:/etc/manifest -v $LOG_DIR:/var/result $env $REGISTRY/node-test-$ARCH:$VERSION
}
# Check whether kubelet is running. If kubelet is running, tell the user to stop
# it before running the test.
pid=`pidof $KUBELET_BIN`
if [ ! -z "$pid" ]; then
echo "Kubelet is running (pid=$pid), please stop it before running the test."
exit 1
fi
apiserver=http://localhost:8080
volume_stats_agg_period=10s
allow_privileged=true
serialize_image_pulls=false
config_dir=`mktemp -d`
file_check_frequency=10s
pod_cidr=10.180.0.0/24
log_level=4
start_kubelet --api-servers $apiserver \
--volume-stats-agg-period $volume_stats_agg_period \
--allow-privileged=$allow_privileged \
--serialize-image-pulls=$serialize_image_pulls \
--config $config_dir \
--file-check-frequency $file_check_frequency \
--pod-cidr=$pod_cidr \
--runtime-cgroups=/docker-daemon \
--kubelet-cgroups=/kubelet \
--system-cgroups=/system \
--cgroup-root=/ \
--network-plugin=$NETWORK_PLUGIN \
--network-plugin-dir=$NETWORK_PLUGIN_PATH \
--v=$log_level \
--logtostderr
wait_kubelet
run_test
kill_kubelet
# Clean up the kubelet config directory
sudo rm -rf $config_dir
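
An illustrative way to drive this script with the environment variables it reads (all values below are examples, not defaults from this PR):

```sh
# Run only conformance tests matching "MirrorPod", collect logs and results
# under /tmp/node-test, and use a locally built kubelet binary.
FOCUS="\[Conformance\].*MirrorPod" \
LOG_DIR=/tmp/node-test \
KUBELET=/usr/local/bin/kubelet \
./run_test.sh
```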

View File

@ -28,6 +28,7 @@ import (
"os"
"os/exec"
"path"
"syscall"
"testing"
"time"
@ -71,6 +72,10 @@ func TestMain(m *testing.M) {
os.Exit(m.Run())
}
// When running the containerized conformance test, we'll mount the
// host root filesystem as readonly to /rootfs.
const rootfs = "/rootfs"
func TestE2eNode(t *testing.T) {
if *runServicesMode {
// If run-services-mode is specified, only run services in current process.
@ -79,6 +84,15 @@ func TestE2eNode(t *testing.T) {
}
if *systemValidateMode {
// If system-validate-mode is specified, only run system validation in current process.
if framework.TestContext.NodeConformance {
// Chroot to /rootfs so that system validation can check the system
// as it appears from the host's root filesystem.
// TODO(random-liu): Consider chrooting the whole test process to make
// writing tests easier.
if err := syscall.Chroot(rootfs); err != nil {
glog.Exitf("chroot %q failed: %v", rootfs, err)
}
}
if err := system.Validate(); err != nil {
glog.Exitf("system validation failed: %v", err)
}
@ -172,12 +186,12 @@ func validateSystem() error {
if err != nil {
return fmt.Errorf("can't get current binary: %v", err)
}
// TODO(random-liu): Remove sudo in containerize PR.
output, err := exec.Command("sudo", testBin, "--system-validate-mode").CombinedOutput()
// Pass all flags into the child process, so that it will see the same flag set.
output, err := exec.Command(testBin, append([]string{"--system-validate-mode"}, os.Args[1:]...)...).CombinedOutput()
// The output of system validation has already been formatted; print it directly here.
fmt.Print(string(output))
if err != nil {
return fmt.Errorf("system validation failed")
return fmt.Errorf("system validation failed: %v", err)
}
return nil
}
@ -190,7 +204,7 @@ func maskLocksmithdOnCoreos() {
return
}
if bytes.Contains(data, []byte("ID=coreos")) {
output, err := exec.Command("sudo", "systemctl", "mask", "--now", "locksmithd").CombinedOutput()
output, err := exec.Command("systemctl", "mask", "--now", "locksmithd").CombinedOutput()
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("should be able to mask locksmithd - output: %q", string(output)))
glog.Infof("Locksmithd is masked successfully")
}

View File

@ -161,7 +161,7 @@ func RunRemote(archive string, host string, cleanup bool, junitFilePrefix string
if err != nil {
return "", false, fmt.Errorf("could not find username: %v", err)
}
output, err := RunSshCommand("ssh", GetHostnameOrIp(host), "--", "sudo", "usermod", "-a", "-G", "docker", uname.Username)
output, err := SSH(host, "usermod", "-a", "-G", "docker", uname.Username)
if err != nil {
return "", false, fmt.Errorf("instance %s not running docker daemon - Command failed: %s", host, output)
}
@ -172,14 +172,15 @@ func RunRemote(archive string, host string, cleanup bool, junitFilePrefix string
dirName := fmt.Sprintf("gcloud-e2e-%d", rand.Int31())
tmp := fmt.Sprintf("/tmp/%s", dirName)
_, err := RunSshCommand("ssh", GetHostnameOrIp(host), "--", "mkdir", tmp)
// Do not sudo here, so that we can use scp to copy the test archive to the directory.
_, err := SSHNoSudo(host, "mkdir", tmp)
if err != nil {
// Exit failure with the error
return "", false, err
}
if cleanup {
defer func() {
output, err := RunSshCommand("ssh", GetHostnameOrIp(host), "--", "rm", "-rf", tmp)
output, err := SSH(host, "rm", "-rf", tmp)
if err != nil {
glog.Errorf("failed to cleanup tmp directory %s on host %v. Output:\n%s", tmp, err, output)
}
@ -188,57 +189,62 @@ func RunRemote(archive string, host string, cleanup bool, junitFilePrefix string
// Install the cni plugin.
cniPath := filepath.Join(tmp, CNIDirectory)
if _, err := RunSshCommand("ssh", GetHostnameOrIp(host), "--", "sh", "-c",
getSshCommand(" ; ", fmt.Sprintf("sudo mkdir -p %s", cniPath),
fmt.Sprintf("sudo wget -O - %s | sudo tar -xz -C %s", CNIURL, cniPath))); err != nil {
cmd := getSSHCommand(" ; ",
fmt.Sprintf("mkdir -p %s", cniPath),
fmt.Sprintf("wget -O - %s | tar -xz -C %s", CNIURL, cniPath),
)
if _, err := SSH(host, "sh", "-c", cmd); err != nil {
// Exit failure with the error
return "", false, err
}
// Configure iptables firewall rules
// TODO: consider calling bootstrap script to configure host based on OS
cmd := getSshCommand("&&",
cmd = getSSHCommand("&&",
`iptables -L INPUT | grep "Chain INPUT (policy DROP)"`,
"(iptables -C INPUT -w -p TCP -j ACCEPT || iptables -A INPUT -w -p TCP -j ACCEPT)",
"(iptables -C INPUT -w -p UDP -j ACCEPT || iptables -A INPUT -w -p UDP -j ACCEPT)",
"(iptables -C INPUT -w -p ICMP -j ACCEPT || iptables -A INPUT -w -p ICMP -j ACCEPT)")
output, err := RunSshCommand("ssh", GetHostnameOrIp(host), "--", "sudo", "sh", "-c", cmd)
output, err := SSH(host, "sh", "-c", cmd)
if err != nil {
glog.Errorf("Failed to configured firewall: %v output: %v", err, output)
}
cmd = getSshCommand("&&",
cmd = getSSHCommand("&&",
`iptables -L FORWARD | grep "Chain FORWARD (policy DROP)" > /dev/null`,
"(iptables -C FORWARD -w -p TCP -j ACCEPT || iptables -A FORWARD -w -p TCP -j ACCEPT)",
"(iptables -C FORWARD -w -p UDP -j ACCEPT || iptables -A FORWARD -w -p UDP -j ACCEPT)",
"(iptables -C FORWARD -w -p ICMP -j ACCEPT || iptables -A FORWARD -w -p ICMP -j ACCEPT)")
output, err = RunSshCommand("ssh", GetHostnameOrIp(host), "--", "sudo", "sh", "-c", cmd)
output, err = SSH(host, "sh", "-c", cmd)
if err != nil {
glog.Errorf("Failed to configured firewall: %v output: %v", err, output)
}
// Copy the archive to the staging directory
_, err = RunSshCommand("scp", archive, fmt.Sprintf("%s:%s/", GetHostnameOrIp(host), tmp))
_, err = runSSHCommand("scp", archive, fmt.Sprintf("%s:%s/", GetHostnameOrIp(host), tmp))
if err != nil {
// Exit failure with the error
return "", false, err
}
// Kill any running node processes
cmd = getSshCommand(" ; ",
"sudo pkill kubelet",
"sudo pkill kube-apiserver",
"sudo pkill etcd",
cmd = getSSHCommand(" ; ",
"pkill kubelet",
"pkill kube-apiserver",
"pkill etcd",
)
// No need to log an error if pkill fails since pkill will fail if the commands are not running.
// If we are unable to stop existing running k8s processes, we should see messages in the kubelet/apiserver/etcd
// logs about failing to bind the required ports.
glog.Infof("Killing any existing node processes on %s", host)
RunSshCommand("ssh", GetHostnameOrIp(host), "--", "sh", "-c", cmd)
SSH(host, "sh", "-c", cmd)
// Extract the archive
cmd = getSshCommand(" && ", fmt.Sprintf("cd %s", tmp), fmt.Sprintf("tar -xzvf ./%s", archiveName))
cmd = getSSHCommand(" && ",
fmt.Sprintf("cd %s", tmp),
fmt.Sprintf("tar -xzvf ./%s", archiveName),
)
glog.Infof("Extracting tar on %s", host)
output, err = RunSshCommand("ssh", GetHostnameOrIp(host), "--", "sh", "-c", cmd)
output, err = SSH(host, "sh", "-c", cmd)
if err != nil {
// Exit failure with the error
return "", false, err
@ -261,7 +267,7 @@ func RunRemote(archive string, host string, cleanup bool, junitFilePrefix string
}
// Determine if tests will run on a GCI node.
output, err = RunSshCommand("ssh", GetHostnameOrIp(host), "--", "sh", "-c", "'cat /etc/os-release'")
output, err = SSH(host, "sh", "-c", "'cat /etc/os-release'")
if err != nil {
glog.Errorf("Issue detecting node's OS via node's /etc/os-release. Err: %v, Output:\n%s", err, output)
return "", false, fmt.Errorf("Issue detecting node's OS via node's /etc/os-release. Err: %v, Output:\n%s", err, output)
@ -270,7 +276,7 @@ func RunRemote(archive string, host string, cleanup bool, junitFilePrefix string
// Note this implicitly requires the script to be where we expect in the tarball, so if that location changes the error
// here will tell us to update the remote test runner.
mounterPath := filepath.Join(tmp, "cluster/gce/gci/mounter/mounter")
output, err = RunSshCommand("ssh", GetHostnameOrIp(host), "--", "sh", "-c", fmt.Sprintf("'chmod 544 %s'", mounterPath))
output, err = SSH(host, "sh", "-c", fmt.Sprintf("'chmod 544 %s'", mounterPath))
if err != nil {
glog.Errorf("Unable to chmod 544 GCI mounter script. Err: %v, Output:\n%s", err, output)
return "", false, err
@ -284,7 +290,7 @@ func RunRemote(archive string, host string, cleanup bool, junitFilePrefix string
}
// Run the tests
cmd = getSshCommand(" && ",
cmd = getSSHCommand(" && ",
fmt.Sprintf("cd %s", tmp),
fmt.Sprintf("timeout -k 30s %fs ./ginkgo %s ./e2e_node.test -- --logtostderr --v 4 --node-name=%s --report-dir=%s/results --report-prefix=%s %s",
testTimeoutSeconds.Seconds(), ginkgoFlags, host, tmp, junitFilePrefix, testArgs),
@ -292,7 +298,7 @@ func RunRemote(archive string, host string, cleanup bool, junitFilePrefix string
aggErrs := []error{}
glog.Infof("Starting tests on %s", host)
output, err = RunSshCommand("ssh", GetHostnameOrIp(host), "--", "sh", "-c", cmd)
output, err = SSH(host, "sh", "-c", cmd)
if err != nil {
aggErrs = append(aggErrs, err)
@ -313,10 +319,10 @@ func RunRemote(archive string, host string, cleanup bool, junitFilePrefix string
// Try getting the system logs from journald and store it to a file.
// Don't reuse the original test directory on the remote host because
// it could have been removed if the node was rebooted.
_, err := RunSshCommand("ssh", GetHostnameOrIp(host), "--", "sh", "-c", fmt.Sprintf("'sudo journalctl --system --all > %s'", logPath))
_, err := SSH(host, "sh", "-c", fmt.Sprintf("'journalctl --system --all > %s'", logPath))
if err == nil {
glog.Infof("Got the system logs from journald; copying it back...")
if _, err := RunSshCommand("scp", fmt.Sprintf("%s:%s", GetHostnameOrIp(host), logPath), destPath); err != nil {
if _, err := runSSHCommand("scp", fmt.Sprintf("%s:%s", GetHostnameOrIp(host), logPath), destPath); err != nil {
glog.Infof("Failed to copy the log: err: %v", err)
}
} else {
@ -334,26 +340,38 @@ func RunRemote(archive string, host string, cleanup bool, junitFilePrefix string
}
func getTestArtifacts(host, testDir string) error {
_, err := RunSshCommand("scp", "-r", fmt.Sprintf("%s:%s/results/", GetHostnameOrIp(host), testDir), fmt.Sprintf("%s/%s", *resultsDir, host))
_, err := runSSHCommand("scp", "-r", fmt.Sprintf("%s:%s/results/", GetHostnameOrIp(host), testDir), fmt.Sprintf("%s/%s", *resultsDir, host))
if err != nil {
return err
}
// Copy junit to the top of artifacts
_, err = RunSshCommand("scp", fmt.Sprintf("%s:%s/results/junit*", GetHostnameOrIp(host), testDir), fmt.Sprintf("%s/", *resultsDir))
_, err = runSSHCommand("scp", fmt.Sprintf("%s:%s/results/junit*", GetHostnameOrIp(host), testDir), fmt.Sprintf("%s/", *resultsDir))
if err != nil {
return err
}
return nil
}
// getSshCommand handles proper quoting so that multiple commands are executed in the same shell over ssh
func getSshCommand(sep string, args ...string) string {
// getSSHCommand handles proper quoting so that multiple commands are executed in the same shell over ssh
func getSSHCommand(sep string, args ...string) string {
return fmt.Sprintf("'%s'", strings.Join(args, sep))
}
// runSshCommand executes the ssh or scp command, adding the flag provided --ssh-options
func RunSshCommand(cmd string, args ...string) (string, error) {
// SSH executes an ssh command via runSSHCommand as root. The `sudo` makes sure that all commands
// are executed by root, so that there won't be a permission mismatch between different commands.
func SSH(host string, cmd ...string) (string, error) {
return runSSHCommand("ssh", append([]string{GetHostnameOrIp(host), "--", "sudo"}, cmd...)...)
}
// SSHNoSudo executes an ssh command via runSSHCommand as a normal user. Sometimes we need this,
// for example when creating a directory into which we will later copy files with scp.
func SSHNoSudo(host string, cmd ...string) (string, error) {
return runSSHCommand("ssh", append([]string{GetHostnameOrIp(host), "--"}, cmd...)...)
}
// runSSHCommand executes the ssh or scp command, adding the flag provided --ssh-options
func runSSHCommand(cmd string, args ...string) (string, error) {
if env, found := sshOptionsMap[*sshEnv]; found {
args = append(strings.Split(env, " "), args...)
}

View File

@ -56,7 +56,7 @@ func main() {
func runCommand(name string, args ...string) error {
glog.Infof("Running command: %v %v", name, strings.Join(args, " "))
cmd := exec.Command("sh", "-c", strings.Join(append([]string{name}, args...), " "))
cmd := exec.Command("sudo", "sh", "-c", strings.Join(append([]string{name}, args...), " "))
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
return cmd.Run()

View File

@ -507,7 +507,7 @@ func createInstance(imageConfig *internalGCEImage) (string, error) {
remote.AddHostnameIp(name, externalIp)
}
var output string
output, err = remote.RunSshCommand("ssh", remote.GetHostnameOrIp(name), "--", "sudo", "docker", "version")
output, err = remote.SSH(name, "docker", "version")
if err != nil {
err = fmt.Errorf("instance %s not running docker daemon - Command failed: %s", name, output)
continue

View File

@ -44,7 +44,7 @@ type server struct {
// startCommand is the command used to start the server
startCommand *exec.Cmd
// killCommand is the command used to stop the server. It is not required. If it
// is not specified, `sudo kill` will be used to stop the server.
// is not specified, `kill` will be used to stop the server.
killCommand *exec.Cmd
// restartCommand is the command used to restart the server. If provided, it will be used
// instead of startCommand when restarting the server.
@ -338,19 +338,7 @@ func (s *server) kill() error {
const timeout = 10 * time.Second
for _, signal := range []string{"-TERM", "-KILL"} {
glog.V(2).Infof("Killing process %d (%s) with %s", pid, name, signal)
cmd := exec.Command("sudo", "kill", signal, strconv.Itoa(pid))
// Run the 'kill' command in a separate process group so sudo doesn't ignore it
attrs := &syscall.SysProcAttr{}
// Hack to set unix-only field without build tags.
setpgidField := reflect.ValueOf(attrs).Elem().FieldByName("Setpgid")
if setpgidField.IsValid() {
setpgidField.Set(reflect.ValueOf(true))
} else {
return fmt.Errorf("Failed to set Setpgid field (non-unix build)")
}
cmd.SysProcAttr = attrs
cmd := exec.Command("kill", signal, strconv.Itoa(pid))
_, err := cmd.Output()
if err != nil {
glog.Errorf("Error signaling process %d (%s) with %s: %v", pid, name, signal, err)

View File

@ -78,16 +78,18 @@ func NewE2EServices(monitorParent bool) *E2EServices {
// standard kubelet launcher)
func (e *E2EServices) Start() error {
var err error
// Start kubelet
// Create the manifest path for kubelet.
// TODO(random-liu): Remove related logic when we move kubelet starting logic out of the test.
framework.TestContext.ManifestPath, err = ioutil.TempDir("", "node-e2e-pod")
if err != nil {
return fmt.Errorf("failed to create static pod manifest directory: %v", err)
}
e.kubelet, err = e.startKubelet()
if err != nil {
return fmt.Errorf("failed to start kubelet: %v", err)
if !framework.TestContext.NodeConformance {
// Start kubelet
// Create the manifest path for kubelet.
// TODO(random-liu): Remove related logic when we move kubelet starting logic out of the test.
framework.TestContext.ManifestPath, err = ioutil.TempDir("", "node-e2e-pod")
if err != nil {
return fmt.Errorf("failed to create static pod manifest directory: %v", err)
}
e.kubelet, err = e.startKubelet()
if err != nil {
return fmt.Errorf("failed to start kubelet: %v", err)
}
}
e.services, err = e.startInternalServices()
return err
@ -96,14 +98,16 @@ func (e *E2EServices) Start() error {
// Stop stops the e2e services.
func (e *E2EServices) Stop() {
defer func() {
// Collect log files.
e.getLogFiles()
// Cleanup the manifest path for kubelet.
manifestPath := framework.TestContext.ManifestPath
if manifestPath != "" {
err := os.RemoveAll(manifestPath)
if err != nil {
glog.Errorf("Failed to delete static pod manifest directory %s: %v", manifestPath, err)
if !framework.TestContext.NodeConformance {
// Collect log files.
e.getLogFiles()
// Cleanup the manifest path for kubelet.
manifestPath := framework.TestContext.ManifestPath
if manifestPath != "" {
err := os.RemoveAll(manifestPath)
if err != nil {
glog.Errorf("Failed to delete static pod manifest directory %s: %v", manifestPath, err)
}
}
}
}()
@ -144,17 +148,8 @@ func (e *E2EServices) startInternalServices() (*server, error) {
if err != nil {
return nil, fmt.Errorf("can't get current binary: %v", err)
}
startCmd := exec.Command("sudo", testBin,
// TODO(mtaufen): Flags e.g. that target the TestContext need to be manually forwarded to the
// test binary when we start it up in run-services mode. This is not ideal.
// Very unintuitive because it prevents any flags NOT manually forwarded here
// from being set via TEST_ARGS when running tests from the command line.
"--run-services-mode",
"--server-start-timeout", serverStartTimeout.String(),
"--feature-gates", framework.TestContext.FeatureGates,
"--logtostderr",
"--vmodule=*="+LOG_VERBOSITY_LEVEL,
)
// Pass all flags into the child process, so that it will see the same flag set.
startCmd := exec.Command(testBin, append([]string{"--run-services-mode"}, os.Args[1:]...)...)
server := newServer("services", startCmd, nil, nil, getServicesHealthCheckURLs(), servicesLogFile, e.monitorParent, false)
return server, server.start()
}
@ -180,8 +175,8 @@ func (e *E2EServices) startKubelet() (*server, error) {
// sense to test it that way
unitName := fmt.Sprintf("kubelet-%d.service", rand.Int31())
cmdArgs = append(cmdArgs, systemdRun, "--unit="+unitName, "--remain-after-exit", builder.GetKubeletServerBin())
killCommand = exec.Command("sudo", "systemctl", "kill", unitName)
restartCommand = exec.Command("sudo", "systemctl", "restart", unitName)
killCommand = exec.Command("systemctl", "kill", unitName)
restartCommand = exec.Command("systemctl", "restart", unitName)
e.logFiles["kubelet.log"] = logFileData{
journalctlCommand: []string{"-u", unitName},
}
@ -246,7 +241,7 @@ func (e *E2EServices) startKubelet() (*server, error) {
"--network-plugin-dir", filepath.Join(cwd, "cni", "bin")) // Enable kubenet
}
cmd := exec.Command("sudo", cmdArgs...)
cmd := exec.Command(cmdArgs[0], cmdArgs[1:]...)
server := newServer(
"kubelet",
cmd,
@ -281,7 +276,7 @@ func (e *E2EServices) getLogFiles() {
continue
}
glog.Infof("Get log file %q with journalctl command %v.", targetFileName, logFileData.journalctlCommand)
out, err := exec.Command("sudo", append([]string{"journalctl"}, logFileData.journalctlCommand...)...).CombinedOutput()
out, err := exec.Command("journalctl", logFileData.journalctlCommand...).CombinedOutput()
if err != nil {
glog.Errorf("failed to get %q from journald: %v, %v", targetFileName, string(out), err)
} else {
@ -314,10 +309,10 @@ func isJournaldAvailable() bool {
func copyLogFile(src, target string) error {
// If not a journald based distro, then just symlink files.
if out, err := exec.Command("sudo", "cp", src, target).CombinedOutput(); err != nil {
if out, err := exec.Command("cp", src, target).CombinedOutput(); err != nil {
return fmt.Errorf("failed to copy %q to %q: %v, %v", src, target, out, err)
}
if out, err := exec.Command("sudo", "chmod", "a+r", target).CombinedOutput(); err != nil {
if out, err := exec.Command("chmod", "a+r", target).CombinedOutput(); err != nil {
return fmt.Errorf("failed to make log file %q world readable: %v, %v", target, out, err)
}
return nil

View File

@ -208,14 +208,13 @@ func (k *KernelValidator) getKernelConfigReader() (io.Reader, error) {
}
// If the kernel config file is not found, try to load the kernel
// config module and check again.
// TODO(random-liu): Remove "sudo" in containerize test PR #31093
output, err := exec.Command("sudo", modprobeCmd, configsModule).CombinedOutput()
output, err := exec.Command(modprobeCmd, configsModule).CombinedOutput()
if err != nil {
return nil, fmt.Errorf("unable to load kernel module %q: output - %q, err - %v",
configsModule, output, err)
}
// Unload the kernel config module to make sure the validation has no side effect.
defer exec.Command("sudo", modprobeCmd, "-r", configsModule).Run()
defer exec.Command(modprobeCmd, "-r", configsModule).Run()
loadModule = true
}
return nil, fmt.Errorf("no config path in %v is available", possibePaths)