Adding kubelet metrics for started and failed to start HostProcess containers

Signed-off-by: Mark Rossetti <marosset@microsoft.com>
2026-01-05 15:37:24 +00:00 · 2021-09-30 10:33:15 -07:00
parent 6d30c96d4a
commit ef324d6bbd
3 changed files with 181 additions and 13 deletions
--- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go
+++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go
@@ -55,6 +55,7 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/types"
 	"k8s.io/kubernetes/pkg/kubelet/util/cache"
 	"k8s.io/kubernetes/pkg/kubelet/util/format"
+	sc "k8s.io/kubernetes/pkg/securitycontext"
 )

 const (
@@ -883,12 +884,18 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, podStatus *kubecontaine
 		}

 		metrics.StartedContainersTotal.WithLabelValues(metricLabel).Inc()
+		if sc.HasWindowsHostProcessRequest(pod, spec.container) {
+			metrics.StartedHostProcessContainersTotal.WithLabelValues(metricLabel).Inc()
+		}
 		klog.V(4).InfoS("Creating container in pod", "containerType", typeName, "container", spec.container, "pod", klog.KObj(pod))
 		// NOTE (aramase) podIPs are populated for single stack and dual stack clusters. Send only podIPs.
 		if msg, err := m.startContainer(podSandboxID, podSandboxConfig, spec, pod, podStatus, pullSecrets, podIP, podIPs); err != nil {
 			// startContainer() returns well-defined error codes that have reasonable cardinality for metrics and are
 			// useful to cluster administrators to distinguish "server errors" from "user errors".
 			metrics.StartedContainersErrorsTotal.WithLabelValues(metricLabel, err.Error()).Inc()
+			if sc.HasWindowsHostProcessRequest(pod, spec.container) {
+				metrics.StartedHostProcessContainersErrorsTotal.WithLabelValues(metricLabel, err.Error()).Inc()
+			}
 			startContainerResult.Fail(err, msg)
 			// known errors that are logged in other places are logged at higher levels here to avoid
 			// repetitive log spam
--- a/pkg/kubelet/metrics/metrics.go
+++ b/pkg/kubelet/metrics/metrics.go
@@ -90,6 +90,10 @@ const (
 	StartedContainersTotalKey       = "started_containers_total"
 	StartedContainersErrorsTotalKey = "started_containers_errors_total"

+	// Metrics to track HostProcess container usage by this kubelet
+	StartedHostProcessContainersTotalKey       = "started_host_process_containers_total"
+	StartedHostProcessContainersErrorsTotalKey = "started_host_process_containers_errors_total"
+
 	// Metrics to track ephemeral container usage by this kubelet
 	ManagedEphemeralContainersKey = "managed_ephemeral_containers"

@@ -488,6 +492,26 @@ var (
 		},
 		[]string{"container_type", "code"},
 	)
+	// StartedHostProcessContainersTotal is a counter that tracks the number of hostprocess container creation operations
+	StartedHostProcessContainersTotal = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           StartedHostProcessContainersTotalKey,
+			Help:           "Cumulative number of hostprocess containers started. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{"container_type"},
+	)
+	// StartedHostProcessContainersErrorsTotal is a counter that tracks the number of errors creating hostprocess containers
+	StartedHostProcessContainersErrorsTotal = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           StartedHostProcessContainersErrorsTotalKey,
+			Help:           "Cumulative number of errors when starting hostprocess containers. This metric will only be collected on Windows and requires WindowsHostProcessContainers feature gate to be enabled.",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{"container_type", "code"},
+	)
 	// ManagedEphemeralContainers is a gauge that indicates how many ephemeral containers are managed by this kubelet.
 	ManagedEphemeralContainers = metrics.NewGauge(
 		&metrics.GaugeOpts{
@@ -530,6 +554,10 @@ func Register(collectors ...metrics.StableCollector) {
 		legacyregistry.MustRegister(StartedPodsErrorsTotal)
 		legacyregistry.MustRegister(StartedContainersTotal)
 		legacyregistry.MustRegister(StartedContainersErrorsTotal)
+		if utilfeature.DefaultFeatureGate.Enabled(features.WindowsHostProcessContainers) {
+			legacyregistry.MustRegister(StartedHostProcessContainersTotal)
+			legacyregistry.MustRegister(StartedHostProcessContainersErrorsTotal)
+		}
 		legacyregistry.MustRegister(RunPodSandboxDuration)
 		legacyregistry.MustRegister(RunPodSandboxErrors)
 		if utilfeature.DefaultFeatureGate.Enabled(features.DynamicKubeletConfig) {
--- a/test/e2e/windows/host_process.go
+++ b/test/e2e/windows/host_process.go
@@ -29,6 +29,7 @@ import (
 	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/kubernetes/pkg/features"
 	"k8s.io/kubernetes/test/e2e/framework"
+	e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
 	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
 	imageutils "k8s.io/kubernetes/test/utils/image"
@@ -69,6 +70,13 @@ const (
 	Write-Output "SUCCESS"`
 )

+var (
+	trueVar = true // shared addressable bool; pod specs below pass &trueVar for the HostProcess field
+
+	User_NTAuthorityLocalService = "NT AUTHORITY\\Local Service" // passed by address as RunAsUserName in the pod specs below
+	User_NTAuthoritySystem       = "NT AUTHORITY\\SYSTEM"        // passed by address as RunAsUserName in the pod specs below
+)
+
 var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDocker] [MinimumKubeletVersion:1.22] HostProcess containers", func() {
 	ginkgo.BeforeEach(func() {
 		e2eskipper.SkipUnlessNodeOSDistroIs("windows")
@@ -86,10 +94,7 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc

 		ginkgo.By("scheduling a pod with a container that verifies %COMPUTERNAME% matches selected node name")
 		image := imageutils.GetConfig(imageutils.BusyBox)
-
-		trueVar := true
 		podName := "host-process-test-pod"
-		user := "NT AUTHORITY\\Local service"
 		pod := &v1.Pod{
 			ObjectMeta: metav1.ObjectMeta{
 				Name: podName,
@@ -98,7 +103,7 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
 				SecurityContext: &v1.PodSecurityContext{
 					WindowsOptions: &v1.WindowsSecurityContextOptions{
 						HostProcess:   &trueVar,
-						RunAsUserName: &user,
+						RunAsUserName: &User_NTAuthoritySystem,
 					},
 				},
 				HostNetwork: true,
@@ -131,9 +136,7 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc

 	ginkgo.It("should support init containers", func() {
 		ginkgo.By("scheduling a pod with a container that verifies init container can configure the node")
-		trueVar := true
 		podName := "host-process-init-pods"
-		user := "NT AUTHORITY\\SYSTEM"
 		filename := fmt.Sprintf("/testfile%s.txt", string(uuid.NewUUID()))
 		pod := &v1.Pod{
 			ObjectMeta: metav1.ObjectMeta{
@@ -143,7 +146,7 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
 				SecurityContext: &v1.PodSecurityContext{
 					WindowsOptions: &v1.WindowsSecurityContextOptions{
 						HostProcess:   &trueVar,
-						RunAsUserName: &user,
+						RunAsUserName: &User_NTAuthoritySystem,
 					},
 				},
 				HostNetwork: true,
@@ -364,8 +367,6 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
 				containers = append(containers, container)
 			}

-			trueVar := true
-			user := "NT AUTHORITY\\Local Service"
 			pod := &v1.Pod{
 				ObjectMeta: metav1.ObjectMeta{
 					Name: podName,
@@ -374,7 +375,7 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
 					SecurityContext: &v1.PodSecurityContext{
 						WindowsOptions: &v1.WindowsSecurityContextOptions{
 							HostProcess:   &trueVar,
-							RunAsUserName: &user,
+							RunAsUserName: &User_NTAuthorityLocalService,
 						},
 					},
 					HostNetwork:   true,
@@ -479,11 +480,103 @@ var _ = SIGDescribe("[Feature:WindowsHostProcessContainers] [Excluded:WindowsDoc
 		framework.ExpectEqual(p.Status.Phase, v1.PodSucceeded)
 	})

+	ginkgo.It("metrics should report count of started and failed to start HostProcess containers", func() {
+		ginkgo.By("Selecting a Windows node")
+		targetNode, err := findWindowsNode(f)
+		framework.ExpectNoError(err, "Error finding Windows node")
+		framework.Logf("Using node: %v", targetNode.Name)
+
+		ginkgo.By("Getting initial kubelet metrics values")
+		beforeMetrics, err := getCurrentHostProcessMetrics(f, targetNode.Name)
+		framework.ExpectNoError(err, "Error getting initial kubelet metrics for node")
+		framework.Logf("Initial HostProcess container metrics -- StartedContainers: %v, StartedContainersErrors: %v, StartedInitContainers: %v, StartedInitContainersErrors: %v",
+			beforeMetrics.StartedContainersCount, beforeMetrics.StartedContainersErrorCount, beforeMetrics.StartedInitContainersCount, beforeMetrics.StartedInitContainersErrorCount)
+
+		ginkgo.By("Scheduling a pod with a HostProcess init container that will fail")
+
+		podName := "host-process-metrics-pod-failing-init-container"
+		pod := &v1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: podName,
+			},
+			Spec: v1.PodSpec{
+				SecurityContext: &v1.PodSecurityContext{
+					WindowsOptions: &v1.WindowsSecurityContextOptions{
+						HostProcess:   &trueVar,
+						RunAsUserName: &User_NTAuthoritySystem,
+					},
+				},
+				HostNetwork: true,
+				InitContainers: []v1.Container{
+					{
+						Image:   imageutils.GetE2EImage(imageutils.BusyBox),
+						Name:    "failing-init-container",
+						Command: []string{"foobar.exe"},
+					},
+				},
+				Containers: []v1.Container{
+					{
+						Image:   imageutils.GetE2EImage(imageutils.BusyBox),
+						Name:    "container",
+						Command: []string{"cmd.exe", "/c", "exit", "/b", "0"},
+					},
+				},
+				RestartPolicy: v1.RestartPolicyNever,
+				NodeName:      targetNode.Name,
+			},
+		}
+
+		f.PodClient().Create(pod)
+		f.PodClient().WaitForFinish(podName, 3*time.Minute)
+
+		ginkgo.By("Scheduling a pod with a HostProcess container that will fail")
+		podName = "host-process-metrics-pod-failing-container"
+		pod = &v1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: podName,
+			},
+			Spec: v1.PodSpec{
+				SecurityContext: &v1.PodSecurityContext{
+					WindowsOptions: &v1.WindowsSecurityContextOptions{
+						HostProcess:   &trueVar,
+						RunAsUserName: &User_NTAuthoritySystem,
+					},
+				},
+				HostNetwork: true,
+				Containers: []v1.Container{
+					{
+						Image:   imageutils.GetE2EImage(imageutils.BusyBox),
+						Name:    "failing-container",
+						Command: []string{"foobar.exe"},
+					},
+				},
+				RestartPolicy: v1.RestartPolicyNever,
+				NodeName:      targetNode.Name,
+			},
+		}
+
+		f.PodClient().Create(pod)
+		f.PodClient().WaitForFinish(podName, 3*time.Minute)
+
+		ginkgo.By("Getting subsequent kubelet metrics values")
+
+		afterMetrics, err := getCurrentHostProcessMetrics(f, targetNode.Name)
+		framework.ExpectNoError(err, "Error getting subsequent kubelet metrics for node")
+		framework.Logf("Subsequent HostProcess container metrics -- StartedContainers: %v, StartedContainersErrors: %v, StartedInitContainers: %v, StartedInitContainersErrors: %v",
+			afterMetrics.StartedContainersCount, afterMetrics.StartedContainersErrorCount, afterMetrics.StartedInitContainersCount, afterMetrics.StartedInitContainersErrorCount)
+
+		// Note: This test performs relative comparisons to ensure metrics values were logged and does not validate specific values.
+		// This is done so the test can be run in parallel with other tests which may start HostProcess containers on the same node.
+		ginkgo.By("Ensuring metrics were updated")
+		framework.ExpectEqual(beforeMetrics.StartedContainersCount < afterMetrics.StartedContainersCount, true, "Count of started HostProcess containers should increase")
+		framework.ExpectEqual(beforeMetrics.StartedContainersErrorCount < afterMetrics.StartedContainersErrorCount, true, "Count of started HostProcess errors containers should increase")
+		framework.ExpectEqual(beforeMetrics.StartedInitContainersCount < afterMetrics.StartedInitContainersCount, true, "Count of started HostProcess init containers should increase")
+		framework.ExpectEqual(beforeMetrics.StartedInitContainersErrorCount < afterMetrics.StartedInitContainersErrorCount, true, "Count of started HostProcess errors init containers should increase")
+	})
+
 })

 func makeTestPodWithVolumeMounts(name string) *v1.Pod {
-	trueVar := true
-	username := "NT AUTHORITY\\SYSTEM"
 	hostPathDirectoryOrCreate := v1.HostPathDirectoryOrCreate
 	return &v1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
@@ -493,7 +586,7 @@ func makeTestPodWithVolumeMounts(name string) *v1.Pod {
 			SecurityContext: &v1.PodSecurityContext{
 				WindowsOptions: &v1.WindowsSecurityContextOptions{
 					HostProcess:   &trueVar,
-					RunAsUserName: &username,
+					RunAsUserName: &User_NTAuthoritySystem,
 				},
 			},
 			HostNetwork: true,
@@ -612,6 +705,46 @@ func makeTestPodWithVolumeMounts(name string) *v1.Pod {
 	}
 }

+type HostProcessContainersMetrics struct { // HostProcess container counters read from one node's kubelet metrics
+	StartedContainersCount          int64 // started_host_process_containers_total sample with container_type "container"
+	StartedContainersErrorCount     int64 // sum of started_host_process_containers_errors_total samples with container_type "container"
+	StartedInitContainersCount      int64 // started_host_process_containers_total sample with container_type "init_container"
+	StartedInitContainersErrorCount int64 // sum of started_host_process_containers_errors_total samples with container_type "init_container"
+}
+
+// getCurrentHostProcessMetrics fetches the kubelet metrics of nodeName and returns them as a HostProcessContainersMetrics
+// value. Metrics with no reported samples are left at 0; if the fetch fails, the zero value and the error are returned.
+func getCurrentHostProcessMetrics(f *framework.Framework, nodeName string) (HostProcessContainersMetrics, error) {
+	var result HostProcessContainersMetrics
+
+	metrics, err := e2emetrics.GetKubeletMetrics(f.ClientSet, nodeName)
+	if err != nil {
+		return result, err
+	}
+
+	for _, sample := range metrics["started_host_process_containers_total"] {
+		switch sample.Metric["container_type"] {
+		case "container":
+			result.StartedContainersCount = int64(sample.Value)
+		case "init_container":
+			result.StartedInitContainersCount = int64(sample.Value)
+		}
+	}
+
+	// note: the errors metric is additionally labeled by error code, so sum the failures of all codes
+	// (ErrImagePull, RunContainerError, etc) for each container type here.
+	for _, sample := range metrics["started_host_process_containers_errors_total"] {
+		switch sample.Metric["container_type"] {
+		case "container":
+			result.StartedContainersErrorCount += int64(sample.Value)
+		case "init_container":
+			result.StartedInitContainersErrorCount += int64(sample.Value)
+		}
+	}
+
+	return result, nil
+}
+
 func SkipUnlessWindowsHostProcessContainersEnabled() {
 	if !framework.TestContext.FeatureGates[string(features.WindowsHostProcessContainers)] {
 		e2eskipper.Skipf("Skipping test because feature 'WindowsHostProcessContainers' is not enabled")