From a7469cf680453df35a674d0cb686643fee4ce3d5 Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Tue, 29 Jun 2021 21:07:10 +0200 Subject: [PATCH] sort and filter exposed Pod IPs runtimes may return an arbitrary number of Pod IPs, however, kubernetes only takes into consideration the first one of each IP family. The order of the IPs are the one defined by the Kubelet: - default prefer IPv4 - if NodeIPs are defined, matching the first nodeIP family PodIP is always the first IP of PodIPs. The downward API must expose the same IPs and in the same order than the pod.Status API object. --- pkg/kubelet/kubelet_pods.go | 58 ++++++--- pkg/kubelet/kubelet_pods_test.go | 217 ++++++++++++++++++++++++++++++- 2 files changed, 255 insertions(+), 20 deletions(-) diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go index f78fde8ee3c..f3ab4de692f 100644 --- a/pkg/kubelet/kubelet_pods.go +++ b/pkg/kubelet/kubelet_pods.go @@ -809,6 +809,13 @@ func (kl *Kubelet) podFieldSelectorRuntimeValue(fs *v1.ObjectFieldSelector, pod if err != nil { return "", err } + + // make podIPs order match node IP family preference #97979 + podIPs = kl.sortPodIPs(podIPs) + if len(podIPs) > 0 { + podIP = podIPs[0] + } + switch internalFieldPath { case "spec.nodeName": return pod.Spec.NodeName, nil @@ -1570,16 +1577,14 @@ func (kl *Kubelet) generateAPIPodStatus(pod *v1.Pod, podStatus *kubecontainer.Po return *s } -// convertStatusToAPIStatus creates an api PodStatus for the given pod from -// the given internal pod status. It is purely transformative and does not -// alter the kubelet state at all. -func (kl *Kubelet) convertStatusToAPIStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus) *v1.PodStatus { - var apiPodStatus v1.PodStatus - - // The runtime pod status may have an arbitrary number of IPs, in an arbitrary - // order. Pick out the first returned IP of the same IP family as the node IP - // first, followed by the first IP of the opposite IP family (if any). - podIPs := make([]v1.PodIP, 0, len(podStatus.IPs)) +// sortPodIPs return the PodIPs sorted and truncated by the cluster IP family preference. +// The runtime pod status may have an arbitrary number of IPs, in an arbitrary order. +// PodIPs are obtained by: func (m *kubeGenericRuntimeManager) determinePodSandboxIPs() +// Pick out the first returned IP of the same IP family as the node IP +// first, followed by the first IP of the opposite IP family (if any) +// and use them for the Pod.Status.PodIPs and the Downward API environment variables +func (kl *Kubelet) sortPodIPs(podIPs []string) []string { + ips := make([]string, 0, 2) var validPrimaryIP, validSecondaryIP func(ip string) bool if len(kl.nodeIPs) == 0 || utilnet.IsIPv4(kl.nodeIPs[0]) { validPrimaryIP = utilnet.IsIPv4String @@ -1588,21 +1593,40 @@ func (kl *Kubelet) convertStatusToAPIStatus(pod *v1.Pod, podStatus *kubecontaine validPrimaryIP = utilnet.IsIPv6String validSecondaryIP = utilnet.IsIPv4String } - for _, ip := range podStatus.IPs { + for _, ip := range podIPs { if validPrimaryIP(ip) { - podIPs = append(podIPs, v1.PodIP{IP: ip}) + ips = append(ips, ip) break } } - for _, ip := range podStatus.IPs { + for _, ip := range podIPs { if validSecondaryIP(ip) { - podIPs = append(podIPs, v1.PodIP{IP: ip}) + ips = append(ips, ip) break } } - apiPodStatus.PodIPs = podIPs - if len(podIPs) > 0 { - apiPodStatus.PodIP = podIPs[0].IP + return ips +} + +// convertStatusToAPIStatus creates an api PodStatus for the given pod from +// the given internal pod status. It is purely transformative and does not +// alter the kubelet state at all. +func (kl *Kubelet) convertStatusToAPIStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus) *v1.PodStatus { + var apiPodStatus v1.PodStatus + + // copy pod status IPs to avoid race conditions with PodStatus #102806 + podIPs := make([]string, len(podStatus.IPs)) + for j, ip := range podStatus.IPs { + podIPs[j] = ip + } + + // make podIPs order match node IP family preference #97979 + podIPs = kl.sortPodIPs(podIPs) + for _, ip := range podIPs { + apiPodStatus.PodIPs = append(apiPodStatus.PodIPs, v1.PodIP{IP: ip}) + } + if len(apiPodStatus.PodIPs) > 0 { + apiPodStatus.PodIP = apiPodStatus.PodIPs[0].IP } // set status for Pods created on versions of kube older than 1.6 diff --git a/pkg/kubelet/kubelet_pods_test.go b/pkg/kubelet/kubelet_pods_test.go index 102761b5768..0bb28d26a02 100644 --- a/pkg/kubelet/kubelet_pods_test.go +++ b/pkg/kubelet/kubelet_pods_test.go @@ -461,6 +461,7 @@ func TestMakeEnvironmentVariables(t *testing.T) { unsyncedServices bool // whether the services should NOT be synced configMap *v1.ConfigMap // an optional ConfigMap to pull from secret *v1.Secret // an optional Secret to pull from + podIPs []string // the pod IPs expectedEnvs []kubecontainer.EnvVar // a set of expected environment vars expectedError bool // does the test fail expectedEvent string // does the test emit an event @@ -766,6 +767,7 @@ func TestMakeEnvironmentVariables(t *testing.T) { }, }, }, + podIPs: []string{"1.2.3.4", "fd00::6"}, masterServiceNs: "nothing", nilLister: true, expectedEnvs: []kubecontainer.EnvVar{ @@ -778,6 +780,94 @@ func TestMakeEnvironmentVariables(t *testing.T) { {Name: "HOST_IP", Value: testKubeletHostIP}, }, }, + { + name: "downward api pod ips reverse order", + ns: "downward-api", + enableServiceLinks: &falseValue, + container: &v1.Container{ + Env: []v1.EnvVar{ + { + Name: "POD_IP", + ValueFrom: &v1.EnvVarSource{ + FieldRef: &v1.ObjectFieldSelector{ + APIVersion: "v1", + FieldPath: "status.podIP", + }, + }, + }, + { + Name: "POD_IPS", + ValueFrom: &v1.EnvVarSource{ + FieldRef: &v1.ObjectFieldSelector{ + APIVersion: "v1", + FieldPath: "status.podIPs", + }, + }, + }, + { + Name: "HOST_IP", + ValueFrom: &v1.EnvVarSource{ + FieldRef: &v1.ObjectFieldSelector{ + APIVersion: "v1", + FieldPath: "status.hostIP", + }, + }, + }, + }, + }, + podIPs: []string{"fd00::6", "1.2.3.4"}, + masterServiceNs: "nothing", + nilLister: true, + expectedEnvs: []kubecontainer.EnvVar{ + {Name: "POD_IP", Value: "1.2.3.4"}, + {Name: "POD_IPS", Value: "1.2.3.4,fd00::6"}, + {Name: "HOST_IP", Value: testKubeletHostIP}, + }, + }, + { + name: "downward api pod ips multiple ips", + ns: "downward-api", + enableServiceLinks: &falseValue, + container: &v1.Container{ + Env: []v1.EnvVar{ + { + Name: "POD_IP", + ValueFrom: &v1.EnvVarSource{ + FieldRef: &v1.ObjectFieldSelector{ + APIVersion: "v1", + FieldPath: "status.podIP", + }, + }, + }, + { + Name: "POD_IPS", + ValueFrom: &v1.EnvVarSource{ + FieldRef: &v1.ObjectFieldSelector{ + APIVersion: "v1", + FieldPath: "status.podIPs", + }, + }, + }, + { + Name: "HOST_IP", + ValueFrom: &v1.EnvVarSource{ + FieldRef: &v1.ObjectFieldSelector{ + APIVersion: "v1", + FieldPath: "status.hostIP", + }, + }, + }, + }, + }, + podIPs: []string{"1.2.3.4", "192.168.1.1.", "fd00::6"}, + masterServiceNs: "nothing", + nilLister: true, + expectedEnvs: []kubecontainer.EnvVar{ + {Name: "POD_IP", Value: "1.2.3.4"}, + {Name: "POD_IPS", Value: "1.2.3.4,fd00::6"}, + {Name: "HOST_IP", Value: testKubeletHostIP}, + }, + }, { name: "env expansion", ns: "test1", @@ -1685,13 +1775,15 @@ func TestMakeEnvironmentVariables(t *testing.T) { EnableServiceLinks: tc.enableServiceLinks, }, } - podIP := "1.2.3.4" - podIPs := []string{"1.2.3.4,fd00::6"} + podIP := "" + if len(tc.podIPs) > 0 { + podIP = tc.podIPs[0] + } if tc.staticPod { testPod.Annotations[kubetypes.ConfigSourceAnnotationKey] = "file" } - result, err := kl.makeEnvironmentVariables(testPod, tc.container, podIP, podIPs) + result, err := kl.makeEnvironmentVariables(testPod, tc.container, podIP, tc.podIPs) select { case e := <-fakeRecorder.Events: assert.Equal(t, tc.expectedEvent, e) @@ -2837,6 +2929,33 @@ func TestGenerateAPIPodStatusHostNetworkPodIPs(t *testing.T) { {IP: "192.168.0.1"}, }, }, + { + name: "CRI dual-stack PodIPs override NodeAddresses", + nodeAddresses: []v1.NodeAddress{ + {Type: v1.NodeInternalIP, Address: "10.0.0.1"}, + {Type: v1.NodeInternalIP, Address: "fd01::1234"}, + }, + dualStack: true, + criPodIPs: []string{"192.168.0.1", "2001:db8::2"}, + podIPs: []v1.PodIP{ + {IP: "192.168.0.1"}, + {IP: "2001:db8::2"}, + }, + }, + { + // by default the cluster prefers IPv4 + name: "CRI dual-stack PodIPs override NodeAddresses prefer IPv4", + nodeAddresses: []v1.NodeAddress{ + {Type: v1.NodeInternalIP, Address: "10.0.0.1"}, + {Type: v1.NodeInternalIP, Address: "fd01::1234"}, + }, + dualStack: true, + criPodIPs: []string{"2001:db8::2", "192.168.0.1"}, + podIPs: []v1.PodIP{ + {IP: "192.168.0.1"}, + {IP: "2001:db8::2"}, + }, + }, } for _, tc := range testcases { @@ -3009,3 +3128,95 @@ func TestGenerateAPIPodStatusPodIPs(t *testing.T) { }) } } + +func TestSortPodIPs(t *testing.T) { + testcases := []struct { + name string + nodeIP string + podIPs []string + expectedIPs []string + }{ + { + name: "Simple", + nodeIP: "", + podIPs: []string{"10.0.0.1"}, + expectedIPs: []string{"10.0.0.1"}, + }, + { + name: "Dual-stack", + nodeIP: "", + podIPs: []string{"10.0.0.1", "fd01::1234"}, + expectedIPs: []string{"10.0.0.1", "fd01::1234"}, + }, + { + name: "Dual-stack with explicit node IP", + nodeIP: "192.168.1.1", + podIPs: []string{"10.0.0.1", "fd01::1234"}, + expectedIPs: []string{"10.0.0.1", "fd01::1234"}, + }, + { + name: "Dual-stack with CRI returning wrong family first", + nodeIP: "", + podIPs: []string{"fd01::1234", "10.0.0.1"}, + expectedIPs: []string{"10.0.0.1", "fd01::1234"}, + }, + { + name: "Dual-stack with explicit node IP with CRI returning wrong family first", + nodeIP: "192.168.1.1", + podIPs: []string{"fd01::1234", "10.0.0.1"}, + expectedIPs: []string{"10.0.0.1", "fd01::1234"}, + }, + { + name: "Dual-stack with IPv6 node IP", + nodeIP: "fd00::5678", + podIPs: []string{"10.0.0.1", "fd01::1234"}, + expectedIPs: []string{"fd01::1234", "10.0.0.1"}, + }, + { + name: "Dual-stack with IPv6 node IP, other CRI order", + nodeIP: "fd00::5678", + podIPs: []string{"fd01::1234", "10.0.0.1"}, + expectedIPs: []string{"fd01::1234", "10.0.0.1"}, + }, + { + name: "No Pod IP matching Node IP", + nodeIP: "fd00::5678", + podIPs: []string{"10.0.0.1"}, + expectedIPs: []string{"10.0.0.1"}, + }, + { + name: "No Pod IP matching (unspecified) Node IP", + nodeIP: "", + podIPs: []string{"fd01::1234"}, + expectedIPs: []string{"fd01::1234"}, + }, + { + name: "Multiple IPv4 IPs", + nodeIP: "", + podIPs: []string{"10.0.0.1", "10.0.0.2", "10.0.0.3"}, + expectedIPs: []string{"10.0.0.1"}, + }, + { + name: "Multiple Dual-Stack IPs", + nodeIP: "", + podIPs: []string{"10.0.0.1", "10.0.0.2", "fd01::1234", "10.0.0.3", "fd01::5678"}, + expectedIPs: []string{"10.0.0.1", "fd01::1234"}, + }, + } + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */) + defer testKubelet.Cleanup() + kl := testKubelet.kubelet + if tc.nodeIP != "" { + kl.nodeIPs = []net.IP{net.ParseIP(tc.nodeIP)} + } + + podIPs := kl.sortPodIPs(tc.podIPs) + if !reflect.DeepEqual(podIPs, tc.expectedIPs) { + t.Fatalf("Expected PodIPs %#v, got %#v", tc.expectedIPs, podIPs) + } + }) + } +}