Mirror of https://github.com/k3s-io/kubernetes.git, synced 2026-01-25 14:35:43 +00:00
Fix endpoints status out-of-sync when the pod state changes rapidly
When the Pod state changes rapidly, the endpoints controller may sync a Service against an outdated informer cache. If the outdated Endpoints object already looks like what the controller expects, the controller skips the update. This commit fixes the issue by checking whether the Endpoints informer cache is outdated when processing a Service; if the cached Endpoints object is stale, the controller returns an error and retries later.

Signed-off-by: Quan Tian <quan.tian@broadcom.com>
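The fix itself is not part of this diff; only the regression test is shown below. As a rough illustration of the approach the commit message describes (remember what the controller last wrote, and refuse to sync from a cache that has not caught up yet), here is a minimal, self-contained Go sketch. The names staleEndpointsTracker, recordWrite, isStale, and syncService are hypothetical and are not the actual controller code.

// Illustrative sketch only, not the real endpoints controller implementation.
package main

import (
    "errors"
    "fmt"
    "sync"
)

// errEndpointsCacheStale signals that the informer cache has not yet observed the
// Endpoints object the controller last wrote, so the sync should be retried.
var errEndpointsCacheStale = errors.New("endpoints informer cache is out of date")

// staleEndpointsTracker remembers, per Service key, the resourceVersion of the
// Endpoints object the controller most recently wrote.
type staleEndpointsTracker struct {
    mu        sync.Mutex
    lastWrite map[string]string // service key -> resourceVersion written by the controller
}

func newStaleEndpointsTracker() *staleEndpointsTracker {
    return &staleEndpointsTracker{lastWrite: map[string]string{}}
}

// recordWrite is called after the controller successfully updates Endpoints.
func (t *staleEndpointsTracker) recordWrite(serviceKey, resourceVersion string) {
    t.mu.Lock()
    defer t.mu.Unlock()
    t.lastWrite[serviceKey] = resourceVersion
}

// isStale reports whether the cached Endpoints object differs from the version the
// controller last wrote. A real implementation would also clear the entry once the
// informer observes that write, or when the Endpoints object is deleted.
func (t *staleEndpointsTracker) isStale(serviceKey, cachedResourceVersion string) bool {
    t.mu.Lock()
    defer t.mu.Unlock()
    last, ok := t.lastWrite[serviceKey]
    return ok && last != cachedResourceVersion
}

// syncService shows where the check would slot in: bail out with an error instead of
// comparing desired state against an Endpoints object that is known to be outdated.
func syncService(tracker *staleEndpointsTracker, serviceKey, cachedResourceVersion string) error {
    if tracker.isStale(serviceKey, cachedResourceVersion) {
        return fmt.Errorf("syncing %q: %w", serviceKey, errEndpointsCacheStale)
    }
    // ... compute desired endpoints from pods and update them if they differ ...
    return nil
}

func main() {
    tracker := newStaleEndpointsTracker()
    tracker.recordWrite("default/foo", "42") // the controller wrote resourceVersion 42
    // The informer cache still holds resourceVersion 41, so the sync reports staleness.
    fmt.Println(syncService(tracker, "default/foo", "41"))
}

Returning an error rather than waiting in place lets the controller's workqueue retry the Service with its normal backoff, which is also what the regression test below relies on: the Endpoints objects must eventually converge to the Pod's final Ready state.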
@@ -17,8 +17,10 @@ limitations under the License.

package endpoints

import (
    "context"
    "errors"
    "fmt"
    "reflect"
    "testing"
    "time"

@@ -158,6 +160,147 @@ func TestEndpointUpdates(t *testing.T) {
}

// Regression test for https://issues.k8s.io/125638
func TestEndpointWithMultiplePodUpdates(t *testing.T) {
    // Disable ServiceAccount admission plugin as we don't have serviceaccount controller running.
    server := kubeapiservertesting.StartTestServerOrDie(t, nil, framework.DefaultTestServerFlags(), framework.SharedEtcd())
    defer server.TearDownFn()

    client, err := clientset.NewForConfig(server.ClientConfig)
    if err != nil {
        t.Fatalf("Error creating clientset: %v", err)
    }

    informers := informers.NewSharedInformerFactory(client, 0)

    tCtx := ktesting.Init(t)
    epController := endpoint.NewEndpointController(
        tCtx,
        informers.Core().V1().Pods(),
        informers.Core().V1().Services(),
        informers.Core().V1().Endpoints(),
        client,
        0)

    // Process 10 services in parallel to increase likelihood of outdated informer cache.
    concurrency := 10
    // Start informer and controllers
    informers.Start(tCtx.Done())
    go epController.Run(tCtx, concurrency)

    // Create namespace
    ns := framework.CreateNamespaceOrDie(client, "test-endpoints-updates", t)
    defer framework.DeleteNamespaceOrDie(client, ns, t)

    // Create a pod with labels
    pod := &v1.Pod{
        ObjectMeta: metav1.ObjectMeta{
            Name:      "test-pod",
            Namespace: ns.Name,
            Labels:    labelMap(),
        },
        Spec: v1.PodSpec{
            NodeName: "fakenode",
            Containers: []v1.Container{
                {
                    Name:  "fake-name",
                    Image: "fakeimage",
                },
            },
        },
    }

    pod, err = client.CoreV1().Pods(ns.Name).Create(tCtx, pod, metav1.CreateOptions{})
    if err != nil {
        t.Fatalf("Failed to create pod %s: %v", pod.Name, err)
    }

    // Set pod status
    pod.Status = v1.PodStatus{
        Phase:      v1.PodRunning,
        Conditions: []v1.PodCondition{{Type: v1.PodReady, Status: v1.ConditionTrue}},
        PodIPs:     []v1.PodIP{{IP: "1.1.1.1"}},
    }
    pod, err = client.CoreV1().Pods(ns.Name).UpdateStatus(tCtx, pod, metav1.UpdateOptions{})
    if err != nil {
        t.Fatalf("Failed to update status of pod %s: %v", pod.Name, err)
    }

    var services []*v1.Service
    // Create services associated to the pod
    for i := 0; i < concurrency; i++ {
        svc := newService(ns.Name, fmt.Sprintf("foo%d", i))
        _, err = client.CoreV1().Services(ns.Name).Create(tCtx, svc, metav1.CreateOptions{})
        if err != nil {
            t.Fatalf("Failed to create service %s: %v", svc.Name, err)
        }
        services = append(services, svc)
    }

    for _, service := range services {
        // Ensure the new endpoints are created.
        if err := wait.PollUntilContextTimeout(tCtx, 1*time.Second, 10*time.Second, true, func(context.Context) (bool, error) {
            _, err := client.CoreV1().Endpoints(ns.Name).Get(tCtx, service.Name, metav1.GetOptions{})
            if err != nil {
                return false, nil
            }
            return true, nil
        }); err != nil {
            t.Fatalf("endpoints not found: %v", err)
        }
    }

    // Update pod's status and revert it immediately. The endpoints should be in-sync with the pod's status eventually.
    pod.Status.Conditions[0].Status = v1.ConditionFalse
    pod, err = client.CoreV1().Pods(ns.Name).UpdateStatus(tCtx, pod, metav1.UpdateOptions{})
    if err != nil {
        t.Fatalf("Failed to update pod %s to not ready: %v", pod.Name, err)
    }

    pod.Status.Conditions[0].Status = v1.ConditionTrue
    pod, err = client.CoreV1().Pods(ns.Name).UpdateStatus(tCtx, pod, metav1.UpdateOptions{})
    if err != nil {
        t.Fatalf("Failed to update pod %s to ready: %v", pod.Name, err)
    }

    // Some workers might update endpoints twice (Ready->NotReady->Ready), while others may not update endpoints at all
    // if they receive the 2nd pod update quickly. Consequently, we can't rely on endpoints resource version to
    // determine if the controller has processed the pod updates. Instead, we will wait for 1 second, assuming that this
    // provides enough time for the workers to process endpoints at least once.
    time.Sleep(1 * time.Second)
    expectedEndpointAddresses := []v1.EndpointAddress{
        {
            IP:       pod.Status.PodIP,
            NodeName: &pod.Spec.NodeName,
            TargetRef: &v1.ObjectReference{
                Kind:      "Pod",
                Namespace: pod.Namespace,
                Name:      pod.Name,
                UID:       pod.UID,
            },
        },
    }
    for _, service := range services {
        var endpoints *v1.Endpoints
        if err := wait.PollUntilContextTimeout(tCtx, 1*time.Second, 10*time.Second, true, func(context.Context) (bool, error) {
            endpoints, err = client.CoreV1().Endpoints(ns.Name).Get(tCtx, service.Name, metav1.GetOptions{})
            if err != nil {
                t.Logf("Error fetching endpoints: %v", err)
                return false, nil
            }
            if len(endpoints.Subsets) == 0 {
                return false, nil
            }
            if !reflect.DeepEqual(expectedEndpointAddresses, endpoints.Subsets[0].Addresses) {
                return false, nil
            }
            return true, nil
        }); err != nil {
            t.Fatalf("Expected endpoints %v to contain ready endpoint addresses %v", endpoints, expectedEndpointAddresses)
        }
    }
}

// TestExternalNameToClusterIPTransition tests that Service of type ExternalName
// does not get endpoints, and after transition to ClusterIP, service gets endpoint,
// without headless label