Cluster-Autoscaler e2e to catch unnecessary scale-ups
This fails with high probability on 0.5.0 (the issue is not fully deterministic), catching the bug in that version.
commit 67f5ba0004 (parent 8667d7c4f1)
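The new test drives its assertions off the Cluster Autoscaler status ConfigMap ("cluster-autoscaler-status" in kube-system): it records how many nodes the autoscaler already manages, creates more pods than fit, waits for the ScaleUp status to reach InProgress, and then, once every pod is scheduled, checks that the status is back to NoActivity with ready equal to cloudProviderTarget, so an unnecessary extra scale-up shows up as a failure. For reference, a ScaleUp entry in the ConfigMap's status text has roughly this shape (illustrative only; the exact formatting depends on the Cluster Autoscaler version):

    ScaleUp: InProgress (ready=3 cloudProviderTarget=5)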
@@ -51,6 +51,7 @@ const (
     defaultTimeout = 3 * time.Minute
     resizeTimeout = 5 * time.Minute
     scaleUpTimeout = 5 * time.Minute
+    scaleUpTriggerTimeout = 2 * time.Minute
     scaleDownTimeout = 15 * time.Minute
     podTimeout = 2 * time.Minute
     nodesRecoverTimeout = 5 * time.Minute
@@ -61,6 +62,9 @@ const (
     disabledTaint = "DisabledForAutoscalingTest"
     newNodesForScaledownTests = 2
     unhealthyClusterThreshold = 4
+
+    caNoScaleUpStatus = "NoActivity"
+    caOngoingScaleUpStatus = "InProgress"
 )

 var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
@@ -117,7 +121,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {

     It("shouldn't increase cluster size if pending pod is too large [Feature:ClusterSizeAutoscalingScaleUp]", func() {
         By("Creating unschedulable pod")
-        ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memCapacityMb)), false)
+        ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memCapacityMb)), false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")

         By("Waiting for scale up hoping it won't happen")
@@ -144,7 +148,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
     })

     It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]", func() {
-        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
+        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")

         // Verify, that cluster size is increased
@@ -153,6 +157,31 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
         framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
     })

+    It("shouldn't trigger additional scale-ups during processing scale-up [Feature:ClusterSizeAutoscalingScaleUp]", func() {
+        status, err := getScaleUpStatus(c)
+        framework.ExpectNoError(err)
+        unmanagedNodes := nodeCount - status.ready
+
+        By("Schedule more pods than can fit and wait for claster to scale-up")
+        ReserveMemory(f, "memory-reservation", 100, (nodeCount+2)*memCapacityMb, false, 1*time.Second)
+        defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
+
+        status, err = waitForScaleUpStatus(c, caOngoingScaleUpStatus, scaleUpTriggerTimeout)
+        framework.ExpectNoError(err)
+        target := status.target
+        framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
+
+        By("Expect no more scale-up to be happening after all pods are scheduled")
+        status, err = getScaleUpStatus(c)
+        framework.ExpectNoError(err)
+        if status.target != target {
+            glog.Warningf("Final number of nodes (%v) does not match initial scale-up target (%v).", status.target, target)
+        }
+        Expect(status.status).Should(Equal(caNoScaleUpStatus))
+        Expect(status.ready).Should(Equal(status.target))
+        Expect(len(framework.GetReadySchedulableNodesOrDie(f.ClientSet).Items)).Should(Equal(status.target + unmanagedNodes))
+    })
+
     It("should increase cluster size if pending pods are small and there is another node pool that is not autoscaled [Feature:ClusterSizeAutoscalingScaleUp]", func() {
         framework.SkipUnlessProviderIs("gke")

@@ -163,7 +192,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
         framework.ExpectNoError(framework.WaitForClusterSize(c, nodeCount+1, resizeTimeout))
         glog.Infof("Not enabling cluster autoscaler for the node pool (on purpose).")

-        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
+        ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")

         // Verify, that cluster size is increased
@@ -295,7 +324,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
         framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2))

         By("Creating rc with 2 pods too big to fit default-pool but fitting extra-pool")
-        ReserveMemory(f, "memory-reservation", 2, int(2.1*float64(memCapacityMb)), false)
+        ReserveMemory(f, "memory-reservation", 2, int(2.1*float64(memCapacityMb)), false, defaultTimeout)
         defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")

         // Apparently GKE master is restarted couple minutes after the node pool is added
@@ -383,7 +412,7 @@ var _ = framework.KubeDescribe("Cluster size autoscaling [Slow]", func() {
                 nodesToBreak = nodesToBreak[1:]
                 framework.TestUnderTemporaryNetworkFailure(c, "default", ntb, testFunction)
             } else {
-                ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false)
+                ReserveMemory(f, "memory-reservation", 100, nodeCount*memCapacityMb, false, defaultTimeout)
                 defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
                 time.Sleep(scaleUpTimeout)
                 currentNodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
@@ -622,7 +651,7 @@ func CreateNodeSelectorPods(f *framework.Framework, id string, replicas int, nod
     }
 }

-func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool) {
+func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration) {
     By(fmt.Sprintf("Running RC which reserves %v MB of memory", megabytes))
     request := int64(1024 * 1024 * megabytes / replicas)
     config := &testutils.RCConfig{
@@ -630,7 +659,7 @@ func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, e
         InternalClient: f.InternalClientset,
         Name: id,
         Namespace: f.Namespace.Name,
-        Timeout: defaultTimeout,
+        Timeout: timeout,
         Image: framework.GetPauseImageName(f.ClientSet),
         Replicas: replicas,
         MemRequest: request,
@@ -896,3 +925,65 @@ func getClusterwideStatus(c clientset.Interface) (string, error) {
     }
     return result[1], nil
 }
+
+type scaleUpStatus struct {
+    status string
+    ready int
+    target int
+}
+
+// Try to get scaleup statuses of all node groups.
+// Status configmap is not parsing-friendly, so evil regexpery follows.
+func getScaleUpStatus(c clientset.Interface) (*scaleUpStatus, error) {
+    configMap, err := c.CoreV1().ConfigMaps("kube-system").Get("cluster-autoscaler-status", metav1.GetOptions{})
+    if err != nil {
+        return nil, err
+    }
+    status, ok := configMap.Data["status"]
+    if !ok {
+        return nil, fmt.Errorf("Status information not found in configmap")
+    }
+    matcher, err := regexp.Compile("s*ScaleUp:\\s*([A-Za-z]+)\\s*\\(ready=([0-9]+)\\s*cloudProviderTarget=([0-9]+)\\s*\\)")
+    if err != nil {
+        return nil, err
+    }
+    matches := matcher.FindAllStringSubmatch(status, -1)
+    if len(matches) < 1 {
+        return nil, fmt.Errorf("Failed to parse CA status configmap")
+    }
+    result := scaleUpStatus{
+        status: caNoScaleUpStatus,
+        ready: 0,
+        target: 0,
+    }
+    for _, match := range matches {
+        if match[1] == caOngoingScaleUpStatus {
+            result.status = caOngoingScaleUpStatus
+        }
+        newReady, err := strconv.Atoi(match[2])
+        if err != nil {
+            return nil, err
+        }
+        result.ready += newReady
+        newTarget, err := strconv.Atoi(match[3])
+        if err != nil {
+            return nil, err
+        }
+        result.target += newTarget
+    }
+    glog.Infof("Cluster-Autoscaler scale-up status: %v (%v, %v)", result.status, result.ready, result.target)
+    return &result, nil
+}
+
+func waitForScaleUpStatus(c clientset.Interface, expected string, timeout time.Duration) (*scaleUpStatus, error) {
+    for start := time.Now(); time.Since(start) < timeout; time.Sleep(5 * time.Second) {
+        status, err := getScaleUpStatus(c)
+        if err != nil {
+            return nil, err
+        }
+        if status.status == expected {
+            return status, nil
+        }
+    }
+    return nil, fmt.Errorf("ScaleUp status did not reach expected value: %v", expected)
+}