Mirror of https://github.com/rancher/rke.git
Change RKE upgrade logic for zero downtime
k8s/node.go (35 lines changed)
@@ -18,6 +18,8 @@ const (
 	InternalAddressAnnotation = "rke.cattle.io/internal-ip"
 	ExternalAddressAnnotation = "rke.cattle.io/external-ip"
 	AWSCloudProvider          = "aws"
+	MaxRetries                = 5
+	RetryInterval             = 5
 )
 
 func DeleteNode(k8sClient *kubernetes.Clientset, nodeName, cloudProvider string) error {
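The two new constants drive a bounded retry loop: an operation is attempted up to MaxRetries times with a RetryInterval-second pause between attempts. Below is a minimal, self-contained sketch of that pattern; it is not taken from the repository, and the retry helper and the failing operation are illustrative only.

package main

import (
	"errors"
	"fmt"
	"time"
)

const (
	MaxRetries    = 5
	RetryInterval = 5
)

// retry runs op up to MaxRetries times, sleeping RetryInterval seconds between
// failed attempts, and returns the last error if every attempt fails.
func retry(op func() error) error {
	var lastErr error
	for attempt := 0; attempt < MaxRetries; attempt++ {
		if lastErr = op(); lastErr == nil {
			return nil
		}
		time.Sleep(time.Second * RetryInterval)
	}
	return lastErr
}

func main() {
	calls := 0
	err := retry(func() error {
		calls++
		if calls < 3 {
			return errors.New("transient failure")
		}
		return nil
	})
	fmt.Printf("calls=%d err=%v\n", calls, err) // calls=3 err=<nil>
}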
@@ -37,26 +39,35 @@ func GetNodeList(k8sClient *kubernetes.Clientset) (*v1.NodeList, error) {
 }
 
 func GetNode(k8sClient *kubernetes.Clientset, nodeName string) (*v1.Node, error) {
-	nodes, err := GetNodeList(k8sClient)
-	if err != nil {
-		return nil, err
-	}
-	for _, node := range nodes.Items {
-		if strings.ToLower(node.Labels[HostnameLabel]) == strings.ToLower(nodeName) {
-			return &node, nil
+	var listErr error
+	for retries := 0; retries < MaxRetries; retries++ {
+		nodes, err := GetNodeList(k8sClient)
+		if err != nil {
+			listErr = err
+			time.Sleep(time.Second * RetryInterval)
+			continue
+		}
+		for _, node := range nodes.Items {
+			if strings.ToLower(node.Labels[HostnameLabel]) == strings.ToLower(nodeName) {
+				return &node, nil
+			}
 		}
+		time.Sleep(time.Second * RetryInterval)
+	}
+	if listErr != nil {
+		return nil, listErr
 	}
 	return nil, apierrors.NewNotFound(schema.GroupResource{}, nodeName)
 }
 
 func CordonUncordon(k8sClient *kubernetes.Clientset, nodeName string, cordoned bool) error {
 	updated := false
-	for retries := 0; retries <= 5; retries++ {
+	for retries := 0; retries < MaxRetries; retries++ {
 		node, err := GetNode(k8sClient, nodeName)
 		if err != nil {
 			logrus.Debugf("Error getting node %s: %v", nodeName, err)
-			time.Sleep(time.Second * 5)
-			continue
+			// no need to retry here since GetNode already retries
+			return err
 		}
 		if node.Spec.Unschedulable == cordoned {
 			logrus.Debugf("Node %s is already cordoned: %v", nodeName, cordoned)
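For context, here is a hedged usage sketch of the reworked helpers as a caller might use them during an upgrade step. It assumes the helpers are imported from github.com/rancher/rke/k8s; the kubeconfig path and the node name "worker-1" are placeholders, and the flow is illustrative rather than code from this commit.

package main

import (
	"fmt"

	"github.com/rancher/rke/k8s"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Build a clientset from a kubeconfig; path and node name are placeholders.
	config, err := clientcmd.BuildConfigFromFlags("", "/path/to/kubeconfig")
	if err != nil {
		panic(err)
	}
	client, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err)
	}

	// GetNode now retries the node lookup internally, so a transient API error
	// no longer surfaces immediately to the caller.
	node, err := k8s.GetNode(client, "worker-1")
	if err != nil {
		panic(err)
	}
	fmt.Printf("node %s ready: %v\n", node.Name, k8s.IsNodeReady(*node))

	// Cordon the node before upgrading it; CordonUncordon now returns the
	// GetNode error directly instead of retrying the lookup a second time.
	if err := k8s.CordonUncordon(client, "worker-1", true); err != nil {
		panic(err)
	}
}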
@@ -66,7 +77,7 @@ func CordonUncordon(k8sClient *kubernetes.Clientset, nodeName string, cordoned b
 	_, err = k8sClient.CoreV1().Nodes().Update(node)
 	if err != nil {
 		logrus.Debugf("Error setting cordoned state for node %s: %v", nodeName, err)
-		time.Sleep(time.Second * 5)
+		time.Sleep(time.Second * RetryInterval)
 		continue
 	}
 	updated = true
@@ -80,7 +91,7 @@ func CordonUncordon(k8sClient *kubernetes.Clientset, nodeName string, cordoned b
 func IsNodeReady(node v1.Node) bool {
 	nodeConditions := node.Status.Conditions
 	for _, condition := range nodeConditions {
-		if condition.Type == "Ready" && condition.Status == v1.ConditionTrue {
+		if condition.Type == v1.NodeReady && condition.Status == v1.ConditionTrue {
 			return true
 		}
 	}
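The last hunk only swaps the untyped string for the typed constant: v1.NodeReady is the NodeConditionType defined as "Ready" in k8s.io/api/core/v1, so the comparison keeps the same value while gaining type safety. A small sketch showing the check in isolation; the node object is constructed inline purely for illustration.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// isNodeReady mirrors the updated helper: a node is ready when its Ready
// condition is reported as True.
func isNodeReady(node v1.Node) bool {
	for _, condition := range node.Status.Conditions {
		if condition.Type == v1.NodeReady && condition.Status == v1.ConditionTrue {
			return true
		}
	}
	return false
}

func main() {
	node := v1.Node{
		Status: v1.NodeStatus{
			Conditions: []v1.NodeCondition{
				{Type: v1.NodeReady, Status: v1.ConditionTrue},
			},
		},
	}
	fmt.Println(isNodeReady(node)) // true
}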