mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-21 19:01:49 +00:00
Merge pull request #19446 from justinsb/aws_reboot_master_on_failure
AWS: Create a cloudwatch alarm to reboot the master on failure
This commit is contained in:
commit
f788e1e11a
@ -435,6 +435,44 @@ function ensure-master-pd {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Configures a CloudWatch alarm to reboot the instance on failure
#
# Looks up the AWS account id that owns the instance, then creates an alarm
# named k8s-<instance_id>-statuscheckfailure-reboot on the AWS/EC2
# StatusCheckFailed metric. The alarm fires when the per-minute minimum of
# the metric is above 0 for 3 consecutive periods, and its action is the
# EC2 reboot action ARN built from AWS_REGION and the owner account id.
#
# Arguments:
#   $1 - id of the EC2 instance to watch
# Vars read:
#   AWS_REGION - region used to build the alarm action ARN
#   LOG        - file that receives the output of put-metric-alarm
# Exits:
#   Terminates the script with status 1 when the owning account id cannot
#   be determined.
function reboot-on-failure {
  local instance_id=$1

  echo "Creating Cloudwatch alarm to reboot instance ${instance_id} on failure"

  # Declare and assign separately: 'local var=$(cmd)' always succeeds and
  # would hide a failing aws call. A failed lookup is folded into the
  # empty-value check below so the user gets the explanatory message.
  # --output text is requested explicitly so the id is a bare string (no JSON
  # quoting) regardless of the caller's configured default output format, and
  # the query is quoted so '[0]' can never be treated as a glob pattern.
  local aws_owner_id
  aws_owner_id=$(aws ec2 describe-instances \
    --instance-ids "${instance_id}" \
    --query 'Reservations[0].OwnerId' \
    --output text) || aws_owner_id=""

  # The CLI renders a null query result as "None" (text) or "null" (json);
  # neither is a usable account id.
  if [[ -z "${aws_owner_id}" || "${aws_owner_id}" == "None" || "${aws_owner_id}" == "null" ]]; then
    echo "Unable to determine AWS account id for ${instance_id}" >&2
    exit 1
  fi

  aws cloudwatch put-metric-alarm \
    --alarm-name "k8s-${instance_id}-statuscheckfailure-reboot" \
    --alarm-description "Reboot ${instance_id} on status check failure" \
    --namespace "AWS/EC2" \
    --dimensions "Name=InstanceId,Value=${instance_id}" \
    --statistic Minimum \
    --metric-name StatusCheckFailed \
    --comparison-operator GreaterThanThreshold \
    --threshold 0 \
    --period 60 \
    --evaluation-periods 3 \
    --alarm-actions "arn:aws:swf:${AWS_REGION}:${aws_owner_id}:action/actions/AWS_EC2.InstanceId.Reboot/1.0" > "${LOG}"

  # TODO: The IAM role EC2ActionsAccess must have been created
  # See e.g. http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/UsingIAM.html
}
|
||||||
|
|
||||||
|
# Deletes the CloudWatch alarms associated with an instance
#
# Lists every alarm whose name starts with k8s-<instance_id>- and deletes
# them one at a time.
#
# Arguments:
#   $1 - id of the EC2 instance whose alarms should be removed
# Vars read:
#   LOG - file that receives the output of each delete-alarms call
function delete-instance-alarms {
  local instance_id=$1

  # Both variables are function-local so they do not leak into (or clobber)
  # the caller's globals. --output text is requested explicitly so the names
  # come back as bare whitespace-separated words regardless of the caller's
  # configured default output format; the query is quoted so '[]' can never be
  # treated as a glob pattern.
  local alarm_names
  alarm_names=$(aws cloudwatch describe-alarms \
    --alarm-name-prefix "k8s-${instance_id}-" \
    --query 'MetricAlarms[].AlarmName' \
    --output text)

  local alarm_name
  # Intentionally unquoted: the list is split on whitespace into one name
  # per iteration.
  for alarm_name in ${alarm_names}; do
    aws cloudwatch delete-alarms --alarm-names "${alarm_name}" > "${LOG}"
  done
}
|
||||||
|
|
||||||
# Creates a new DHCP option set configured correctly for Kubernetes
|
# Creates a new DHCP option set configured correctly for Kubernetes
|
||||||
# Sets DHCP_OPTION_SET_ID
|
# Sets DHCP_OPTION_SET_ID
|
||||||
function create-dhcp-option-set () {
|
function create-dhcp-option-set () {
|
||||||
@ -1010,6 +1048,8 @@ function start-master() {
|
|||||||
attempt=$(($attempt+1))
|
attempt=$(($attempt+1))
|
||||||
sleep 10
|
sleep 10
|
||||||
done
|
done
|
||||||
|
|
||||||
|
reboot-on-failure ${master_id}
|
||||||
}
|
}
|
||||||
|
|
||||||
# Creates an ASG for the minion nodes
|
# Creates an ASG for the minion nodes
|
||||||
@ -1201,6 +1241,13 @@ function kube-down {
|
|||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [[ -z "${KUBE_MASTER_ID-}" ]]; then
|
||||||
|
KUBE_MASTER_ID=$(get_instanceid_from_name ${MASTER_NAME})
|
||||||
|
fi
|
||||||
|
if [[ -n "${KUBE_MASTER_ID-}" ]]; then
|
||||||
|
delete-instance-alarms ${KUBE_MASTER_ID}
|
||||||
|
fi
|
||||||
|
|
||||||
echo "Deleting instances in VPC: ${vpc_id}"
|
echo "Deleting instances in VPC: ${vpc_id}"
|
||||||
instance_ids=$($AWS_CMD describe-instances \
|
instance_ids=$($AWS_CMD describe-instances \
|
||||||
--filters Name=vpc-id,Values=${vpc_id} \
|
--filters Name=vpc-id,Values=${vpc_id} \
|
||||||
|
Loading…
Reference in New Issue
Block a user