From d91ad0bb6fd5d489727e934482fe0d1d0ca6cf88 Mon Sep 17 00:00:00 2001
From: Justin Santa Barbara
Date: Sat, 9 Jan 2016 14:08:38 -0500
Subject: [PATCH] AWS: Create a cloudwatch alarm to reboot the master on failure

This is an easier alternative to keep a master running than trying to
dynamically find & attach master volumes.

To actually work, it requires that users create the EC2ActionsAccess IAM
role as required by CloudWatch, see e.g.
http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/UsingIAM.html
---
 cluster/aws/util.sh | 73 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/cluster/aws/util.sh b/cluster/aws/util.sh
index b34f1bacd4a..8e1bb5b308f 100755
--- a/cluster/aws/util.sh
+++ b/cluster/aws/util.sh
@@ -435,6 +435,70 @@ function ensure-master-pd {
   fi
 }
 
+# Configures a CloudWatch alarm to reboot the instance on failure
+#
+# Arguments:
+#   $1 - id of the EC2 instance to watch
+# Vars read:
+#   AWS_REGION - used to build the ARN of the reboot action
+#   LOG        - file that receives the output of the aws call
+function reboot-on-failure {
+  local instance_id=$1
+
+  echo "Creating Cloudwatch alarm to reboot instance ${instance_id} on failure"
+
+  # Declare separately from the assignment: 'local var=$(cmd)' always succeeds
+  # and would hide a failing aws call. A failed lookup is turned into an empty
+  # id so that it reaches the error message below.
+  local aws_owner_id
+  aws_owner_id=$(aws ec2 describe-instances \
+    --instance-ids "${instance_id}" \
+    --query 'Reservations[0].OwnerId' \
+    --output text) || aws_owner_id=""
+  # With text output the CLI prints a missing value as the literal "None"
+  if [[ -z "${aws_owner_id}" || "${aws_owner_id}" == "None" ]]; then
+    echo "Unable to determine AWS account id for ${instance_id}" >&2
+    exit 1
+  fi
+
+  aws cloudwatch put-metric-alarm \
+    --alarm-name "k8s-${instance_id}-statuscheckfailure-reboot" \
+    --alarm-description "Reboot ${instance_id} on status check failure" \
+    --namespace "AWS/EC2" \
+    --dimensions "Name=InstanceId,Value=${instance_id}" \
+    --statistic Minimum \
+    --metric-name StatusCheckFailed \
+    --comparison-operator GreaterThanThreshold \
+    --threshold 0 \
+    --period 60 \
+    --evaluation-periods 3 \
+    --alarm-actions "arn:aws:swf:${AWS_REGION}:${aws_owner_id}:action/actions/AWS_EC2.InstanceId.Reboot/1.0" > "${LOG}"
+
+  # TODO: The IAM role EC2ActionsAccess must have been created
+  # See e.g. http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/UsingIAM.html
+}
+
+# Deletes the CloudWatch alarms whose name starts with k8s-INSTANCE_ID-
+#
+# Arguments:
+#   $1 - id of the EC2 instance whose alarms are removed
+# Vars read:
+#   LOG - file that receives the output of the aws calls
+function delete-instance-alarms {
+  local instance_id=$1
+
+  # Keep the list and the loop variable out of the caller's scope
+  local alarm_names alarm_name
+  alarm_names=$(aws cloudwatch describe-alarms \
+    --alarm-name-prefix "k8s-${instance_id}-" \
+    --query 'MetricAlarms[].AlarmName' \
+    --output text)
+  # Deliberately unquoted: text output is a whitespace separated list of names
+  for alarm_name in ${alarm_names}; do
+    aws cloudwatch delete-alarms --alarm-names "${alarm_name}" > "${LOG}"
+  done
+}
+
 # Creates a new DHCP option set configured correctly for Kubernetes
 # Sets DHCP_OPTION_SET_ID
 function create-dhcp-option-set () {
@@ -1010,6 +1074,8 @@ function start-master() {
     attempt=$(($attempt+1))
     sleep 10
   done
+
+  reboot-on-failure "${master_id}"
 }
 
 # Creates an ASG for the minion nodes
@@ -1201,6 +1267,13 @@ function kube-down {
       done
     fi
 
+    if [[ -z "${KUBE_MASTER_ID-}" ]]; then
+      KUBE_MASTER_ID=$(get_instanceid_from_name "${MASTER_NAME}")
+    fi
+    if [[ -n "${KUBE_MASTER_ID-}" ]]; then
+      delete-instance-alarms "${KUBE_MASTER_ID}"
+    fi
+
     echo "Deleting instances in VPC: ${vpc_id}"
     instance_ids=$($AWS_CMD describe-instances \
                             --filters Name=vpc-id,Values=${vpc_id} \