diff options
author | Jenkins <jenkins@review.openstack.org> | 2017-02-07 01:15:07 +0000 |
---|---|---|
committer | Gerrit Code Review <review@openstack.org> | 2017-02-07 01:15:07 +0000 |
commit | 570574906087be436d86efc5af25e61b41189a89 (patch) | |
tree | f9f2f7f64c114a3d4b934aa3c90b68db3cff6b13 /hot | |
parent | 824b3567ed1a0dbc328ce19b9e4d492f1f407c30 (diff) | |
parent | c975f6b1797f1e2b51a9befe3534c38980a19eb2 (diff) | |
download | heat-templates-570574906087be436d86efc5af25e61b41189a89.tar.gz |
Merge "Add template for autohealing servers"
Diffstat (limited to 'hot')
-rw-r--r-- | hot/autohealing/autohealing_group.yaml | 60 | ||||
-rw-r--r-- | hot/autohealing/autohealing_server.yaml | 170 |
2 files changed, 230 insertions, 0 deletions
diff --git a/hot/autohealing/autohealing_group.yaml b/hot/autohealing/autohealing_group.yaml
new file mode 100644
index 0000000..069a7e8
--- /dev/null
+++ b/hot/autohealing/autohealing_group.yaml
@@ -0,0 +1,60 @@
+heat_template_version: 2017-02-24
+
+description: >
+  A stack containing an Autoscaling Group whose members automatically heal
+  themselves if they are stopped, deleted, or go into an error state, using an
+  Aodh alarm delivered to a Zaqar queue that triggers a Mistral workflow to
+  replace the stopped server. Note that this requires event alarms to be
+  enabled in Aodh, following the instructions at
+  http://docs.openstack.org/developer/aodh/event-alarm.html - specifically by
+  adding the publisher "notifier://?topic=alarm.all" in
+  /etc/ceilometer/event_pipeline.yaml.
+
+parameters:
+  flavor:
+    type: string
+    description: Flavor for the instances to be created
+    default: cirros256
+    constraints:
+      - custom_constraint: nova.flavor
+        description: Must be a flavor known to Nova
+  image:
+    type: string
+    description: >
+      Name or ID of the image to use for the instances.
+    default: cirros-0.3.4-x86_64-uec
+    constraints:
+      - custom_constraint: glance.image
+        description: Must identify an image known to Glance
+  network:
+    type: string
+    description: The network for the VM
+    default: private
+  port:
+    type: number
+    description: The port to reply to requests on
+    default: 8080
+
+resources:
+  servers:
+    type: OS::Heat::AutoScalingGroup
+    properties:
+      resource:
+        type: autohealing_server.yaml
+        properties:
+          flavor: {get_param: flavor}
+          image: {get_param: image}
+          network: {get_param: network}
+          port: {get_param: port}
+          root_stack_id: {get_param: "OS::stack_id"}
+      min_size: 1
+      desired_capacity: 2
+      max_size: 4
+
+outputs:
+  server_ids:
+    description: A list of the current server UUIDs
+    value: {get_attr: [servers, refs]}
+  ip_addresses:
+    description: A list of server IP addresses
+    value: {get_attr: [servers, outputs_list, first_address]}
diff --git a/hot/autohealing/autohealing_server.yaml b/hot/autohealing/autohealing_server.yaml
new file mode 100644
index 0000000..732451f
--- /dev/null
+++ b/hot/autohealing/autohealing_server.yaml
@@ -0,0 +1,170 @@
+heat_template_version: 2017-02-24
+
+description: >
+  A stack containing a server that is automatically replaced if it is stopped,
+  deleted, or goes into an error state, using an Aodh alarm delivered to a
+  Zaqar queue that triggers a Mistral workflow. This may either be used
+  standalone, or as the scaled unit of a scaling group. When using this from
+  inside another template, the 'root_stack_id' parameter should be passed to
+  indicate at which stack the stack update should commence after marking the
+  server as failed. This should be the root-level stack, to ensure that any
+  other resources depending on outputs from this stack are also updated. Note
+  that this requires event alarms to be enabled in Aodh, following the
+  instructions at http://docs.openstack.org/developer/aodh/event-alarm.html -
+  specifically, by adding the publisher "notifier://?topic=alarm.all" in
+  /etc/ceilometer/event_pipeline.yaml.
+
+parameters:
+  flavor:
+    type: string
+    description: Flavor for the instances to be created
+    default: cirros256
+    constraints:
+      - custom_constraint: nova.flavor
+        description: Must be a flavor known to Nova
+  image:
+    type: string
+    description: >
+      Name or ID of the image to use for the instances.
+    default: cirros-0.3.4-x86_64-uec
+    constraints:
+      - custom_constraint: glance.image
+        description: Must identify an image known to Glance
+  network:
+    type: string
+    description: The network for the VM
+    default: private
+  port:
+    type: number
+    description: The port to reply to requests on
+    default: 8080
+  root_stack_id:
+    type: string
+    default: ""
+
+conditions:
+  is_standalone: {equals: [{get_param: root_stack_id}, ""]}
+
+resources:
+  server:
+    type: OS::Nova::Server
+    properties:
+      image: {get_param: image}
+      flavor: {get_param: flavor}
+      networks:
+        - network: {get_param: network}
+      user_data_format: RAW
+      user_data:
+        str_replace:
+          template: |
+            #! /bin/sh -v
+            Body=$(hostname)
+            Response="HTTP/1.1 200 OK\r\nContent-Length: ${#Body}\r\n\r\n$Body"
+            while true ; do echo -e $Response | nc -llp %PORT%; done
+          params:
+            "%PORT%": {get_param: port}
+
+  alarm_queue:
+    type: OS::Zaqar::Queue
+
+  stop_event_alarm:
+    type: OS::Aodh::EventAlarm
+    properties:
+      event_type: compute.instance.update
+      query:
+        - field: traits.instance_id
+          value: {get_resource: server}
+          op: eq
+        - field: traits.state
+          value: stopped
+          op: eq
+      alarm_queues:
+        - {get_resource: alarm_queue}
+
+  error_event_alarm:
+    type: OS::Aodh::EventAlarm
+    properties:
+      event_type: compute.instance.update
+      query:
+        - field: traits.instance_id
+          value: {get_resource: server}
+          op: eq
+        - field: traits.state
+          value: error
+          op: eq
+      alarm_queues:
+        - {get_resource: alarm_queue}
+
+  deleted_event_alarm:
+    type: OS::Aodh::EventAlarm
+    properties:
+      event_type: compute.instance.delete.start
+      query:
+        - field: traits.instance_id
+          value: {get_resource: server}
+          op: eq
+      alarm_queues:
+        - {get_resource: alarm_queue}
+
+  # The Aodh event alarm does not take effect immediately; it may take up to
+  # 60s (by default) for the event_alarm_cache_ttl to expire and the tenant's
+  # alarm data to be loaded. This resource ensures the stack is not completed
+  # until the alarm is active. See https://bugs.launchpad.net/aodh/+bug/1651273
+  alarm_cache_wait:
+    type: OS::Heat::TestResource
+    properties:
+      action_wait_secs:
+        create: 60
+        update: 60
+      value:
+        list_join:
+          - ''
+          - - {get_attr: [stop_event_alarm, show]}
+            - {get_attr: [error_event_alarm, show]}
+            - {get_attr: [deleted_event_alarm, show]}
+
+  alarm_subscription:
+    type: OS::Zaqar::MistralTrigger
+    properties:
+      queue_name: {get_resource: alarm_queue}
+      workflow_id: {get_resource: autoheal}
+      input:
+        stack_id: {get_param: "OS::stack_id"}
+        root_stack_id:
+          if:
+            - is_standalone
+            - {get_param: "OS::stack_id"}
+            - {get_param: "root_stack_id"}
+
+  autoheal:
+    type: OS::Mistral::Workflow
+    properties:
+      description: >
+        Mark a server as unhealthy and commence a stack update to replace it.
+      input:
+        stack_id:
+        root_stack_id:
+      type: direct
+      tasks:
+        - name: resources_mark_unhealthy
+          action:
+            list_join:
+              - ' '
+              - - heat.resources_mark_unhealthy
+                - stack_id=<% $.stack_id %>
+                - resource_name=<% env().notification.body.reason_data.event.traits.where($[0] = 'instance_id').select($[2]).first() %>
+                - mark_unhealthy=true
+                - resource_status_reason='Marked by alarm'
+          on_success:
+            - stacks_update
+        - name: stacks_update
+          action: heat.stacks_update stack_id=<% $.root_stack_id %> existing=true
+
+outputs:
+  OS::stack_id:
+    description: The server UUID
+    value: {get_resource: server}
+    condition: {not: is_standalone}
+  first_address:
+    description: The server IP address
+    value: {get_attr: [server, first_address]}