127 lines
5 KiB
YAML
127 lines
5 KiB
YAML
|
# ============================================================
|
||
|
# This file attempts to obtain a global lock (for a given
|
||
|
# region / account combination).
|
||
|
#
|
||
|
# This makes one attempt to get the lock and will set the
|
||
|
# won_lock variable to True or False to indicate whether
|
||
|
# or not we got the lock.
|
||
|
#
|
||
|
# It's expected that this will be executed in a retry loop
|
||
|
# so that if we don't get the lock we delay then try again.
|
||
|
#
|
||
|
# This should only be used in a block with cleanup-lock.yaml
|
||
|
# included in the always clause to ensure the lock is released.
|
||
|
#
|
||
|
# There are several variables that control the locking behaviour:
|
||
|
# * lock_timeout_seconds
|
||
|
# How old a lock must be before it's assumed to be an expired
|
||
|
# lock that was not cleaned up by the owner. Any locks older
|
||
|
# than this will not prevent a lock being obtained and will
|
||
|
# be deleted when a new process obtains the lock.
|
||
|
# * lock_log_group_prefix
|
||
|
# The log_group prefix that represents the lock being obtained.
|
||
|
# This must be the same across all processes trying to obtain
|
||
|
# the lock.
|
||
|
# * lock_process_id
|
||
|
# A unique identifier of this process. Each process that might
|
||
|
# attempt to obtain the lock must have a different identifier.
|
||
|
# This defaults to the resource_prefix which is generally
|
||
|
# appropriate.
|
||
|
# * max_obtain_lock_attempts
|
||
|
# How many attempts to make to get the lock before giving up
|
||
|
# NB: This is actually done in main.yaml
|
||
|
# * obtain_lock_delay_seconds
|
||
|
# How long to delay after failing to get the lock before
|
||
|
# trying again.
|
||
|
# NB: This is actually done in obtain-lock-wrapper.yaml
|
||
|
#
|
||
|
# The locking here is based around creating cloudwatch log groups.
|
||
|
# This resource was chosen because:
|
||
|
# A) it's free
|
||
|
# B) we have a built in grouping concept because of the hierarchy
|
||
|
# that allows us to easily group attempts for the same lock
|
||
|
# C) the creation time is tracked and returned which gives us
|
||
|
# a mechanism for deterministically picking a winner
|
||
|
#
|
||
|
# Each lock is represented by a log group prefix. Each attempt
|
||
|
# to obtain the lock is a log group of the lock_process_id below
|
||
|
# that prefix.
|
||
|
#
|
||
|
# The winning lock is the one with the earliest creation time.
|
||
|
#
|
||
|
# To prevent a hanging lock from permanently hanging the build
|
||
|
# lock attempts older than the lock timeout are ignored and
|
||
|
# cleaned up by the next process to win the lock.
|
||
|
# ============================================================
|
||
|
|
||
|
# Build the AWS credential/region mapping once. The YAML anchor
# (&aws_connection_info) lets the later tasks in this file merge the same
# keys into their module arguments with `<<: *aws_connection_info`.
- name: set up aws connection info
  set_fact:
    aws_connection_info: &aws_connection_info
      aws_access_key: "{{ aws_access_key }}"
      aws_secret_key: "{{ aws_secret_key }}"
      security_token: "{{ security_token }}"
      region: "{{ aws_region }}"
  # Keep the credentials out of the task output and logs.
  no_log: true
|
||
|
|
||
|
# Name of the log group that represents THIS process's attempt:
# "<lock_log_group_prefix>/<lock_process_id>", where the process id
# falls back to resource_prefix when lock_process_id is not defined.
- name: Set lock_attempt_log_group_name
  set_fact:
    lock_attempt_log_group_name: "{{ lock_log_group_prefix }}/{{ lock_process_id|default(resource_prefix) }}"
|
||
|
|
||
|
# Note the overwrite below to ensure that the creation time
# is updated. This is important as we calculate expiry relative
# to the attempt creation.
#
# Because of this it's important that we delete the attempt
# if we don't get the lock. Otherwise we can get a deadlock
# where the stale attempt from one process wins, but then
# because that process updates the creation date it doesn't
# consider itself to have won.
- name: Create Lock Attempt Log Group
  cloudwatchlogs_log_group:
    log_group_name: "{{ lock_attempt_log_group_name }}"
    state: present
    overwrite: true
    <<: *aws_connection_info
  # The registered creation_time is the reference point used when the
  # expired-lock timestamp is calculated further down.
  register: lock_attempt_log_group_result
|
||
|
|
||
|
# List every log group under the lock prefix, i.e. every attempt
# (ours included) that currently exists for this lock.
# NOTE(review): the *_facts module name is believed to be deprecated in
# favour of cloudwatchlogs_log_group_info in newer Ansible releases -
# confirm against the Ansible version this runs on before renaming.
- name: Get Lock Attempt Lock Groups
  register: lock_attempt_log_groups
  cloudwatchlogs_log_group_facts:
    <<: *aws_connection_info
    log_group_name: "{{ lock_log_group_prefix }}/"
|
||
|
|
||
|
# Attempts created before this timestamp are treated as expired.
# The timeout is given in seconds; it is scaled by 1000 because
# creation_time appears to be in epoch milliseconds (TODO confirm against
# the module's return documentation).
# The `| int` cast guards against lock_timeout_seconds arriving as a string
# (e.g. from `-e key=value` extra vars): Jinja2's `*` would otherwise repeat
# the string 1000 times and the subtraction would fail.
- name: Calculate Expired Lock Attempt Timestamp
  set_fact:
    expired_lock_timestamp: "{{ lock_attempt_log_group_result.creation_time - ((lock_timeout_seconds | int) * 1000) }}"
|
||
|
|
||
|
# Partition the listed attempts around expired_lock_timestamp:
#   active  -> creation_time >= threshold (these compete for the lock)
#   expired -> creation_time <  threshold (ignored, cleaned up by the winner)
# The threshold is cast with |int before it is compared with creation_time.
- name: Get Expired and Active Lock Attempts
  set_fact:
    active_lock_attempts: "{{ lock_attempt_log_groups.log_groups|selectattr('creation_time', 'ge', expired_lock_timestamp|int)|list }}"
    expired_lock_attempts: "{{ lock_attempt_log_groups.log_groups|selectattr('creation_time', 'lt', expired_lock_timestamp|int)|list }}"
|
||
|
|
||
|
# The winner is the active attempt with the earliest creation_time:
# sort ascending by creation_time and take the first entry.
- name: Pick Winning Lock Attempt
  set_fact:
    winning_lock_attempt: "{{ active_lock_attempts|sort(attribute='creation_time')|first }}"
|
||
|
|
||
|
# We hold the lock only if the winning attempt is our own log group.
# won_lock is the result consumed by the tasks that follow (and, per the
# header of this file, by the retry loop that includes it).
- name: Determine if Won Lock
  set_fact:
    won_lock: "{{ winning_lock_attempt.log_group_name == lock_attempt_log_group_name }}"
|
||
|
|
||
|
# Losing path: delete our own attempt straight away so that a stale
# attempt from this process cannot block another process from getting
# the lock. The comment above "Create Lock Attempt Log Group" describes
# the deadlock this avoids.
- name: Remove Failed Lock Attempt Log Group
  when: "not won_lock|bool"
  cloudwatchlogs_log_group:
    <<: *aws_connection_info
    log_group_name: "{{ lock_attempt_log_group_name }}"
    state: absent
|
||
|
|
||
|
# Winning path: the lock holder cleans up every expired attempt found
# earlier, one log group per loop iteration. Skipped when we lost the lock.
- name: Delete Expired Lock Attempts
  loop: "{{ expired_lock_attempts }}"
  when: "won_lock|bool"
  cloudwatchlogs_log_group:
    <<: *aws_connection_info
    log_group_name: "{{ item.log_group_name }}"
    state: absent
|