Skip to content

Commit 2b5f44f

Browse files
Merge pull request #425 from NHSDigital/dp-force-unlock
Dp force unlock
2 parents f6990ca + aed6aa5 commit 2b5f44f

File tree

5 files changed

+136
-50
lines changed

5 files changed

+136
-50
lines changed

Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,6 @@ endif
4040

4141
ansible: guard-cmd
4242
@account=$(account) poetry run make --no-print-directory -C ansible $(cmd)
43+
44+
remove-stale-locks:
45+
@poetry run python ./scripts/terraform_force_unlock.py

azure/components/cleanup-ecs-pr-proxies-job.yml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,23 +9,22 @@ steps:
99
profile: "apm_ptl"
1010

1111
- bash: |
    # Clear Terraform state locks left behind by earlier failed cleanup runs.
    make remove-stale-locks
    export retain_hours=72
    # Stream the ansible output to the build log while keeping a copy to inspect below.
    ANSIBLE_FORCE_COLOR=yes make -C ansible remove-old-ecs-pr-deploys | tee /tmp/output.txt
    # After a pipeline, $? is the exit status of the LAST command (tee), which
    # hides a failing make. PIPESTATUS[0] is the status of make itself.
    ERROR_CODE=${PIPESTATUS[0]}
    ROLE_TIMEOUT_MSG="The AWS assume role session token is due to expire"
    if grep -q "$ROLE_TIMEOUT_MSG" /tmp/output.txt ; then
      echo "stderr for ansible has the error \"$ROLE_TIMEOUT_MSG\""
      echo "Re-assuming role and re-running step"
      # Consumed by this step's `condition` so the retry instance of the step runs.
      echo "##vso[task.setvariable variable=has_aws_role_timedout;]true"

    elif [ "$ERROR_CODE" -ne 0 ] ; then
      # printf, because bash's builtin echo does not expand "\n" without -e.
      printf '\n\nansible has unhandled error, re-raising\n'
      exit -1
    else
      echo "##vso[task.setvariable variable=has_aws_role_timedout;]false"
    fi

  displayName: "cleanup older pr deploys"
  condition: or(eq( ${{ parameters.retry }}, '0'), eq(variables['has_aws_role_timedout'], 'true'))

poetry.lock

Lines changed: 59 additions & 45 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ gitpython = "^3.1.11"
2525
pytest-xdist = "^1.10.0"
2626
lxml = "^4.6.2"
2727
docker = "^5.0.3"
28+
boto3 = "^1.26.20"
2829

2930
[tool.poetry.dev-dependencies]
3031
ansible-lint = "^4.2.0"

scripts/terraform_force_unlock.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""
2+
Script to force unlock locks in terraform with a given prefix and of a certain age.
3+
4+
Warning: this script should be used with care.
5+
6+
This script is needed because the ecs pr pipeline cleanup does not always properly release the lock when
7+
it fails.
8+
9+
CLI arguments:
10+
--min-age-hr
11+
--key-prefix
12+
--table-name
13+
--profile
14+
15+
Example usage:
16+
17+
python ./terraform_force_unlock.py --min-age-hr=8 --key-prefix=nhsd-apm-management-ptl-terraform/env:/api-deployment:ptl: --table-name=terraform-state-lock --profile=apm_ptl
18+
19+
"""
20+
21+
import json
22+
import boto3
23+
import dateutil
24+
import datetime
25+
import click
26+
27+
28+
@click.command()
@click.option("--min-age-hr", type=int, default=8)
@click.option("--key-prefix", type=str, default="nhsd-apm-management-ptl-terraform/env:/api-deployment:ptl:")
@click.option("--table-name", type=str, default="terraform-state-lock")
@click.option("--profile", type=str, default="apm_ptl")
def main(min_age_hr, key_prefix, table_name, profile):
    """Force-remove Terraform state locks that match a prefix and exceed an age.

    Scans the DynamoDB lock table for items whose ``LockID`` begins with
    ``key_prefix`` and that carry an ``Info`` attribute, then deletes every
    lock whose ``Created`` timestamp (read from the JSON in ``Info``) is more
    than ``min_age_hr`` hours in the past.

    Args:
        min_age_hr: Minimum age, in hours, a lock must exceed to be deleted.
        key_prefix: Prefix that a lock's ``LockID`` must start with.
        table_name: Name of the DynamoDB table holding the locks.
        profile: AWS profile name; must be ``apm_ptl`` or ``apm_prod``.

    Raises:
        ValueError: If ``profile`` is not one of the accepted profiles.
    """
    # A bare ``import dateutil`` is not guaranteed to load the ``parser``
    # submodule; presumably it only resolved before because boto3's
    # dependencies import it first. Import it explicitly.
    import dateutil.parser

    accepted_envs = ["apm_ptl", "apm_prod"]

    if profile not in accepted_envs:
        raise ValueError("Profile must be apm_ptl or apm_prod")

    terraform_lock_table = boto3.Session(profile_name=profile).resource("dynamodb").Table(table_name)

    scan_kwargs = {
        "FilterExpression": "begins_with(#n0, :v0) AND attribute_exists(#n1)",
        "ExpressionAttributeNames": {"#n0": "LockID", "#n1": "Info"},
        "ExpressionAttributeValues": {":v0": key_prefix},
    }

    # A single scan call returns one page only, and the filter is applied
    # after the page is read, so a page may hold few or no matches while more
    # remain. Follow LastEvaluatedKey until the whole table has been read.
    lock_items = []
    while True:
        page = terraform_lock_table.scan(**scan_kwargs)
        lock_items.extend(page.get("Items", []))
        last_evaluated_key = page.get("LastEvaluatedKey")
        if not last_evaluated_key:
            break
        scan_kwargs["ExclusiveStartKey"] = last_evaluated_key

    print(f"Found {len(lock_items)} locks which start with key prefix '{key_prefix}'")

    # Locks created before this instant are considered stale.
    cutoff = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(hours=min_age_hr)

    removed_count = 0
    for lock_item in lock_items:
        lock_item_info = json.loads(lock_item["Info"])
        lock_id = lock_item["LockID"]
        # NOTE(review): assumes "Created" carries a UTC offset; a naive
        # timestamp would make the comparison below raise TypeError.
        created_at = dateutil.parser.parse(lock_item_info["Created"])

        if created_at < cutoff:
            print(f"{lock_id} {created_at=} is more than {min_age_hr} hours old, deleting lock...")
            terraform_lock_table.delete_item(Key={"LockID": lock_id})
            removed_count += 1

        else:
            print(f"{lock_id} {created_at=} is not more than {min_age_hr} hours old, leaving it alone!")

    print(f"Removed {removed_count} locks")
66+
67+
68+
if __name__ == "__main__":
69+
main()

0 commit comments

Comments
 (0)