Skip to content

Commit 23100bc

Browse files
author
Bob Strahan
committed
Fix Pattern-2 ECR image availability race condition during deployment
1 parent 6a1d055 commit 23100bc

File tree

2 files changed

+104
-1
lines changed

2 files changed

+104
-1
lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,17 @@ SPDX-License-Identifier: MIT-0
55

66
## [Unreleased]
77

8+
### Fixed
9+
10+
- **Pattern-2 Intermittent HITLStatusUpdateFunction ECR Access Failure**
11+
- Fixed intermittent "Lambda does not have permission to access the ECR image" (403) errors during Pattern-2 deployment
12+
- **Root Cause**: Race condition where Lambda functions were created before ECR images were fully available and scannable
13+
- **Solution**: Enhanced CodeBuild custom resource to verify ECR image availability before completing, including:
14+
- Verification that all 9 required Lambda images exist in the ECR repository
15+
- Check that image scanning is no longer in progress (the repository has `ScanOnPush: true`)
16+
- Polling mechanism that waits for images to be fully ready before allowing Lambda creation
17+
- **Impact**: Eliminates deployment failures and ensures reliable stack creation on first attempt
18+
819
## [0.4.1]
920

1021
### Changed

src/lambda/start_codebuild/index.py

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,77 @@ def create_or_update(event, _):
6060
raise ValueError(f"invalid resource type: {resource_type}")
6161

6262

63+
def _verify_ecr_images_available(ecr_uri: str, image_version: str) -> bool:
    """Verify all required Pattern-2 Lambda images exist in ECR and are not mid-scan.

    Args:
        ecr_uri: ECR repository URI (e.g.,
            123456789012.dkr.ecr.us-east-1.amazonaws.com/repo-name). The
            repository name may itself contain slashes (e.g.,
            .../namespace/repo-name); everything after the registry host is
            treated as the repository name.
        image_version: Image version tag (e.g., "latest" or "0.3.19")

    Returns:
        True if every required image tag is present and none reports a scan
        status of IN_PROGRESS. False if an image is missing, is still being
        scanned, or any error occurs while checking (errors are logged, never
        propagated to the caller).
    """
    try:
        # ECR repository names may be namespaced ("team/repo"), so keep the whole
        # path after the registry host rather than only the last path segment.
        # If there is no "/" at all, fall back to the raw value.
        _, separator, repository_path = ecr_uri.partition("/")
        repository_name = repository_path if separator else ecr_uri

        # List of all image tags used by Lambda functions in Pattern 2
        required_images = [
            f"ocr-function-{image_version}",
            f"classification-function-{image_version}",
            f"extraction-function-{image_version}",
            f"assessment-function-{image_version}",
            f"processresults-function-{image_version}",
            f"hitl-wait-function-{image_version}",
            f"hitl-status-update-function-{image_version}",
            f"hitl-process-function-{image_version}",
            f"summarization-function-{image_version}",
        ]

        LOGGER.info(
            "verifying %d images in repository %s with version %s",
            len(required_images),
            repository_name,
            image_version,
        )

        # Check each image; bail out on the first one that is not ready so the
        # caller can poll again later.
        for image_tag in required_images:
            try:
                response = ECR_CLIENT.describe_images(
                    repositoryName=repository_name,
                    imageIds=[{"imageTag": image_tag}],
                )

                images = response.get("imageDetails", [])
                if not images:
                    LOGGER.warning("image %s not found in ECR", image_tag)
                    return False

                # Check if image scan is still running. A missing status (no scan
                # information in the response) is treated as ready.
                image = images[0]
                scan_status = image.get("imageScanStatus", {}).get("status")

                if scan_status == "IN_PROGRESS":
                    LOGGER.info("image %s scan still in progress", image_tag)
                    return False

                LOGGER.debug("image %s verified (scan status: %s)", image_tag, scan_status)

            except ClientError as error:
                if error.response["Error"]["Code"] == "ImageNotFoundException":
                    LOGGER.warning("image %s not found: %s", image_tag, error)
                    return False
                LOGGER.error("error checking image %s: %s", image_tag, error)
                # Re-raised here but caught by the outer handler below, which
                # reports the failure as "not available" (False).
                raise

        LOGGER.info("all %d required images are available in ECR", len(required_images))
        return True

    except Exception as exception:  # pylint: disable=broad-except
        # Best-effort check: any unexpected failure is logged and reported as
        # "not yet available" instead of being raised.
        LOGGER.error("error verifying ECR images: %s", exception)
        return False
132+
133+
63134
@HELPER.poll_create
64135
@HELPER.poll_update
65136
def poll_create_or_update(event, _):
@@ -82,7 +153,28 @@ def poll_create_or_update(event, _):
82153
LOGGER.info("build status: [%s]", build_status)
83154

84155
if build_status == "SUCCEEDED":
85-
LOGGER.info("returning True")
156+
# Verify ECR images are available before returning success
157+
# This prevents Lambda functions from being created before images are pullable
158+
env_vars = build.get("environment", {}).get("environmentVariables", [])
159+
160+
# Extract ECR URI and image version from build environment
161+
ecr_uri = next((v["value"] for v in env_vars if v["name"] == "ECR_URI"), None)
162+
image_version = next((v["value"] for v in env_vars if v["name"] == "IMAGE_VERSION"), None)
163+
164+
if ecr_uri and image_version:
165+
LOGGER.info("verifying ECR images are available and pullable...")
166+
if _verify_ecr_images_available(ecr_uri, image_version):
167+
LOGGER.info("ECR image verification complete - returning True")
168+
return True
169+
170+
LOGGER.info("ECR images not yet available - returning None to poll again")
171+
return None
172+
173+
# Fallback: if we can't extract variables, proceed without verification
174+
LOGGER.warning(
175+
"could not extract ECR_URI or IMAGE_VERSION from build environment, "
176+
"proceeding without ECR verification"
177+
)
86178
return True
87179

88180
if build_status == "IN_PROGRESS":

0 commit comments

Comments
 (0)