
Commit ecb804f

Support on-demand content in repair_metadata
closes #849
1 parent b273572 commit ecb804f

4 files changed: +242 -32 lines changed

CHANGES/849.feature

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+Added support for on-demand content to `repair_metadata` endpoint.
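
For reference, a repair is triggered through the Python repository's repair_metadata endpoint. A minimal sketch of calling it over the REST API with requests is below; the host, repository href, and credentials are placeholders, and the exact route is assumed from Pulp's usual repository-action pattern rather than taken from this commit.

import requests

PULP_API = "http://localhost:5001"  # assumed Pulp API address
repo_href = "/pulp/api/v3/repositories/python/python/<pulp_id>/"  # placeholder repository href

# POSTing to the repair_metadata action dispatches the repair task
response = requests.post(
    f"{PULP_API}{repo_href}repair_metadata/",
    auth=("admin", "password"),  # placeholder credentials
)
response.raise_for_status()
print(response.json()["task"])  # href of the dispatched task, poll it for completion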

pulp_python/app/tasks/repair.py

Lines changed: 85 additions & 10 deletions
@@ -1,12 +1,19 @@
 import logging
+from collections import defaultdict
 from gettext import gettext as _
+from itertools import groupby
 from uuid import UUID
 
+from django.db.models import Prefetch
 from django.db.models.query import QuerySet
 from pulp_python.app.models import PythonPackageContent, PythonRepository
-from pulp_python.app.utils import artifact_to_python_content_data
-from pulpcore.plugin.models import ProgressReport
-from pulpcore.plugin.util import get_domain
+from pulp_python.app.utils import (
+    artifact_to_python_content_data,
+    fetch_json_release_metadata,
+    parse_metadata,
+)
+from pulpcore.plugin.models import ContentArtifact, ProgressReport
+from pulpcore.plugin.util import get_domain, get_prn
 
 log = logging.getLogger(__name__)
 
@@ -34,11 +41,16 @@ def repair(repository_pk: UUID) -> None:
     content_set = repository.latest_version().content.values_list("pk", flat=True)
     content = PythonPackageContent.objects.filter(pk__in=content_set)
 
-    num_repaired = repair_metadata(content)
-    log.info(_("{} packages' metadata repaired.").format(num_repaired))
+    num_repaired, pkgs_not_repaired = repair_metadata(content)
+    log.info(
+        _(
+            "{} packages' metadata repaired. Not repaired packages due to either "
+            "inaccessible URL or mismatched sha256: {}."
+        ).format(num_repaired, pkgs_not_repaired)
+    )
 
 
-def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
+def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[str]]:
     """
     Repairs metadata for a queryset of PythonPackageContent objects
     and updates the progress report.
@@ -47,24 +59,38 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
         content (QuerySet[PythonPackageContent]): The queryset of items to repair.
 
     Returns:
-        int: The number of packages that were repaired.
+        tuple[int, set[str]]: A tuple containing:
+            - The number of packages that were repaired.
+            - A set of packages' PRNs that were not repaired.
     """
-    # TODO: Add on_demand content repair
     immediate_content = (
         content.filter(contentartifact__artifact__isnull=False)
         .distinct()
         .prefetch_related("_artifacts")
     )
+    on_demand_content = (
+        content.filter(contentartifact__artifact__isnull=True)
+        .distinct()
+        .prefetch_related(
+            Prefetch(
+                "contentartifact_set",
+                queryset=ContentArtifact.objects.prefetch_related("remoteartifact_set"),
+            )
+        )
+        .order_by("name", "version")
+    )
     domain = get_domain()
 
     batch = []
     set_of_update_fields = set()
     total_repaired = 0
+    # Keep track of on-demand packages that were not repaired
+    pkgs_not_repaired = set()
 
     progress_report = ProgressReport(
         message="Repairing packages' metadata",
         code="repair.metadata",
-        total=immediate_content.count(),
+        total=content.count(),
     )
     progress_report.save()
     with progress_report:
@@ -78,11 +104,60 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
                 package, new_data, batch, set_of_update_fields
             )
 
+        # For on-demand content, we expect that:
+        # 1. PythonPackageContent always has correct name and version
+        # 2. RemoteArtifact always has correct sha256
+        for (name, version), group in groupby(
+            on_demand_content.iterator(chunk_size=BULK_SIZE),
+            key=lambda x: (x.name, x.version),
+        ):
+            group_set = set(group)
+            grouped_by_url = defaultdict(list)
+
+            for package in group_set:
+                for ra in package.contentartifact_set.get().remoteartifact_set.all():
+                    grouped_by_url[ra.remote.url].append((package, ra))
+
+            # Prioritize the URL that can serve the most packages
+            for url, pkg_ra_pairs in sorted(
+                grouped_by_url.items(), key=lambda x: len(x[1]), reverse=True
+            ):
+                if not group_set:
+                    break  # No packages left to repair, move onto the next group
+                remotes = set([pkg_ra[1].remote for pkg_ra in pkg_ra_pairs])
+                try:
+                    json_data = fetch_json_release_metadata(name, version, remotes)
+                except Exception:
+                    continue
+
+                for package, ra in pkg_ra_pairs:
+                    if package not in group_set:
+                        continue  # Package was already repaired
+                    # Extract data only for the specific distribution being checked
+                    dist_data = None
+                    for dist in json_data["urls"]:
+                        if ra.sha256 == dist["digests"]["sha256"]:
+                            dist_data = dist
+                            break
+                    if not dist_data:
+                        continue
+
+                    new_data = parse_metadata(json_data["info"], version, dist_data)
+                    new_data.pop("url")  # url belongs to RemoteArtifact
+                    total_repaired += update_package_if_needed(
+                        package, new_data, batch, set_of_update_fields
+                    )
+                    group_set.remove(package)
+                    progress_report.increment()
+            # Store and track the unrepaired packages after all URLs are processed
+            pkgs_not_repaired.update([get_prn(p) for p in group_set])
+            progress_report.increase_by(len(group_set))
+
         if batch:
             total_repaired += len(batch)
             PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
 
-    return total_repaired
+    return total_repaired, pkgs_not_repaired
 
 
 def update_package_if_needed(
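
Below is a standalone sketch of the "prioritize the URL that can serve the most packages" idea from the on-demand loop above, using plain tuples instead of Django models; the package digests and remote URLs are illustrative only.

from collections import defaultdict

# (sha256, [urls of remotes that can serve it]) stand-ins for one (name, version) group
group = [
    ("aaa111", ["https://pypi.org/", "https://mirror.example/"]),
    ("bbb222", ["https://mirror.example/"]),
    ("ccc333", ["https://mirror.example/"]),
]

remaining = {sha for sha, _ in group}
grouped_by_url = defaultdict(list)
for sha, urls in group:
    for url in urls:
        grouped_by_url[url].append(sha)

# Visit URLs in order of how many distributions they can repair, so most
# packages are fixed with the fewest release-metadata fetches.
for url, shas in sorted(grouped_by_url.items(), key=lambda kv: len(kv[1]), reverse=True):
    if not remaining:
        break
    to_repair = [sha for sha in shas if sha in remaining]
    print(f"fetch release JSON once from {url}, repairing {to_repair}")
    remaining.difference_update(to_repair)

print(f"not repaired: {remaining or 'none'}")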

pulp_python/app/utils.py

Lines changed: 32 additions & 0 deletions
@@ -9,6 +9,7 @@
 from packaging.utils import canonicalize_name
 from packaging.requirements import Requirement
 from packaging.version import parse, InvalidVersion
+from pulpcore.plugin.models import Remote
 
 
 PYPI_LAST_SERIAL = "X-PYPI-LAST-SERIAL"
@@ -189,6 +190,37 @@ def artifact_to_python_content_data(filename, artifact, domain=None):
     return data
 
 
+def fetch_json_release_metadata(name: str, version: str, remotes: set[Remote]) -> dict:
+    """
+    Fetches metadata for a specific release from PyPI's JSON API. A release can contain
+    multiple distributions. See https://docs.pypi.org/api/json/#get-a-release for more details.
+    All remotes should have the same URL.
+
+    Returns:
+        Dict containing "info", "last_serial", "urls", and "vulnerabilities" keys.
+    Raises:
+        Exception if fetching from all remote URLs fails.
+    """
+    remote = next(iter(remotes))
+    url = remote.get_remote_artifact_url(f"pypi/{name}/{version}/json")
+
+    result = None
+    for remote in remotes:
+        downloader = remote.get_downloader(url=url, max_retries=1)
+        try:
+            result = downloader.fetch()
+            break
+        except Exception:
+            continue
+
+    if result:
+        with open(result.path, "r") as file:
+            json_data = json.load(file)
+        return json_data
+    else:
+        raise Exception(f"Failed to fetch {url} from any remote.")
+
+
 def python_content_to_json(base_path, content_query, version=None, domain=None):
     """
     Converts a QuerySet of PythonPackageContent into the PyPi JSON format
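
To show the shape of the data the new helper consumes, here is a quick illustration that hits PyPI's release JSON endpoint directly with urllib instead of going through Pulp's Remote and downloader machinery; the project/version pair is just an example of any published release.

import json
from urllib.request import urlopen

name, version = "requests", "2.31.0"  # example of a published release
with urlopen(f"https://pypi.org/pypi/{name}/{version}/json") as response:
    release = json.load(response)

print(sorted(release))            # ['info', 'last_serial', 'urls', 'vulnerabilities']
for dist in release["urls"]:      # one entry per distribution (sdist or wheel)
    print(dist["filename"], dist["digests"]["sha256"])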
