Skip to content

Commit 1cdff86

Browse files
committed
Support on-demand content in repair_metadata
closes #849
1 parent 449b71c commit 1cdff86

File tree

4 files changed

+236
-30
lines changed

4 files changed

+236
-30
lines changed

CHANGES/849.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added support for on-demand content to `repair_metadata` endpoint.

pulp_python/app/tasks/repair.py

Lines changed: 79 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,19 @@
11
import logging
22
from uuid import UUID
3+
from collections import defaultdict
34
from gettext import gettext as _
5+
from itertools import groupby
46

57
from django.db.models.query import QuerySet
68
from pulpcore.plugin.models import ProgressReport
79
from pulpcore.plugin.util import get_domain
810

911
from pulp_python.app.models import PythonPackageContent, PythonRepository
10-
from pulp_python.app.utils import artifact_to_python_content_data
12+
from pulp_python.app.utils import (
13+
artifact_to_python_content_data,
14+
fetch_json_release_metadata,
15+
parse_metadata,
16+
)
1117

1218
log = logging.getLogger(__name__)
1319

@@ -35,11 +41,18 @@ def repair(repository_pk: UUID) -> None:
3541
content_set = repository.latest_version().content.values_list("pk", flat=True)
3642
content = PythonPackageContent.objects.filter(pk__in=content_set)
3743

38-
num_repaired = repair_metadata(content)
39-
log.info(_("{} packages' metadata repaired.").format(num_repaired))
44+
num_repaired, pkgs_not_repaired = repair_metadata(content)
45+
log.info(
46+
_(
47+
"{} packages' metadata repaired. Not repaired packages due to either "
48+
"inaccessible URL or mismatched sha256: {}."
49+
).format(num_repaired, pkgs_not_repaired)
50+
)
4051

4152

42-
def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
53+
def repair_metadata(
54+
content: QuerySet[PythonPackageContent],
55+
) -> tuple[int, set[PythonPackageContent]]:
4356
"""
4457
Repairs metadata for a queryset of PythonPackageContent objects
4558
and updates the progress report.
@@ -48,24 +61,33 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
4861
content (QuerySet[PythonPackageContent]): The queryset of items to repair.
4962
5063
Returns:
51-
int: The number of packages that were repaired.
64+
tuple[int, set[PythonPackageContent]]: A tuple containing:
65+
- The number of packages that were repaired.
66+
- A set of packages that were not repaired.
5267
"""
53-
# TODO: Add on_demand content repair
5468
immediate_content = (
5569
content.filter(contentartifact__artifact__isnull=False)
5670
.distinct()
5771
.prefetch_related("_artifacts")
5872
)
73+
on_demand_content = (
74+
content.filter(contentartifact__artifact__isnull=True)
75+
.distinct()
76+
.prefetch_related("contentartifact_set__remoteartifact_set")
77+
.order_by("name", "version")
78+
)
5979
domain = get_domain()
6080

6181
batch = []
6282
set_of_update_fields = set()
6383
total_repaired = 0
84+
# Keep track of on-demand packages that were not repaired
85+
pkgs_not_repaired = set()
6486

6587
progress_report = ProgressReport(
6688
message="Repairing packages' metadata",
6789
code="repair.metadata",
68-
total=immediate_content.count(),
90+
total=content.count(),
6991
)
7092
progress_report.save()
7193
with progress_report:
@@ -79,11 +101,60 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
79101
package, new_data, batch, set_of_update_fields, total_repaired
80102
)
81103

104+
# For on-demand content, we expect that:
105+
# 1. PythonPackageContent always has correct name and version
106+
# 2. RemoteArtifact always has correct sha256
107+
for (name, version), group in groupby(
108+
on_demand_content.iterator(chunk_size=BULK_SIZE),
109+
key=lambda x: (x.name, x.version),
110+
):
111+
group_set = set(group)
112+
grouped_by_url = defaultdict(list)
113+
114+
for package in group_set:
115+
for ra in package.contentartifact_set.get().remoteartifact_set.all():
116+
grouped_by_url[ra.remote.url].append((package, ra))
117+
118+
# Prioritize the URL that can serve the most packages
119+
for url, pkg_ra_pairs in sorted(
120+
grouped_by_url.items(), key=lambda x: len(x[1]), reverse=True
121+
):
122+
if not group_set:
123+
break # No packages left to repair, move onto the next group
124+
remotes = set([pkg_ra[1].remote for pkg_ra in pkg_ra_pairs])
125+
try:
126+
json_data = fetch_json_release_metadata(name, version, remotes)
127+
except Exception:
128+
continue
129+
130+
for package, ra in pkg_ra_pairs:
131+
if package not in group_set:
132+
continue # Package was already repaired
133+
# Extract data only for the specific distribution being checked
134+
dist_data = None
135+
for dist in json_data["urls"]:
136+
if ra.sha256 == dist["digests"]["sha256"]:
137+
dist_data = dist
138+
break
139+
if not dist_data:
140+
continue
141+
142+
new_data = parse_metadata(json_data["info"], version, dist_data)
143+
new_data.pop("url") # url belongs to RemoteArtifact
144+
total_repaired = update_package_if_needed(
145+
package, new_data, batch, set_of_update_fields, total_repaired
146+
)
147+
group_set.remove(package)
148+
progress_report.increment()
149+
# Store and track the unrepaired packages after all URLs are processed
150+
pkgs_not_repaired.update(group_set)
151+
progress_report.increase_by(len(group_set))
152+
82153
if batch:
83154
total_repaired += len(batch)
84155
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
85156

86-
return total_repaired
157+
return total_repaired, pkgs_not_repaired
87158

88159

89160
def update_package_if_needed(

pulp_python/app/utils.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from packaging.utils import canonicalize_name
1010
from packaging.requirements import Requirement
1111
from packaging.version import parse, InvalidVersion
12+
from pulpcore.plugin.models import Remote
1213

1314

1415
PYPI_LAST_SERIAL = "X-PYPI-LAST-SERIAL"
@@ -189,6 +190,37 @@ def artifact_to_python_content_data(filename, artifact, domain=None):
189190
return data
190191

191192

193+
def fetch_json_release_metadata(name: str, version: str, remotes: set[Remote]) -> dict:
194+
"""
195+
Fetches metadata for a specific release from PyPI's JSON API. A release can contain
196+
multiple distributions. See https://docs.pypi.org/api/json/#get-a-release for more details.
197+
All remotes should have the same URL.
198+
199+
Returns:
200+
Dict containing "info", "last_serial", "urls", and "vulnerabilities" keys.
201+
Raises:
202+
Exception if fetching from all remote URLs fails.
203+
"""
204+
remote = next(iter(remotes))
205+
url = remote.get_remote_artifact_url(f"pypi/{name}/{version}/json")
206+
207+
result = None
208+
for remote in remotes:
209+
downloader = remote.get_downloader(url=url, max_retries=1)
210+
try:
211+
result = downloader.fetch()
212+
break
213+
except Exception:
214+
continue
215+
216+
if result:
217+
with open(result.path, "r") as file:
218+
json_data = json.load(file)
219+
return json_data
220+
else:
221+
raise Exception(f"Failed to fetch {url} from any remote.")
222+
223+
192224
def python_content_to_json(base_path, content_query, version=None, domain=None):
193225
"""
194226
Converts a QuerySet of PythonPackageContent into the PyPi JSON format

pulp_python/tests/functional/api/test_repair.py

Lines changed: 124 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,16 @@
1010

1111
@pytest.fixture
1212
def create_content_direct(python_bindings):
13-
def _create(artifact_filename, filename, content_data):
13+
def _create(artifact_filename, content_data):
1414
commands = (
1515
"from pulpcore.plugin.models import Artifact, ContentArtifact; "
1616
"from pulpcore.plugin.util import get_url; "
1717
"from pulp_python.app.models import PythonPackageContent; "
1818
f"a = Artifact.init_and_validate('{artifact_filename}'); "
1919
"a.save(); "
20-
f"c = PythonPackageContent(sha256=a.sha256, filename={filename!r}, **{content_data!r}); " # noqa: E501
20+
f"c = PythonPackageContent(sha256=a.sha256, **{content_data!r}); "
2121
"c.save(); "
22-
f"ca = ContentArtifact(artifact=a, content=c, relative_path={filename!r}); "
22+
f"ca = ContentArtifact(artifact=a, content=c, relative_path=c.filename); "
2323
"ca.save(); "
2424
"print(get_url(c))"
2525
)
@@ -32,6 +32,39 @@ def _create(artifact_filename, filename, content_data):
3232
return _create
3333

3434

35+
@pytest.fixture
36+
def create_content_remote(python_bindings):
37+
def _create(content, remote, remote_2=None):
38+
commands = (
39+
"from pulpcore.plugin.models import ContentArtifact, RemoteArtifact; "
40+
"from pulpcore.plugin.util import extract_pk, get_url; "
41+
"from pulp_python.app.models import PythonPackageContent, PythonRemote; "
42+
f"c = PythonPackageContent(**{content!r}); "
43+
"c.save(); "
44+
f"ca = ContentArtifact(content=c, relative_path=c.filename); "
45+
"ca.save(); "
46+
f"r = PythonRemote.objects.get(pk=extract_pk({remote.pulp_href!r})); "
47+
f"ra = RemoteArtifact(content_artifact=ca, remote=r, sha256=c.sha256); "
48+
"ra.save(); "
49+
)
50+
if remote_2:
51+
commands += (
52+
f"r2 = PythonRemote.objects.get(pk=extract_pk({remote_2.pulp_href!r})); "
53+
f"ra2 = RemoteArtifact(content_artifact=ca, remote=r2, sha256=c.sha256); "
54+
"ra2.save(); "
55+
)
56+
commands += "print(get_url(c))"
57+
process = subprocess.run(
58+
["pulpcore-manager", "shell", "-c", commands], capture_output=True
59+
)
60+
61+
assert process.returncode == 0
62+
content_href = process.stdout.decode().strip()
63+
return python_bindings.ContentPackagesApi.read(content_href)
64+
65+
return _create
66+
67+
3568
@pytest.fixture
3669
def move_to_repository(python_bindings, monitor_task):
3770
def _move(repo_href, content_hrefs):
@@ -54,13 +87,14 @@ def test_metadata_repair_command(
5487
"""Test pulpcore-manager repair-python-metadata command."""
5588
data = {
5689
"name": "shelf-reader",
90+
"filename": PYTHON_EGG_FILENAME,
5791
# Wrong metadata
5892
"version": "0.2",
5993
"packagetype": "bdist",
6094
"requires_python": ">=3.8",
6195
"author": "ME",
6296
}
63-
content = create_content_direct(python_file, PYTHON_EGG_FILENAME, data)
97+
content = create_content_direct(python_file, data)
6498
for field, wrong_value in data.items():
6599
if field == "python_version":
66100
continue
@@ -84,43 +118,111 @@ def test_metadata_repair_command(
84118

85119
def test_metadata_repair_endpoint(
86120
create_content_direct,
121+
create_content_remote,
122+
delete_orphans_pre,
87123
download_python_file,
88124
monitor_task,
89125
move_to_repository,
90126
python_bindings,
91-
python_repo,
127+
python_remote_factory,
128+
python_repo_factory,
92129
):
93130
"""
94131
Test repairing of package metadata via `Repositories.repair_metadata` endpoint.
95132
"""
96-
python_egg_filename = "scipy-1.1.0.tar.gz"
97-
python_egg_url = urljoin(
98-
urljoin(PYTHON_FIXTURES_URL, "packages/"), python_egg_filename
133+
# 1. Setup tested data
134+
# Shared data
135+
python_remote = python_remote_factory()
136+
python_remote_bad = python_remote_factory(url="https://fixtures.pulpproject.org/")
137+
python_repo = python_repo_factory(remote=python_remote)
138+
139+
# Immediate content
140+
scipy_egg_filename = "scipy-1.1.0-cp27-none-win32.whl"
141+
scipy_egg_url = urljoin(
142+
urljoin(PYTHON_FIXTURES_URL, "packages/"), scipy_egg_filename
99143
)
100-
python_file = download_python_file(python_egg_filename, python_egg_url)
144+
scipy_file = download_python_file(scipy_egg_filename, scipy_egg_url)
145+
scipy_data_0 = {
146+
"filename": scipy_egg_filename,
147+
"name": "scipy",
148+
"version": "1.1.0",
149+
# Wrong metadata
150+
"author": "ME",
151+
"packagetype": "bdist",
152+
"requires_python": ">=3.8",
153+
}
101154

102-
data = {
155+
# On-demand content
156+
celery_data = {
157+
"filename": "celery-2.4.1.tar.gz",
158+
"name": "celery",
159+
"version": "2.4.1",
160+
"sha256": "c77652ca179d14473975822dbfb1b5dab950c88c171ef6bc2257ddb9066e6790",
161+
# Wrong metadata
162+
"author": "ME",
163+
"packagetype": "bdist",
164+
"requires_python": ">=3.8",
165+
}
166+
167+
scipy_data_1 = {
168+
"filename": "scipy-1.1.0.tar.gz",
103169
"name": "scipy",
170+
"version": "1.1.0",
171+
"sha256": "878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1",
104172
# Wrong metadata
105173
"author": "ME",
106174
"packagetype": "bdist",
107175
"requires_python": ">=3.8",
108-
"version": "0.2",
109176
}
110-
content = create_content_direct(python_file, python_egg_filename, data)
111-
for field, wrong_value in data.items():
112-
if field == "python_version":
113-
continue
114-
assert getattr(content, field) == wrong_value
115-
move_to_repository(python_repo.pulp_href, [content.pulp_href])
116177

178+
scipy_data_2 = scipy_data_1.copy()
179+
scipy_data_2["filename"] = "scipy-1.1.0-cp36-none-win32.whl"
180+
scipy_data_2["sha256"] = (
181+
"0e9bb7efe5f051ea7212555b290e784b82f21ffd0f655405ac4f87e288b730b3"
182+
)
183+
184+
# 2. Create content
185+
celery_content = create_content_remote(celery_data, python_remote)
186+
scipy_content_0 = create_content_direct(scipy_file, scipy_data_0)
187+
scipy_content_1 = create_content_remote(
188+
scipy_data_1, python_remote, python_remote_bad
189+
)
190+
scipy_content_2 = create_content_remote(scipy_data_2, python_remote_bad)
191+
192+
content_hrefs = {}
193+
for data, content in [
194+
(celery_data, celery_content),
195+
(scipy_data_0, scipy_content_0),
196+
(scipy_data_1, scipy_content_1),
197+
(scipy_data_2, scipy_content_2),
198+
]:
199+
for field, test_value in data.items():
200+
assert getattr(content, field) == test_value
201+
content_hrefs[data["filename"]] = content.pulp_href
202+
move_to_repository(python_repo.pulp_href, list(content_hrefs.values()))
203+
204+
# 3. Repair metadata
117205
response = python_bindings.RepositoriesPythonApi.repair_metadata(
118206
python_repo.pulp_href
119207
)
120208
monitor_task(response.task)
121209

122-
content = python_bindings.ContentPackagesApi.read(content.pulp_href)
123-
assert content.version == "1.1.0"
124-
assert content.packagetype == "sdist"
125-
assert content.requires_python == ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"
126-
assert content.author == ""
210+
# 4. Check new metadata
211+
new_metadata = [
212+
# repaired
213+
("celery-2.4.1.tar.gz", "Ask Solem", "sdist", ""),
214+
(
215+
"scipy-1.1.0-cp27-none-win32.whl",
216+
"",
217+
"bdist_wheel",
218+
">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*",
219+
),
220+
("scipy-1.1.0.tar.gz", "", "sdist", ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"),
221+
# not repaired
222+
("scipy-1.1.0-cp36-none-win32.whl", "ME", "bdist", ">=3.8"),
223+
]
224+
for filename, author, packagetype, requires_python in new_metadata:
225+
new_content = python_bindings.ContentPackagesApi.read(content_hrefs[filename])
226+
assert new_content.author == author
227+
assert new_content.packagetype == packagetype
228+
assert new_content.requires_python == requires_python

0 commit comments

Comments
 (0)