Commit 7c3cb5c

Support on-demand content in repair_metadata
closes #849
1 parent 1c75b34 commit 7c3cb5c

4 files changed: 251 additions, 14 deletions


CHANGES/849.feature

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Added support for on-demand content to `repair_metadata` endpoint.
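
For context, `repair_metadata` is the existing repair action on Python repositories; a minimal sketch of driving it through the generated Python bindings is shown below. The client package name, host, credentials, and repository href are placeholders assumed for illustration; only the RepositoriesPythonApi.repair_metadata call itself is taken from this commit's tests.

# Minimal sketch, assuming the pulp_python bindings are installed and a Pulp API
# is reachable at this (placeholder) host with these (placeholder) credentials.
from pulpcore.client.pulp_python import ApiClient, Configuration, RepositoriesPythonApi

config = Configuration(host="http://localhost:24817", username="admin", password="password")
repo_api = RepositoriesPythonApi(ApiClient(config))

# Kick off the repair task for an existing repository href; with this commit the
# task also covers content that only has RemoteArtifacts (on-demand content).
repo_href = "/pulp/api/v3/repositories/python/python/<uuid>/"  # placeholder href
task_href = repo_api.repair_metadata(repo_href).task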

pulp_python/app/tasks/repair.py

Lines changed: 93 additions & 10 deletions
@@ -2,12 +2,19 @@
 import uuid
 from gettext import gettext as _
 
+from requests.exceptions import RequestException
 from django.db.models.query import QuerySet
 from pulpcore.plugin.models import ProgressReport
 from pulpcore.plugin.util import get_domain
 
 from pulp_python.app.models import PythonPackageContent, PythonRepository
-from pulp_python.app.utils import artifact_to_python_content_data
+from pulp_python.app.utils import (
+    artifact_to_python_content_data,
+    fetch_json_release_metadata,
+    parse_metadata,
+)
+from itertools import groupby
+from collections import defaultdict
 
 log = logging.getLogger(__name__)
 
@@ -32,11 +39,16 @@ def repair(repository_pk: uuid.UUID) -> None:
     content_set = repository.latest_version().content.values_list("pk", flat=True)
     content = PythonPackageContent.objects.filter(pk__in=content_set)
 
-    num_repaired = repair_metadata(content)
-    log.info(_("{} packages' metadata repaired.").format(num_repaired))
+    num_repaired, pkgs_not_repaired = repair_metadata(content)
+    log.info(
+        _(
+            "{} packages' metadata repaired. Not repaired packages due to either "
+            "inaccessible URL or mismatched sha256: {}."
+        ).format(num_repaired, pkgs_not_repaired)
+    )
 
 
-def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
+def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[str]]:
     """
     Repairs metadata for a queryset of PythonPackageContent objects
     and updates the progress report.
@@ -45,25 +57,39 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
         content (QuerySet[PythonPackageContent]): The queryset of items to repair.
 
     Returns:
-        int: The number of packages that were repaired.
+        tuple[int, set[str]]: A tuple containing:
+            - The number of packages that were repaired.
+            - A set of primary keys of packages that were not repaired.
     """
-    # TODO: Add on_demand content repair
-    immediate_content = content.filter(contentartifact__artifact__isnull=False)
+    immediate_content = (
+        content.filter(contentartifact__artifact__isnull=False)
+        .distinct()
+        .prefetch_related("_artifacts")
+    )
+    on_demand_content = (
+        content.filter(contentartifact__artifact__isnull=True)
+        .distinct()
+        .prefetch_related("contentartifact_set__remoteartifact_set")
+        .order_by("name", "version")
+    )
     domain = get_domain()
 
     batch = []
     set_of_update_fields = set()
     total_repaired = 0
+    # Keep track of on-demand packages that need to be repaired
+    # A package is removed from this variable once it is successfully repaired
+    pkgs_not_repaired = set(on_demand_content.values_list("pk", flat=True))
 
     progress_report = ProgressReport(
         message="Repairing packages' metadata",
         code="repair.metadata",
-        total=immediate_content.count(),
+        total=content.count(),
    )
     progress_report.save()
     with progress_report:
         for package in progress_report.iter(
-            immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000)
+            immediate_content.iterator(chunk_size=1000)
        ):
             new_data = artifact_to_python_content_data(
                 package.filename, package._artifacts.get(), domain
@@ -82,8 +108,65 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
                 batch = []
                 set_of_update_fields.clear()
 
+    # For on-demand content, we expect that:
+    # 1. PythonPackageContent always has correct name and version, and one ContentArtifact
+    # 2. RemoteArtifact always has correct sha256
+    for (name, version), group in groupby(
+        on_demand_content.iterator(chunk_size=1000),
+        key=lambda x: (x.name, x.version),
+    ):
+        group_list = list(group)
+        grouped_by_url = defaultdict(list)
+
+        for package in group_list:
+            for ra in package.contentartifact_set.get().remoteartifact_set.all():
+                grouped_by_url[ra.remote.url].append((package, ra))
+
+        # Prioritize the URL that can serve the most packages
+        for url, pkg_ra in sorted(
+            grouped_by_url.items(), key=lambda x: len(x[1]), reverse=True
+        ):
+            # All packages have the same URL, so pick a random remote
+            remote = pkg_ra[0][1].remote
+            try:
+                json_data = fetch_json_release_metadata(name, version, remote)
+            except RequestException:
+                continue
+
+            for package, ra in pkg_ra:
+                if package.pk not in pkgs_not_repaired:
+                    continue
+                # Extract data only for the specific distribution being checked
+                dist_data = None
+                for dist in json_data["urls"]:
+                    if ra.sha256 == dist["digests"]["sha256"]:
+                        dist_data = dist
+                        break
+                if not dist_data:
+                    continue
+
+                new_data = parse_metadata(json_data["info"], version, dist_data)
+                # url belongs to RemoteArtifact
+                new_data.pop("url")
+                changed = False
+                for field, value in new_data.items():
+                    if getattr(package, field) != value:
+                        setattr(package, field, value)
+                        set_of_update_fields.add(field)
+                        changed = True
+                if changed:
+                    batch.append(package)
+                    if len(batch) == 1000:
+                        total_repaired += len(batch)
+                        PythonPackageContent.objects.bulk_update(
+                            batch, set_of_update_fields
+                        )
+                        batch = []
+                        set_of_update_fields.clear()
+                pkgs_not_repaired.remove(package.pk)
+
     if batch:
         total_repaired += len(batch)
         PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
 
-    return total_repaired
+    return total_repaired, pkgs_not_repaired
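
The heart of the new on-demand path is the grouping step: every distribution of one release can be matched against a single JSON response, so remote URLs are tried in order of how many packages they can serve. A stripped-down sketch of that prioritization with plain tuples follows; the rows, digests, and URLs are illustrative stand-ins, not data from the commit.

from collections import defaultdict
from itertools import groupby

# Stand-ins for (PythonPackageContent, RemoteArtifact) pairs: name, version, sha256, remote URL.
rows = [
    ("scipy", "1.1.0", "sha-sdist", "https://pypi.example.org/"),
    ("scipy", "1.1.0", "sha-sdist", "https://mirror.example.org/"),  # same package, second remote
    ("scipy", "1.1.0", "sha-wheel", "https://mirror.example.org/"),
]

# groupby() only groups adjacent rows, hence the order_by("name", "version") in the task.
rows.sort(key=lambda r: (r[0], r[1]))
for (name, version), group in groupby(rows, key=lambda r: (r[0], r[1])):
    grouped_by_url = defaultdict(list)
    for _name, _version, sha256, url in group:
        grouped_by_url[url].append(sha256)
    # The URL covering the most distributions is tried first: one metadata fetch, many repairs.
    for url, digests in sorted(grouped_by_url.items(), key=lambda kv: len(kv[1]), reverse=True):
        print(f"{name} {version}: fetch {url} once, can match {digests}")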

pulp_python/app/utils.py

Lines changed: 15 additions & 0 deletions
@@ -1,5 +1,6 @@
 import pkginfo
 import re
+import requests
 import shutil
 import tempfile
 import json
@@ -9,6 +10,7 @@
 from packaging.utils import canonicalize_name
 from packaging.requirements import Requirement
 from packaging.version import parse, InvalidVersion
+from pulpcore.plugin.models import Remote
 
 
 PYPI_LAST_SERIAL = "X-PYPI-LAST-SERIAL"
@@ -189,6 +191,19 @@ def artifact_to_python_content_data(filename, artifact, domain=None):
     return data
 
 
+def fetch_json_release_metadata(name: str, version: str, remote: Remote) -> dict:
+    """
+    Fetches metadata for a specific release from PyPI's JSON API. A release can contain
+    multiple distributions. See https://docs.pypi.org/api/json/#get-a-release for more details.
+
+    Returns dict containing "info", "last_serial", "urls", and "vulnerabilities" keys.
+    """
+    url = f"{remote.url}pypi/{name}/{version}/json"
+    response = requests.get(url, timeout=10)
+    response.raise_for_status()
+    return response.json()
+
+
 def python_content_to_json(base_path, content_query, version=None, domain=None):
     """
     Converts a QuerySet of PythonPackageContent into the PyPi JSON format
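
The new helper is a thin wrapper around PyPI's release endpoint. For reference, the same request made directly against pypi.org looks like the sketch below (package and version chosen for illustration); each entry under "urls" describes one distribution and carries the sha256 digest that the repair task matches against.

import requests

# Equivalent request to fetch_json_release_metadata(), but aimed straight at pypi.org.
response = requests.get("https://pypi.org/pypi/celery/2.4.1/json", timeout=10)
response.raise_for_status()
release = response.json()

# "info" holds the release-wide metadata; "urls" lists the individual distributions.
for dist in release["urls"]:
    print(dist["filename"], dist["packagetype"], dist["digests"]["sha256"])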

pulp_python/tests/functional/api/test_repair.py

Lines changed: 142 additions & 4 deletions
@@ -23,7 +23,42 @@ def _create(artifact_filename, filename, content_data):
             "ca.save(); "
             "print(get_url(c))"
         )
-        process = subprocess.run(["pulpcore-manager", "shell", "-c", commands], capture_output=True)
+        process = subprocess.run(
+            ["pulpcore-manager", "shell", "-c", commands], capture_output=True
+        )
+
+        assert process.returncode == 0
+        content_href = process.stdout.decode().strip()
+        return python_bindings.ContentPackagesApi.read(content_href)
+
+    return _create
+
+
+@pytest.fixture
+def create_content_remote(python_bindings):
+    def _create(filename, content, remote, remote_2=None):
+        commands = (
+            "from pulpcore.plugin.models import ContentArtifact, RemoteArtifact; "
+            "from pulpcore.plugin.util import extract_pk, get_url; "
+            "from pulp_python.app.models import PythonPackageContent, PythonRemote; "
+            f"c = PythonPackageContent(filename={filename!r}, **{content!r}); "
+            "c.save(); "
+            f"ca = ContentArtifact(content=c, relative_path={filename!r}); "
+            "ca.save(); "
+            f"r = PythonRemote.objects.get(pk=extract_pk({remote.pulp_href!r})); "
+            f"ra = RemoteArtifact(content_artifact=ca, remote=r, sha256={content['sha256']!r}); "
+            "ra.save(); "
+        )
+        if remote_2:
+            commands += (
+                f"r2 = PythonRemote.objects.get(pk=extract_pk({remote_2.pulp_href!r})); "
+                f"ra2 = RemoteArtifact(content_artifact=ca, remote=r2, sha256={content['sha256']!r}); "  # noqa: E501
+                "ra2.save(); "
+            )
+        commands += "print(get_url(c))"
+        process = subprocess.run(
+            ["pulpcore-manager", "shell", "-c", commands], capture_output=True
+        )
 
         assert process.returncode == 0
         content_href = process.stdout.decode().strip()
@@ -36,7 +71,9 @@ def _create(artifact_filename, filename, content_data):
 def move_to_repository(python_bindings, monitor_task):
     def _move(repo_href, content_hrefs):
         body = {"add_content_units": content_hrefs}
-        task = monitor_task(python_bindings.RepositoriesPythonApi.modify(repo_href, body).task)
+        task = monitor_task(
+            python_bindings.RepositoriesPythonApi.modify(repo_href, body).task
+        )
         assert len(task.created_resources) == 1
         return python_bindings.RepositoriesPythonApi.read(repo_href)
 
@@ -68,8 +105,13 @@ def test_metadata_repair_command(
 
     move_to_repository(python_repo.pulp_href, [content.pulp_href])
     process = subprocess.run(
-        ["pulpcore-manager", "repair-python-metadata", "--repositories", python_repo.pulp_href],
-        capture_output=True
+        [
+            "pulpcore-manager",
+            "repair-python-metadata",
+            "--repositories",
+            python_repo.pulp_href,
+        ],
+        capture_output=True,
     )
     assert process.returncode == 0
     output = process.stdout.decode().strip()
@@ -84,6 +126,7 @@
 
 
 def test_metadata_repair_endpoint(
     create_content_direct,
+    delete_orphans_pre,
     download_python_file,
     monitor_task,
@@ -124,3 +167,98 @@
     assert content.packagetype == "sdist"
     assert content.requires_python == ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"
     assert content.author == ""
+
+
+def test_metadata_repair_endpoint_on_demand(
+    create_content_remote,
+    delete_orphans_pre,
+    monitor_task,
+    move_to_repository,
+    python_bindings,
+    python_remote_factory,
+    python_repo_factory,
+):
+    """
+    Test repairing of package metadata via `Repositories.repair_metadata` endpoint
+    when only RemoteArtifacts are present.
+    """
+    # 1. Set up tested data
+    python_remote = python_remote_factory()
+    python_remote_bad = python_remote_factory(url="https://fixtures.pulpproject.org/")
+    python_repo = python_repo_factory(remote=python_remote)
+
+    scipy_filename_1 = "scipy-1.1.0.tar.gz"
+    scipy_data_1 = {
+        "name": "scipy",
+        "version": "1.1.0",
+        "sha256": "878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1",
+        # Wrong metadata
+        "author": "ME",
+        "packagetype": "bdist",
+        "requires_python": ">=3.8",
+    }
+
+    scipy_filename_2 = "scipy-1.1.0-cp36-none-win32.whl"
+    scipy_data_2 = scipy_data_1.copy()
+    scipy_data_2["sha256"] = (
+        "0e9bb7efe5f051ea7212555b290e784b82f21ffd0f655405ac4f87e288b730b3"
+    )
+
+    celery_filename = "celery-2.4.1.tar.gz"
+    celery_data = {
+        "name": "celery",
+        "version": "2.4.1",
+        "sha256": "c77652ca179d14473975822dbfb1b5dab950c88c171ef6bc2257ddb9066e6790",
+        # Wrong metadata
+        "author": "ME",
+        "packagetype": "bdist",
+        "requires_python": ">=3.8",
+    }
+
+    # 2. Create content
+    content_1 = create_content_remote(
+        scipy_filename_1, scipy_data_1, python_remote, python_remote_bad
+    )
+    content_2 = create_content_remote(scipy_filename_2, scipy_data_2, python_remote_bad)
+    content_3 = create_content_remote(celery_filename, celery_data, python_remote)
+
+    content_hrefs = {}
+    for filename, data, content in [
+        (scipy_filename_1, scipy_data_1, content_1),
+        (scipy_filename_2, scipy_data_2, content_2),
+        (celery_filename, celery_data, content_3),
+    ]:
+        for field, test_value in data.items():
+            assert getattr(content, field) == test_value
+        content_hrefs[filename] = content.pulp_href
+    move_to_repository(python_repo.pulp_href, list(content_hrefs.values()))
+
+    # 3. Repair metadata
+    response = python_bindings.RepositoriesPythonApi.repair_metadata(
+        python_repo.pulp_href
+    )
+    monitor_task(response.task)
+
+    # 4. Check new metadata
+    new_metadata = [
+        # repaired
+        ("celery-2.4.1.tar.gz", "celery", "2.4.1", "Ask Solem", "sdist", ""),
+        (
+            "scipy-1.1.0.tar.gz",
+            "scipy",
+            "1.1.0",
+            "",
+            "sdist",
+            ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*",
+        ),
+        # not repaired
+        ("scipy-1.1.0-cp36-none-win32.whl", "scipy", "1.1.0", "ME", "bdist", ">=3.8"),
+    ]
+    for filename, name, version, author, packagetype, requires_python in new_metadata:
+        new_content = python_bindings.ContentPackagesApi.read(content_hrefs[filename])
+        assert new_content.filename == filename
+        assert new_content.name == name
+        assert new_content.version == version
+        assert new_content.author == author
+        assert new_content.packagetype == packagetype
+        assert new_content.requires_python == requires_python
