Skip to content

Commit 0672fdf

Browse files
committed
Support on-demand content in repair_metadata
1 parent 134afaa commit 0672fdf

File tree

3 files changed

+214
-5
lines changed

3 files changed

+214
-5
lines changed

pulp_python/app/tasks/repair.py

Lines changed: 78 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,17 @@
22
import uuid
33
from gettext import gettext as _
44

5+
from requests.exceptions import RequestException
56
from django.db.models.query import QuerySet
67
from pulpcore.plugin.models import ProgressReport
78
from pulpcore.plugin.util import get_domain
89

910
from pulp_python.app.models import PythonPackageContent, PythonRepository
10-
from pulp_python.app.utils import artifact_to_python_content_data
11+
from pulp_python.app.utils import (
12+
artifact_to_python_content_data,
13+
fetch_json_release_metadata,
14+
parse_metadata,
15+
)
1116

1217
log = logging.getLogger(__name__)
1318

@@ -47,23 +52,35 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
4752
Returns:
4853
int: The number of packages that were repaired.
4954
"""
50-
# TODO: Add on_demand content repair
51-
immediate_content = content.filter(contentartifact__artifact__isnull=False)
55+
immediate_content = (
56+
content.filter(contentartifact__artifact__isnull=False)
57+
.distinct()
58+
.prefetch_related("_artifacts")
59+
)
60+
on_demand_content = (
61+
content.filter(contentartifact__artifact__isnull=True)
62+
.distinct()
63+
.prefetch_related("contentartifact_set__remoteartifact_set")
64+
.order_by("name")
65+
)
5266
domain = get_domain()
5367

5468
batch = []
5569
set_of_update_fields = set()
5670
total_repaired = 0
71+
# Used in on-demand content to prevent redundant requests
72+
pkg_name = None
73+
json_data = None
5774

5875
progress_report = ProgressReport(
5976
message="Repairing packages' metadata",
6077
code="repair.metadata",
61-
total=immediate_content.count(),
78+
total=content.count(),
6279
)
6380
progress_report.save()
6481
with progress_report:
6582
for package in progress_report.iter(
66-
immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000)
83+
immediate_content.iterator(chunk_size=1000)
6784
):
6885
new_data = artifact_to_python_content_data(
6986
package.filename, package._artifacts.get(), domain
@@ -82,6 +99,62 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
8299
batch = []
83100
set_of_update_fields.clear()
84101

102+
for package in progress_report.iter(
103+
on_demand_content.iterator(chunk_size=1000)
104+
):
105+
remote_artifacts = (
106+
package.contentartifact_set.get().remoteartifact_set.all()
107+
)
108+
# Call the API only for new packages to prevent redundant requests
109+
if pkg_name != package.name:
110+
pkg_name = package.name
111+
# We expect that PythonPackageContent always has correct name and version
112+
try:
113+
json_data = fetch_json_release_metadata(
114+
package.name, package.version, remote_artifacts.get().remote
115+
)
116+
except RequestException as exc:
117+
log.warning(
118+
_(
119+
"Could not fetch metadata for {} {} from PyPI. Error: {}"
120+
).format(package.name, package.version, exc)
121+
)
122+
continue
123+
# Extract data only for the specific distribution being checked
124+
# We expect that RemoteArtifact always has correct sha256
125+
dist_data = next(
126+
(
127+
dist
128+
for ra in remote_artifacts
129+
for dist in json_data["urls"]
130+
if ra.sha256 == dist["digests"]["sha256"]
131+
),
132+
None,
133+
)
134+
if not dist_data:
135+
log.warning(
136+
_("No matching distribution for {} was found.").format(package.name)
137+
)
138+
continue
139+
140+
new_data = parse_metadata(json_data["info"], package.version, dist_data)
141+
new_data.pop("url") # belongs to RemoteArtifact, not PythonPackageContent
142+
new_data["pulp_domain"] = domain
143+
new_data["_pulp_domain"] = new_data["pulp_domain"]
144+
changed = False
145+
for field, value in new_data.items():
146+
if getattr(package, field) != value:
147+
setattr(package, field, value)
148+
set_of_update_fields.add(field)
149+
changed = True
150+
if changed:
151+
batch.append(package)
152+
if len(batch) == 1000:
153+
total_repaired += len(batch)
154+
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
155+
batch = []
156+
set_of_update_fields.clear()
157+
85158
if batch:
86159
total_repaired += len(batch)
87160
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)

pulp_python/app/utils.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import pkginfo
22
import re
3+
import requests
34
import shutil
45
import tempfile
56
import json
@@ -9,6 +10,7 @@
910
from packaging.utils import canonicalize_name
1011
from packaging.requirements import Requirement
1112
from packaging.version import parse, InvalidVersion
13+
from pulpcore.plugin.models import Remote
1214

1315

1416
PYPI_LAST_SERIAL = "X-PYPI-LAST-SERIAL"
@@ -189,6 +191,19 @@ def artifact_to_python_content_data(filename, artifact, domain=None):
189191
return data
190192

191193

194+
def fetch_json_release_metadata(name: str, version: str, remote: Remote) -> dict:
    """
    Fetch metadata for a specific release from PyPI's JSON API.

    A release can contain multiple distributions. See
    https://docs.pypi.org/api/json/#get-a-release for more details.

    Args:
        name: The project name as registered on the index.
        version: The exact release version to look up.
        remote: The remote whose ``url`` points at the index root.

    Returns:
        dict containing "info", "last_serial", "urls", and "vulnerabilities" keys.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server responds with a non-2xx status code.
    """
    # Guard against remotes configured without a trailing slash, which would
    # otherwise yield a malformed URL (e.g. "https://pypi.orgpypi/...").
    base_url = remote.url if remote.url.endswith("/") else f"{remote.url}/"
    url = f"{base_url}pypi/{name}/{version}/json"
    # Bounded timeout so a stalled index cannot hang the repair task.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.json()
205+
206+
192207
def python_content_to_json(base_path, content_query, version=None, domain=None):
193208
"""
194209
Converts a QuerySet of PythonPackageContent into the PyPi JSON format

pulp_python/tests/functional/api/test_repair.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,33 @@ def _create(artifact_filename, filename, content_data):
3232
return _create
3333

3434

35+
@pytest.fixture
def create_content_remote(python_bindings):
    """
    Fixture returning a factory that creates on-demand PythonPackageContent.

    The factory creates a PythonPackageContent with an associated
    ContentArtifact and RemoteArtifact but no downloaded Artifact (i.e. the
    on-demand case) by executing ORM code inside a ``pulpcore-manager shell``
    subprocess on the Pulp server, then reads the unit back via the API.
    """

    def _create(filename, content_data, ra_sha256, remote):
        # Build a server-side one-liner; !r (repr) safely quotes interpolated values.
        commands = (
            "from pulpcore.plugin.models import ContentArtifact, RemoteArtifact; "
            "from pulpcore.plugin.util import extract_pk, get_url; "
            "from pulp_python.app.models import PythonPackageContent, PythonRemote; "
            f"c = PythonPackageContent(filename={filename!r}, **{content_data!r}); "
            "c.save(); "
            f"ca = ContentArtifact(content=c, relative_path={filename!r}); "
            "ca.save(); "
            f"r = PythonRemote.objects.get(pk=extract_pk({remote.pulp_href!r})); "
            f"ra = RemoteArtifact(content_artifact=ca, remote=r, sha256={ra_sha256!r}); "
            "ra.save(); "
            "print(get_url(c))"
        )
        process = subprocess.run(
            ["pulpcore-manager", "shell", "-c", commands], capture_output=True
        )

        # Surface the server-side traceback on failure instead of a bare assert.
        assert process.returncode == 0, process.stderr.decode()
        content_href = process.stdout.decode().strip()
        return python_bindings.ContentPackagesApi.read(content_href)

    return _create
60+
61+
3562
@pytest.fixture
3663
def move_to_repository(python_bindings, monitor_task):
3764
def _move(repo_href, content_hrefs):
@@ -84,6 +111,7 @@ def test_metadata_repair_command(
84111

85112
def test_metadata_repair_endpoint(
86113
create_content_direct,
114+
delete_orphans_pre,
87115
download_python_file,
88116
monitor_task,
89117
move_to_repository,
@@ -124,3 +152,96 @@ def test_metadata_repair_endpoint(
124152
assert content.packagetype == "sdist"
125153
assert content.requires_python == ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"
126154
assert content.author == ""
155+
156+
157+
def test_metadata_repair_endpoint_on_demand(
    create_content_remote,
    delete_orphans_pre,
    monitor_task,
    move_to_repository,
    python_bindings,
    python_remote_factory,
    python_repo_factory,
):
    """
    Test repairing of package metadata via `Repositories.repair_metadata` endpoint
    when only RemoteArtifacts are present.
    """
    # 1. Set up tested data
    remote = python_remote_factory()
    repo = python_repo_factory(remote=remote)

    # Deliberately wrong metadata the repair task is expected to correct.
    bad_metadata = {
        "author": "ME",
        "packagetype": "bdist",
        "requires_python": ">=3.8",
    }

    scipy_sha256_sdist = (
        "878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1"
    )
    scipy_sha256_wheel = (
        "0e9bb7efe5f051ea7212555b290e784b82f21ffd0f655405ac4f87e288b730b3"
    )
    celery_sha256 = "c77652ca179d14473975822dbfb1b5dab950c88c171ef6bc2257ddb9066e6790"

    # (filename, RemoteArtifact sha256, PythonPackageContent data). Note the
    # scipy units also carry a sha256 field on the content itself.
    specs = [
        (
            "scipy-1.1.0.tar.gz",
            scipy_sha256_sdist,
            {
                "name": "scipy",
                "version": "1.1.0",
                **bad_metadata,
                "sha256": scipy_sha256_sdist,
            },
        ),
        (
            "scipy-1.1.0-cp36-none-win32.whl",
            scipy_sha256_wheel,
            {
                "name": "scipy",
                "version": "1.1.0",
                **bad_metadata,
                "sha256": scipy_sha256_wheel,
            },
        ),
        (
            "celery-2.4.1.tar.gz",
            celery_sha256,
            {"name": "celery", "version": "2.4.1", **bad_metadata},
        ),
    ]

    # 2. Create content and store its href
    content_hrefs = {}
    for filename, ra_sha256, data in specs:
        content = create_content_remote(filename, data, ra_sha256, remote)
        # Sanity-check that the unit was created with the wrong metadata.
        for field, expected in data.items():
            assert getattr(content, field) == expected
        content_hrefs[filename] = content.pulp_href
    move_to_repository(repo.pulp_href, list(content_hrefs.values()))

    # 3. Repair metadata
    response = python_bindings.RepositoriesPythonApi.repair_metadata(repo.pulp_href)
    monitor_task(response.task)

    # 4. Check newly created metadata
    expected_after_repair = {
        "scipy-1.1.0.tar.gz": {
            "author": "",
            "name": "scipy",
            "packagetype": "sdist",
            "requires_python": ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*",
            "version": "1.1.0",
        },
        "scipy-1.1.0-cp36-none-win32.whl": {
            "author": "",
            "name": "scipy",
            "packagetype": "bdist_wheel",
            "requires_python": ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*",
            "version": "1.1.0",
        },
        "celery-2.4.1.tar.gz": {
            "author": "Ask Solem",
            "name": "celery",
            "packagetype": "sdist",
            "requires_python": "",
            "version": "2.4.1",
        },
    }
    for filename, expected_fields in expected_after_repair.items():
        repaired = python_bindings.ContentPackagesApi.read(content_hrefs[filename])
        for field, value in expected_fields.items():
            assert getattr(repaired, field) == value

0 commit comments

Comments
 (0)