Skip to content

Commit 2b14e83

Browse files
committed
Support on-demand content in repair_metadata
closes #849
1 parent 134afaa commit 2b14e83

File tree

6 files changed

+386
-5
lines changed

6 files changed

+386
-5
lines changed

CHANGES/849.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added support for on-demand content to the `repair_metadata` endpoint.

pulp_python/app/tasks/repair.py

Lines changed: 99 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,18 @@
22
import uuid
33
from gettext import gettext as _
44

5+
from requests.exceptions import RequestException
56
from django.db.models.query import QuerySet
67
from pulpcore.plugin.models import ProgressReport
78
from pulpcore.plugin.util import get_domain
89

910
from pulp_python.app.models import PythonPackageContent, PythonRepository
10-
from pulp_python.app.utils import artifact_to_python_content_data
11+
from pulp_python.app.utils import (
12+
artifact_to_python_content_data,
13+
fetch_json_release_metadata,
14+
parse_metadata,
15+
)
16+
from itertools import groupby
1117

1218
log = logging.getLogger(__name__)
1319

@@ -47,8 +53,17 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
4753
Returns:
4854
int: The number of packages that were repaired.
4955
"""
50-
# TODO: Add on_demand content repair
51-
immediate_content = content.filter(contentartifact__artifact__isnull=False)
56+
immediate_content = (
57+
content.filter(contentartifact__artifact__isnull=False)
58+
.distinct()
59+
.prefetch_related("_artifacts")
60+
)
61+
on_demand_content = (
62+
content.filter(contentartifact__artifact__isnull=True)
63+
.distinct()
64+
.prefetch_related("contentartifact_set__remoteartifact_set")
65+
.order_by("name", "version")
66+
)
5267
domain = get_domain()
5368

5469
batch = []
@@ -58,12 +73,12 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
5873
progress_report = ProgressReport(
5974
message="Repairing packages' metadata",
6075
code="repair.metadata",
61-
total=immediate_content.count(),
76+
total=content.count(),
6277
)
6378
progress_report.save()
6479
with progress_report:
6580
for package in progress_report.iter(
66-
immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000)
81+
immediate_content.iterator(chunk_size=1000)
6782
):
6883
new_data = artifact_to_python_content_data(
6984
package.filename, package._artifacts.get(), domain
@@ -82,6 +97,85 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
8297
batch = []
8398
set_of_update_fields.clear()
8499

100+
# For on-demand content, we expect that:
101+
# 1. PythonPackageContent always has correct name and version, and one ContentArtifact
102+
# 2. RemoteArtifact always has correct sha256
103+
# Repair is only supported if all PythonPackageContent items with the same name and
104+
# version (i.e. group) share the same remote URL. Otherwise, the entire group is skipped
105+
for (name, version), group in groupby(
106+
on_demand_content.iterator(chunk_size=1000),
107+
key=lambda x: (x.name, x.version),
108+
):
109+
group = list(group)
110+
remotes = set(
111+
remote
112+
for content in group
113+
for remote in content.contentartifact_set.get()
114+
.remoteartifact_set.all()
115+
.values_list("remote__url", flat=True)
116+
)
117+
if len(remotes) != 1:
118+
log.warning(
119+
_("Only one remote url is supported for {} {}").format(
120+
name, version
121+
)
122+
)
123+
continue
124+
remote_url = remotes.pop()
125+
126+
# Retrieve data with all distributions for the given package version
127+
try:
128+
json_data = fetch_json_release_metadata(name, version, remote_url)
129+
except RequestException as exc:
130+
log.warning(
131+
_("Could not fetch metadata for {} {} from {}. Error: {}").format(
132+
name, version, remote_url, exc
133+
)
134+
)
135+
continue
136+
137+
for package in progress_report.iter(group):
138+
remote_artifacts = (
139+
package.contentartifact_set.get().remoteartifact_set.all()
140+
)
141+
# Extract data only for the specific distribution being checked
142+
dist_data = next(
143+
(
144+
dist
145+
for ra in remote_artifacts
146+
for dist in json_data["urls"]
147+
if ra.sha256 == dist["digests"]["sha256"]
148+
),
149+
None,
150+
)
151+
if not dist_data:
152+
log.warning(
153+
_(
154+
"Could not fetch distribution for {} {} with sha256 {}."
155+
).format(name, version, package.sha256)
156+
)
157+
continue
158+
159+
new_data = parse_metadata(json_data["info"], package.version, dist_data)
160+
new_data.pop("url") # belongs to RemoteArtifact
161+
new_data["pulp_domain"] = domain
162+
new_data["_pulp_domain"] = new_data["pulp_domain"]
163+
changed = False
164+
for field, value in new_data.items():
165+
if getattr(package, field) != value:
166+
setattr(package, field, value)
167+
set_of_update_fields.add(field)
168+
changed = True
169+
if changed:
170+
batch.append(package)
171+
if len(batch) == 1000:
172+
total_repaired += len(batch)
173+
PythonPackageContent.objects.bulk_update(
174+
batch, set_of_update_fields
175+
)
176+
batch = []
177+
set_of_update_fields.clear()
178+
85179
if batch:
86180
total_repaired += len(batch)
87181
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)

pulp_python/app/utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import pkginfo
22
import re
3+
import requests
34
import shutil
45
import tempfile
56
import json
@@ -189,6 +190,19 @@ def artifact_to_python_content_data(filename, artifact, domain=None):
189190
return data
190191

191192

193+
def fetch_json_release_metadata(name: str, version: str, remote_url: str) -> dict:
    """
    Fetch metadata for a specific release from a PyPI-compatible JSON API.

    A release can contain multiple distributions. See
    https://docs.pypi.org/api/json/#get-a-release for more details.

    Args:
        name: Project name of the release.
        version: Version string of the release.
        remote_url: Base URL of the remote index. A trailing slash is optional.

    Returns:
        dict: The decoded JSON response. For PyPI it contains the "info", "last_serial",
        "urls", and "vulnerabilities" keys.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out, or if the
            server answers with an error status (raised by ``raise_for_status``).
    """
    # Normalize the base so that both "https://host" and "https://host/" yield a valid URL;
    # plain concatenation would otherwise glue "pypi" onto the host or last path segment.
    base_url = remote_url.rstrip("/")
    url = f"{base_url}/pypi/{name}/{version}/json"
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.json()
204+
205+
192206
def python_content_to_json(base_path, content_query, version=None, domain=None):
193207
"""
194208
Converts a QuerySet of PythonPackageContent into the PyPi JSON format

pulp_python/t_model.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
from django.db import models
2+
from pulpcore.plugin.models import Content
3+
4+
from pulpcore.plugin.util import get_domain_pk
5+
from pulp_python.app.models import PACKAGE_TYPES, NormalizeName
6+
7+
8+
class TestClass:
    # Scratch listing of the package metadata fields, grouped by their status in the
    # "Core metadata specifications".

    # --- Required fields ---
    metadata_version = models.TextField()  # mandatory
    name = models.TextField()
    name.register_lookup(NormalizeName)
    version = models.TextField()

    # --- Optional fields ---
    # Not covered here: Dynamic, License-Expression, License-File, Provides-Extra
    platform = models.TextField()
    supported_platform = models.TextField()
    summary = models.TextField()
    description = models.TextField()
    description_content_type = models.TextField()
    keywords = models.TextField()
    author = models.TextField()
    author_email = models.TextField()
    maintainer = models.TextField()
    maintainer_email = models.TextField()
    license = models.TextField()
    classifiers = models.JSONField(default=list)
    requires_dist = models.JSONField(default=list)
    requires_python = models.TextField()
    requires_external = models.JSONField(default=list)
    project_url = models.TextField()
    provides_dist = models.JSONField(default=list)
    obsoletes_dist = models.JSONField(default=list)
    # Not specified in core metadata, but should be
    project_urls = models.JSONField(default=dict)

    # --- Deprecated fields ---
    # Not covered here: Requires, Provides, Obsoletes
    home_page = models.TextField()
    download_url = models.TextField()

    # --- Everything below does NOT match the "Core metadata specifications" ---
    # Taken from the Content model
    PROTECTED_FROM_RECLAIM = False
    TYPE = "python"
    repo_key_fields = ("filename",)
    _pulp_domain = models.ForeignKey(
        "core.Domain", default=get_domain_pk, on_delete=models.PROTECT
    )

    # Origin unclear (marked "??" by the author)
    # Required metadata
    filename = models.TextField(db_index=True)
    packagetype = models.TextField(choices=PACKAGE_TYPES)
    sha256 = models.CharField(db_index=True, max_length=64)
    # Optional metadata
    python_version = models.TextField()
59+
60+
61+
class PythonPackageContent(Content):
    """
    A Content Type representing Python's Distribution Package.

    As defined in pep-0426 and pep-0345.

    https://www.python.org/dev/peps/pep-0491/
    https://www.python.org/dev/peps/pep-0345/
    """

    # TODO: PEP 426 was withdrawn and is replaced by the core metadata specification:
    #       https://packaging.python.org/en/latest/specifications/core-metadata/#core-metadata
    # TODO: PEP 345 predates PEP 426.
    # TODO: PEP 491 is OK (The Wheel Binary Package Format 1.9).

pulp_python/test.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import json
2+
import re
3+
import shutil
4+
import tempfile
5+
from collections import defaultdict
6+
7+
import pkginfo
8+
import requests
9+
from django.conf import settings
10+
from jinja2 import Template
11+
from packaging.requirements import Requirement
12+
from packaging.utils import canonicalize_name
13+
from packaging.version import InvalidVersion, parse
14+
from pulpcore.plugin.models import Remote
15+
16+
17+
def fetch_json_release_metadata(
    name: str, version: str, remote: "Remote", session_obj: "requests.Session"
) -> dict:
    """
    Fetch metadata for a specific release from a PyPI-compatible JSON API.

    A release can contain multiple distributions. See
    https://docs.pypi.org/api/json/#get-a-release for more details.

    Args:
        name: Project name of the release.
        version: Version string of the release.
        remote: Object whose ``url`` attribute is the base URL of the remote index.
        session_obj: Session-like object; its ``get(url, timeout=...)`` performs the request.

    Returns:
        dict: The decoded JSON response. For PyPI it contains the "info", "last_serial",
        "urls", and "vulnerabilities" keys.

    Raises:
        Whatever ``session_obj.get`` or ``response.raise_for_status`` raises on failure.
    """
    # Annotations are quoted so that defining the function does not require the names to be
    # resolvable at import time (the original referenced an unimported ``Session``).
    # Tolerate a base URL with or without a trailing slash.
    base_url = remote.url.rstrip("/")
    # The version is used verbatim; the original appended a stray "0" to it.
    url = f"{base_url}/pypi/{name}/{version}/json"
    response = session_obj.get(url, timeout=10)
    response.raise_for_status()
    return response.json()
30+
31+
32+
def test_metadata_repair_endpoint_on_demand(
    create_content_remote,
    monitor_task,
    move_to_repository,
    python_bindings,
    python_remote_factory,
    python_repo_factory,
):
    """
    Check that `repair_metadata` corrects wrong metadata of on-demand content.

    An on-demand content unit is created with deliberately wrong metadata, placed into a
    repository, and repaired through the endpoint. The content is then re-read and its
    metadata compared with the expected values.
    """
    # Imported locally because this module does not import it at the top level.
    from urllib.parse import urljoin

    python_egg_filename = "scipy-1.1.0.tar.gz"
    # NOTE(review): PYTHON_FIXTURES_URL is not imported in this module as shown; presumably
    # it comes from the functional test constants - confirm and import it.
    python_egg_url = urljoin(
        urljoin(PYTHON_FIXTURES_URL, "packages/"), python_egg_filename
    )
    python_egg_sha256 = (
        "878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1"
    )
    data = {
        "name": "scipy",
        "version": "1.1.0",
        # Wrong metadata
        "author": "ME",
        "packagetype": "bdist",
        "requires_python": ">=3.8",
    }
    remote = python_remote_factory(includes=["scipy"])
    repo = python_repo_factory(remote=remote)

    content = create_content_remote(
        python_egg_filename, python_egg_url, python_egg_sha256, data, remote
    )
    move_to_repository(repo.pulp_href, [content.pulp_href])

    response = python_bindings.RepositoriesPythonApi.repair_metadata(repo.pulp_href)
    # The repair runs as a task; wait for it before reading the content back, otherwise
    # the assertions below race against the repair.
    monitor_task(response.task)

    new_content = python_bindings.ContentPackagesApi.read(content.pulp_href)
    assert new_content.author == ""
    assert new_content.packagetype == "sdist"
    assert new_content.requires_python == ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"
    assert new_content.version == "1.1.0"

0 commit comments

Comments
 (0)