Skip to content

Commit ce6bf2c

Browse files
committed
Create and expose metadata file
1 parent 37626da commit ce6bf2c

File tree

11 files changed

+254
-16
lines changed

11 files changed

+254
-16
lines changed

pulp_python/app/management/commands/repair-python-metadata.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,14 @@ def repair_metadata(content):
2424
set_of_update_fields = set()
2525
total_repaired = 0
2626
for package in immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000):
27+
# Get the main artifact
28+
main_artifact = (
29+
package.contentartifact_set.exclude(relative_path__endswith=".metadata")
30+
.first()
31+
.artifact
32+
)
2733
new_data = artifact_to_python_content_data(
28-
package.filename, package._artifacts.get(), package.pulp_domain
34+
package.filename, main_artifact, package.pulp_domain
2935
)
3036
changed = False
3137
for field, value in new_data.items():

pulp_python/app/serializers.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
)
2323
from pulp_python.app.utils import (
2424
DIST_EXTENSIONS,
25+
artifact_to_metadata_artifact,
2526
artifact_to_python_content_data,
2627
get_project_metadata_from_file,
2728
parse_project_metadata,
@@ -93,11 +94,31 @@ class Meta:
9394
model = python_models.PythonDistribution
9495

9596

97+
class PythonSingleContentArtifactField(core_serializers.SingleContentArtifactField):
    """
    Artifact field that tolerates content units carrying an extra metadata artifact.

    Only meant to be used by PythonPackageContentSerializer: its get_attribute override
    picks the main (non-".metadata") artifact when a unit has more than one artifact.
    """

    def get_attribute(self, instance):
        # More than one artifact means wheel + metadata: select the entry whose
        # relative path is not the ".metadata" sidecar.
        if instance._artifacts.count() > 1:
            main_content_artifact = next(
                (
                    content_artifact
                    for content_artifact in instance.contentartifact_set.all()
                    if not content_artifact.relative_path.endswith(".metadata")
                ),
                None,
            )
            if main_content_artifact is not None:
                return main_content_artifact.artifact

        # Single artifact (or no main entry found): defer to the parent field.
        return super().get_attribute(instance)
111+
112+
96113
class PythonPackageContentSerializer(core_serializers.SingleArtifactContentUploadSerializer):
97114
"""
98115
A Serializer for PythonPackageContent.
99116
"""
100117

118+
artifact = PythonSingleContentArtifactField(
119+
help_text=_("Artifact file representing the physical content"),
120+
)
121+
101122
# Core metadata
102123
# Version 1.0
103124
author = serializers.CharField(
@@ -386,8 +407,21 @@ def deferred_validate(self, data):
386407
if attestations := data.pop("attestations", None):
387408
data["provenance"] = self.handle_attestations(filename, data["sha256"], attestations)
388409

410+
# Create metadata artifact for wheel files
411+
if filename.endswith(".whl"):
412+
if metadata_artifact := artifact_to_metadata_artifact(filename, artifact):
413+
data["metadata_artifact"] = metadata_artifact
414+
data["metadata_sha256"] = metadata_artifact.sha256
415+
389416
return data
390417

418+
def get_artifacts(self, validated_data):
    """
    Return the artifacts mapping from the parent serializer, extended with the
    metadata artifact when one was stored in ``validated_data``.

    The "metadata_artifact" key is removed from ``validated_data``; when present, the
    artifact is registered under "<filename>.metadata".
    """
    artifacts = super().get_artifacts(validated_data)
    metadata_artifact = validated_data.pop("metadata_artifact", None)
    if metadata_artifact:
        artifacts[f"{validated_data['filename']}.metadata"] = metadata_artifact
    return artifacts
424+
391425
def retrieve(self, validated_data):
392426
content = python_models.PythonPackageContent.objects.filter(
393427
sha256=validated_data["sha256"], _pulp_domain=get_domain()
@@ -419,6 +453,7 @@ def create(self, validated_data):
419453

420454
class Meta:
421455
fields = core_serializers.SingleArtifactContentUploadSerializer.Meta.fields + (
456+
"artifact",
422457
"author",
423458
"author_email",
424459
"description",
@@ -514,6 +549,12 @@ def validate(self, data):
514549
data["provenance"] = self.handle_attestations(
515550
filename, data["sha256"], attestations, offline=True
516551
)
552+
# Create metadata artifact for wheel files
553+
if filename.endswith(".whl"):
554+
if metadata_artifact := artifact_to_metadata_artifact(filename, artifact):
555+
data["metadata_artifact"] = metadata_artifact
556+
data["metadata_sha256"] = metadata_artifact.sha256
557+
517558
return data
518559

519560
class Meta(PythonPackageContentSerializer.Meta):

pulp_python/app/tasks/repair.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,13 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
9595
progress_report.save()
9696
with progress_report:
9797
for package in progress_report.iter(immediate_content.iterator(chunk_size=BULK_SIZE)):
98-
new_data = artifact_to_python_content_data(
99-
package.filename, package._artifacts.get(), domain
98+
# Get the main artifact
99+
main_artifact = (
100+
package.contentartifact_set.exclude(relative_path__endswith=".metadata")
101+
.first()
102+
.artifact
100103
)
104+
new_data = artifact_to_python_content_data(package.filename, main_artifact, domain)
101105
total_repaired += update_package_if_needed(
102106
package, new_data, batch, set_of_update_fields
103107
)
@@ -113,7 +117,11 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
113117
grouped_by_url = defaultdict(list)
114118

115119
for package in group_set:
116-
for ra in package.contentartifact_set.get().remoteartifact_set.all():
120+
for ra in (
121+
package.contentartifact_set.exclude(relative_path__endswith=".metadata")
122+
.first()
123+
.remoteartifact_set.all()
124+
):
117125
grouped_by_url[ra.remote.url].append((package, ra))
118126

119127
# Prioritize the URL that can serve the most packages

pulp_python/app/tasks/sync.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,11 +229,15 @@ async def create_content(self, pkg):
229229
create a Content Unit to put into the pipeline
230230
"""
231231
declared_contents = {}
232+
page = await aget_remote_simple_page(pkg.name, self.remote)
233+
upstream_pkgs = {pkg.filename: pkg for pkg in page.packages}
234+
232235
for version, dists in pkg.releases.items():
233236
for package in dists:
234237
entry = parse_metadata(pkg.info, version, package)
235238
url = entry.pop("url")
236239
size = package["size"] or None
240+
d_artifacts = []
237241

238242
artifact = Artifact(sha256=entry["sha256"], size=size)
239243
package = PythonPackageContent(**entry)
@@ -245,11 +249,28 @@ async def create_content(self, pkg):
245249
remote=self.remote,
246250
deferred_download=self.deferred_download,
247251
)
248-
dc = DeclarativeContent(content=package, d_artifacts=[da])
252+
d_artifacts.append(da)
253+
254+
if upstream_pkg := upstream_pkgs.get(entry["filename"]):
255+
if upstream_pkg.has_metadata:
256+
url = upstream_pkg.metadata_url
257+
md_sha256 = upstream_pkg.metadata_digests.get("sha256")
258+
artifact = Artifact(sha256=md_sha256)
259+
260+
metadata_artifact = DeclarativeArtifact(
261+
artifact=artifact,
262+
url=url,
263+
relative_path=f"{entry['filename']}.metadata",
264+
remote=self.remote,
265+
deferred_download=self.deferred_download,
266+
)
267+
d_artifacts.append(metadata_artifact)
268+
269+
dc = DeclarativeContent(content=package, d_artifacts=d_artifacts)
249270
declared_contents[entry["filename"]] = dc
250271
await self.python_stage.put(dc)
251272

252-
if pkg.releases and (page := await aget_remote_simple_page(pkg.name, self.remote)):
273+
if pkg.releases and page:
253274
if self.remote.provenance:
254275
await self.sync_provenance(page, declared_contents)
255276

pulp_python/app/tasks/upload.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
Provenance,
1616
verify_provenance,
1717
)
18-
from pulp_python.app.utils import artifact_to_python_content_data
18+
from pulp_python.app.utils import artifact_to_metadata_artifact, artifact_to_python_content_data
1919

2020

2121
def upload(artifact_sha256, filename, attestations=None, repository_pk=None):
@@ -97,6 +97,11 @@ def create_content(artifact_sha256, filename, domain):
9797
def create():
9898
content = PythonPackageContent.objects.create(**data)
9999
ContentArtifact.objects.create(artifact=artifact, content=content, relative_path=filename)
100+
101+
if metadata_artifact := artifact_to_metadata_artifact(filename, artifact):
102+
ContentArtifact.objects.create(
103+
artifact=metadata_artifact, content=content, relative_path=f"{filename}.metadata"
104+
)
100105
return content
101106

102107
new_content = create()

pulp_python/app/utils.py

Lines changed: 56 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import hashlib
2+
import logging
23
import pkginfo
34
import re
45
import shutil
@@ -8,14 +9,19 @@
89
from aiohttp.client_exceptions import ClientError
910
from collections import defaultdict
1011
from django.conf import settings
12+
from django.db.utils import IntegrityError
1113
from django.utils import timezone
1214
from jinja2 import Template
1315
from packaging.utils import canonicalize_name
1416
from packaging.requirements import Requirement
1517
from packaging.version import parse, InvalidVersion
1618
from pypi_simple import ACCEPT_JSON_PREFERRED, ProjectPage
17-
from pulpcore.plugin.models import Remote
19+
from pulpcore.plugin.models import Artifact, Remote
1820
from pulpcore.plugin.exceptions import TimeoutException
21+
from pulpcore.plugin.util import get_domain
22+
23+
24+
log = logging.getLogger(__name__)
1925

2026

2127
PYPI_LAST_SERIAL = "X-PYPI-LAST-SERIAL"
@@ -206,25 +212,34 @@ def get_project_metadata_from_file(filename):
206212
return metadata
207213

208214

209-
def compute_metadata_sha256(filename: str) -> str | None:
215+
def extract_wheel_metadata(filename: str) -> bytes | None:
210216
"""
211-
Compute SHA256 hash of the metadata file from a Python package.
217+
Extract the metadata file content from a wheel file.
212218
213-
Returns SHA256 hash or None if metadata cannot be extracted.
219+
Returns the raw metadata content as bytes or None if metadata cannot be extracted.
214220
"""
215221
if not filename.endswith(".whl"):
216222
return None
217223
try:
218224
with zipfile.ZipFile(filename, "r") as f:
219225
for file_path in f.namelist():
220226
if file_path.endswith(".dist-info/METADATA"):
221-
metadata_content = f.read(file_path)
222-
return hashlib.sha256(metadata_content).hexdigest()
223-
except (zipfile.BadZipFile, KeyError, OSError):
224-
pass
227+
return f.read(file_path)
228+
except (zipfile.BadZipFile, KeyError, OSError) as e:
229+
log.warning(f"Failed to extract metadata file from {filename}: {e}")
225230
return None
226231

227232

233+
def compute_metadata_sha256(filename: str) -> str | None:
234+
"""
235+
Compute SHA256 hash of the metadata file from a Python package.
236+
237+
Returns SHA256 hash or None if metadata cannot be extracted.
238+
"""
239+
metadata_content = extract_wheel_metadata(filename)
240+
return hashlib.sha256(metadata_content).hexdigest() if metadata_content else None
241+
242+
228243
def artifact_to_python_content_data(filename, artifact, domain=None):
229244
"""
230245
Takes the artifact/filename and returns the metadata needed to create a PythonPackageContent.
@@ -233,6 +248,7 @@ def artifact_to_python_content_data(filename, artifact, domain=None):
233248
# because pkginfo validates that the filename has a valid extension before
234249
# reading it
235250
with tempfile.NamedTemporaryFile("wb", dir=".", suffix=filename) as temp_file:
251+
artifact.file.seek(0)
236252
shutil.copyfileobj(artifact.file, temp_file)
237253
temp_file.flush()
238254
metadata = get_project_metadata_from_file(temp_file.name)
@@ -245,6 +261,35 @@ def artifact_to_python_content_data(filename, artifact, domain=None):
245261
return data
246262

247263

264+
def artifact_to_metadata_artifact(filename: str, artifact: Artifact) -> Artifact | None:
    """
    Creates artifact for metadata from the provided wheel artifact.

    The wheel is copied into a temporary directory under settings.WORKING_DIRECTORY,
    its metadata file is extracted with extract_wheel_metadata, and the extracted
    bytes are turned into an Artifact via Artifact.init_and_validate and saved.

    Args:
        filename: Name of the package file; only names ending in ".whl" are processed.
        artifact: Artifact holding the wheel file (read through ``artifact.file``).

    Returns:
        The saved metadata Artifact, the already existing one when saving raises
        IntegrityError, or None when the file is not a wheel or no metadata content
        could be extracted.
    """
    # Local import: only the savepoint around save() below needs it.
    from django.db import transaction

    if not filename.endswith(".whl"):
        return None

    with tempfile.TemporaryDirectory(dir=settings.WORKING_DIRECTORY) as temp_dir_name:
        # The filename is used as suffix so the temporary copy keeps its ".whl" ending,
        # which extract_wheel_metadata checks before opening the archive.
        with tempfile.NamedTemporaryFile(dir=temp_dir_name, suffix=filename) as temp_file:
            artifact.file.seek(0)
            shutil.copyfileobj(artifact.file, temp_file)
            temp_file.flush()
            metadata_content = extract_wheel_metadata(temp_file.name)
        if not metadata_content:
            return None

        with tempfile.NamedTemporaryFile(dir=temp_dir_name, suffix=".metadata") as temp_md:
            temp_md.write(metadata_content)
            temp_md.flush()
            metadata_artifact = Artifact.init_and_validate(temp_md.name)
            try:
                # Save inside a savepoint: if this function runs within an outer
                # transaction, a failed INSERT would otherwise leave that transaction
                # unusable and the fallback lookup below could not be executed.
                with transaction.atomic():
                    metadata_artifact.save()
            except IntegrityError:
                # An artifact with the same checksum already exists in this domain;
                # reuse it instead of failing.
                metadata_artifact = Artifact.objects.get(
                    sha256=metadata_artifact.sha256, pulp_domain=get_domain()
                )
            return metadata_artifact
291+
292+
248293
def fetch_json_release_metadata(name: str, version: str, remotes: set[Remote]) -> dict:
249294
"""
250295
Fetches metadata for a specific release from PyPI's JSON API. A release can contain
@@ -408,7 +453,9 @@ def find_artifact():
408453
_art = models.RemoteArtifact.objects.filter(content_artifact=content_artifact).first()
409454
return _art
410455

411-
content_artifact = content.contentartifact_set.first()
456+
content_artifact = content.contentartifact_set.exclude(
457+
relative_path__endswith=".metadata"
458+
).first()
412459
artifact = find_artifact()
413460
origin = settings.CONTENT_ORIGIN or settings.PYPI_API_HOSTNAME or ""
414461
origin = origin.strip("/")

pulp_python/tests/functional/api/test_crud_content_unit.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@
1010
PYTHON_EGG_FILENAME,
1111
PYTHON_EGG_URL,
1212
PYTHON_SM_FIXTURE_CHECKSUMS,
13+
PYTHON_WHEEL_FILENAME,
14+
PYTHON_WHEEL_URL,
1315
)
16+
from pulp_python.tests.functional.utils import ensure_metadata
1417

1518

1619
def test_content_crud(
@@ -179,3 +182,22 @@ def test_upload_metadata_24_spec(python_content_factory):
179182
assert content.license_expression == "MIT"
180183
assert content.license_file == '["LICENSE"]'
181184
break
185+
186+
187+
@pytest.mark.parallel
def test_package_creation_with_metadata(
    pulp_content_url,
    python_content_factory,
    python_distribution_factory,
    python_repo,
):
    """
    Creating a Python wheel package must also create a metadata artifact that is
    reachable through a distribution of the repository.
    """
    wheel_kwargs = {
        "repository": python_repo,
        "relative_path": PYTHON_WHEEL_FILENAME,
        "url": PYTHON_WHEEL_URL,
    }
    python_content_factory(**wheel_kwargs)
    distribution = python_distribution_factory(repository=python_repo)

    # Test that metadata is accessible
    ensure_metadata(pulp_content_url, distribution.base_path, PYTHON_WHEEL_FILENAME)

0 commit comments

Comments
 (0)