Skip to content

Commit 2a3742e

Browse files
authored
Merge pull request #1037 from gerrod3/prov-sync
Add provenance syncing
2 parents dd51ffd + 1083e9c commit 2a3742e

File tree

7 files changed

+110
-22
lines changed

7 files changed

+110
-22
lines changed

pulp_python/app/migrations/0018_packageprovenance.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Generated by Django 4.2.26 on 2025-11-10 09:11
1+
# Generated by Django 4.2.26 on 2025-12-01 19:49
22

33
from django.db import migrations, models
44
import django.db.models.deletion
@@ -51,4 +51,9 @@ class Migration(migrations.Migration):
5151
},
5252
bases=("core.content",),
5353
),
54+
migrations.AddField(
55+
model_name="pythonremote",
56+
name="provenance",
57+
field=models.BooleanField(default=False),
58+
),
5459
]

pulp_python/app/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,7 @@ class PythonRemote(Remote, AutoAddObjPermsMixin):
314314
exclude_platforms = ArrayField(
315315
models.CharField(max_length=10, blank=True), choices=PLATFORMS, default=list
316316
)
317+
provenance = models.BooleanField(default=False)
317318

318319
def get_remote_artifact_url(self, relative_path=None, request=None):
319320
"""Get url for remote_artifact"""

pulp_python/app/pypi/views.py

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
1-
import json
21
import logging
32

4-
from aiohttp.client_exceptions import ClientError
53
from rest_framework.viewsets import ViewSet
64
from rest_framework.renderers import BrowsableAPIRenderer, JSONRenderer, TemplateHTMLRenderer
75
from rest_framework.response import Response
@@ -27,12 +25,10 @@
2725
from packaging.utils import canonicalize_name
2826
from urllib.parse import urljoin, urlparse, urlunsplit
2927
from pathlib import PurePath
30-
from pypi_simple import ACCEPT_JSON_PREFERRED, ProjectPage
3128

3229
from pulpcore.plugin.viewsets import OperationPostponedResponse
3330
from pulpcore.plugin.tasking import dispatch
3431
from pulpcore.plugin.util import get_domain, get_url
35-
from pulpcore.plugin.exceptions import TimeoutException
3632
from pulp_python.app.models import (
3733
PythonDistribution,
3834
PythonPackageContent,
@@ -54,6 +50,7 @@
5450
PYPI_LAST_SERIAL,
5551
PYPI_SERIAL_CONSTANT,
5652
get_remote_package_filter,
53+
get_remote_simple_page,
5754
)
5855

5956
from pulp_python.app import tasks
@@ -332,20 +329,10 @@ def parse_package(release_package):
332329
if not rfilter.filter_project(package):
333330
return {}
334331

335-
url = remote.get_remote_artifact_url(f"simple/{package}/")
336-
remote.headers = remote.headers or []
337-
remote.headers.append({"Accept": ACCEPT_JSON_PREFERRED})
338-
downloader = remote.get_downloader(url=url, max_retries=1)
339-
try:
340-
d = downloader.fetch()
341-
except (ClientError, TimeoutException):
332+
page = get_remote_simple_page(package, remote)
333+
if not page:
342334
log.info(f"Failed to fetch {package} simple page from {remote.url}")
343335
return {}
344-
345-
if d.headers["content-type"] == PYPI_SIMPLE_V1_JSON:
346-
page = ProjectPage.from_json_data(json.load(open(d.path, "rb")), base_url=url)
347-
else:
348-
page = ProjectPage.from_html(package, open(d.path, "rb").read(), base_url=url)
349336
return {
350337
p.filename: parse_package(p)
351338
for p in page.packages

pulp_python/app/serializers.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -595,6 +595,11 @@ class PythonRemoteSerializer(core_serializers.RemoteSerializer):
595595
choices=python_models.PLATFORMS,
596596
default=list,
597597
)
598+
provenance = serializers.BooleanField(
599+
required=False,
600+
help_text=_("Whether to sync available provenances for Python packages."),
601+
default=False,
602+
)
598603

599604
def validate_includes(self, value):
600605
"""Validates the includes"""
@@ -626,6 +631,7 @@ class Meta:
626631
"package_types",
627632
"keep_latest_packages",
628633
"exclude_platforms",
634+
"provenance",
629635
)
630636
model = python_models.PythonRemote
631637

pulp_python/app/tasks/sync.py

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import asyncio
23

34
from aiohttp import ClientResponseError, ClientError
45
from lxml.etree import LxmlError
@@ -19,9 +20,11 @@
1920
from pulp_python.app.models import (
2021
PythonPackageContent,
2122
PythonRemote,
23+
PackageProvenance,
2224
)
23-
from pulp_python.app.utils import parse_metadata, PYPI_LAST_SERIAL
25+
from pulp_python.app.utils import parse_metadata, PYPI_LAST_SERIAL, aget_remote_simple_page
2426
from pypi_simple import IndexPage
27+
from pypi_attestations import Provenance
2528

2629
from bandersnatch.mirror import Mirror
2730
from bandersnatch.master import Master
@@ -163,6 +166,7 @@ def __init__(self, serial, master, workers, deferred_download, python_stage, pro
163166
self.python_stage = python_stage
164167
self.progress_report = progress_report
165168
self.deferred_download = deferred_download
169+
self.remote = self.python_stage.remote
166170

167171
async def determine_packages_to_sync(self):
168172
"""
@@ -194,8 +198,8 @@ async def determine_packages_to_sync(self):
194198
continue
195199
else:
196200
logger.info("Failed to get package list using XMLRPC, trying parse simple page.")
197-
url = urljoin(self.python_stage.remote.url, "simple/")
198-
downloader = self.python_stage.remote.get_downloader(url=url)
201+
url = urljoin(self.remote.url, "simple/")
202+
downloader = self.remote.get_downloader(url=url)
199203
result = await downloader.run()
200204
with open(result.path) as f:
201205
index = IndexPage.from_html(f.read())
@@ -224,6 +228,7 @@ async def create_content(self, pkg):
224228
Take the filtered package, separate into releases and
225229
create a Content Unit to put into the pipeline
226230
"""
231+
declared_contents = {}
227232
for version, dists in pkg.releases.items():
228233
for package in dists:
229234
entry = parse_metadata(pkg.info, version, package)
@@ -237,13 +242,44 @@ async def create_content(self, pkg):
237242
artifact=artifact,
238243
url=url,
239244
relative_path=entry["filename"],
240-
remote=self.python_stage.remote,
245+
remote=self.remote,
241246
deferred_download=self.deferred_download,
242247
)
243248
dc = DeclarativeContent(content=package, d_artifacts=[da])
244-
249+
declared_contents[entry["filename"]] = dc
245250
await self.python_stage.put(dc)
246251

252+
if pkg.releases and (page := await aget_remote_simple_page(pkg.name, self.remote)):
253+
if self.remote.provenance:
254+
await self.sync_provenance(page, declared_contents)
255+
256+
async def sync_provenance(self, page, declared_contents):
    """
    Download the provenance of each synced file and put it into the pipeline.

    Args:
        page: Simple page of the project. Each entry of ``page.packages`` is read for
            its ``filename`` and ``provenance_url``.
        declared_contents: Mapping of filename to the ``DeclarativeContent`` that was
            already put into the pipeline for that file. Only files present in this
            mapping are considered.
    """

    async def _create_provenance(filename, provenance_url):
        """Fetch one provenance file and declare a ``PackageProvenance`` for it."""
        downloader = self.remote.get_downloader(
            url=provenance_url, silence_errors_for_response_codes={404}
        )
        try:
            result = await downloader.run()
        except FileNotFoundError:
            # Best effort: a provenance that cannot be found is skipped, not fatal.
            return
        # The resolved package content is what the provenance gets attached to.
        package_content = await declared_contents[filename].resolution()
        # Provenance files are JSON, so decode them as UTF-8 explicitly instead of
        # relying on the locale-dependent default encoding of open().
        with open(result.path, encoding="utf-8") as f:
            provenance = Provenance.model_validate_json(f.read())
        prov_content = PackageProvenance(
            package=package_content, provenance=provenance.model_dump(mode="json")
        )
        prov_content.set_sha256_hook()
        await self.python_stage.put(DeclarativeContent(content=prov_content))

    # One coroutine per file that was declared in this sync and advertises a provenance.
    tasks = [
        _create_provenance(package.filename, package.provenance_url)
        for package in page.packages
        if package.filename in declared_contents and package.provenance_url
    ]
    await asyncio.gather(*tasks)
282+
247283
def finalize_sync(self, *args, **kwargs):
248284
"""No work to be done currently"""
249285
pass

pulp_python/app/utils.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,26 @@
55
import tempfile
66
import zipfile
77
import json
8+
from aiohttp.client_exceptions import ClientError
89
from collections import defaultdict
910
from django.conf import settings
1011
from django.utils import timezone
1112
from jinja2 import Template
1213
from packaging.utils import canonicalize_name
1314
from packaging.requirements import Requirement
1415
from packaging.version import parse, InvalidVersion
16+
from pypi_simple import ACCEPT_JSON_PREFERRED, ProjectPage
1517
from pulpcore.plugin.models import Remote
18+
from pulpcore.plugin.exceptions import TimeoutException
1619

1720

1821
PYPI_LAST_SERIAL = "X-PYPI-LAST-SERIAL"
1922
"""TODO This serial constant is temporary until Python repositories implements serials"""
2023
PYPI_SERIAL_CONSTANT = 1000000000
2124

2225
SIMPLE_API_VERSION = "1.1"
26+
PYPI_SIMPLE_V1_HTML = "application/vnd.pypi.simple.v1+html"
27+
PYPI_SIMPLE_V1_JSON = "application/vnd.pypi.simple.v1+json"
2328

2429
simple_index_template = """<!DOCTYPE html>
2530
<html>
@@ -576,3 +581,39 @@ def get_remote_package_filter(remote):
576581
rfilter = PackageIncludeFilter(remote)
577582
_remote_filters[remote.pulp_id] = (remote.pulp_last_updated, rfilter)
578583
return rfilter
584+
585+
586+
def _simple_page_downloader(package, remote, max_retries):
    """
    Build the URL and the downloader for a package's simple page on ``remote``.

    The JSON-preferring ``Accept`` header is added to ``remote.headers`` only when it
    is not already there, so repeated lookups through the same remote object (one per
    package during a sync) do not keep growing the header list.

    Returns:
        A ``(url, downloader)`` tuple.
    """
    url = remote.get_remote_artifact_url(f"simple/{package}/")
    accept_header = {"Accept": ACCEPT_JSON_PREFERRED}
    headers = remote.headers or []
    if accept_header not in headers:
        headers.append(accept_header)
    remote.headers = headers
    return url, remote.get_downloader(url=url, max_retries=max_retries)


def _parse_simple_page(package, result, url):
    """
    Parse a downloaded simple page into a ``ProjectPage``.

    The JSON parser is used when the response media type is ``PYPI_SIMPLE_V1_JSON``;
    every other response is parsed as HTML.
    """
    headers = result.headers or {}
    # Compare the bare media type only: servers may append parameters such as
    # "; charset=utf-8", and media types are case-insensitive.
    media_type = headers.get("content-type", "").split(";", 1)[0].strip().lower()
    # Context manager so the handle of the downloaded file is closed after parsing.
    with open(result.path, "rb") as fp:
        if media_type == PYPI_SIMPLE_V1_JSON:
            return ProjectPage.from_json_data(json.load(fp), base_url=url)
        return ProjectPage.from_html(package, fp.read(), base_url=url)


def get_remote_simple_page(package, remote, max_retries=1):
    """
    Get the simple page for a package from a remote (synchronous variant).

    Args:
        package: Name of the project whose ``simple/<package>/`` page is requested.
        remote: Remote used to build the URL and the downloader.
        max_retries: Passed through to ``remote.get_downloader``.

    Returns:
        The parsed ``ProjectPage``, or ``None`` when the download fails with a
        ``ClientError`` or ``TimeoutException``.
    """
    url, downloader = _simple_page_downloader(package, remote, max_retries)
    try:
        result = downloader.fetch()
    except (ClientError, TimeoutException):
        return None
    return _parse_simple_page(package, result, url)


async def aget_remote_simple_page(package, remote, max_retries=1):
    """
    Get the simple page for a package from a remote (asynchronous variant).

    Args:
        package: Name of the project whose ``simple/<package>/`` page is requested.
        remote: Remote used to build the URL and the downloader.
        max_retries: Passed through to ``remote.get_downloader``.

    Returns:
        The parsed ``ProjectPage``, or ``None`` when the download fails with a
        ``ClientError`` or ``TimeoutException``.
    """
    url, downloader = _simple_page_downloader(package, remote, max_retries)
    try:
        result = await downloader.run()
    except (ClientError, TimeoutException):
        return None
    return _parse_simple_page(package, result, url)

pulp_python/tests/functional/api/test_sync.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,3 +324,15 @@ def test_proxy_auth_sync(
324324

325325
content = python_bindings.ContentPackagesApi.list(repository_version=repo.latest_version_href)
326326
assert content.count == 2
327+
328+
329+
@pytest.mark.parallel
def test_sync_provenance(python_repo_with_sync, python_remote_factory, python_content_summary):
    """Sync from a remote created with provenance=True and check the resulting content counts."""
    provenance_remote = python_remote_factory(provenance=True, includes=["twine==6.0.0"])
    synced_repo = python_repo_with_sync(provenance_remote)
    version_href = synced_repo.latest_version_href
    # Presumably the href ends in "<version>/", making [-2] the version digit -- confirm.
    assert version_href[-2] == "1"

    present = python_content_summary(repository_version=version_href).present
    assert present["python.python"]["count"] == 2
    assert present["python.provenance"]["count"] == 2

0 commit comments

Comments
 (0)