From 1083e9c41beda1cc5f3f1ecbeb0eb86209a5e208 Mon Sep 17 00:00:00 2001 From: Gerrod Ubben Date: Mon, 1 Dec 2025 14:57:12 -0500 Subject: [PATCH] Add provenance syncing --- .../app/migrations/0018_packageprovenance.py | 7 ++- pulp_python/app/models.py | 1 + pulp_python/app/pypi/views.py | 19 ++------ pulp_python/app/serializers.py | 6 +++ pulp_python/app/tasks/sync.py | 46 +++++++++++++++++-- pulp_python/app/utils.py | 41 +++++++++++++++++ pulp_python/tests/functional/api/test_sync.py | 12 +++++ 7 files changed, 110 insertions(+), 22 deletions(-) diff --git a/pulp_python/app/migrations/0018_packageprovenance.py b/pulp_python/app/migrations/0018_packageprovenance.py index 2c172f70..2e7012cf 100644 --- a/pulp_python/app/migrations/0018_packageprovenance.py +++ b/pulp_python/app/migrations/0018_packageprovenance.py @@ -1,4 +1,4 @@ -# Generated by Django 4.2.26 on 2025-11-10 09:11 +# Generated by Django 4.2.26 on 2025-12-01 19:49 from django.db import migrations, models import django.db.models.deletion @@ -51,4 +51,9 @@ class Migration(migrations.Migration): }, bases=("core.content",), ), + migrations.AddField( + model_name="pythonremote", + name="provenance", + field=models.BooleanField(default=False), + ), ] diff --git a/pulp_python/app/models.py b/pulp_python/app/models.py index 4361489d..b11b8a32 100644 --- a/pulp_python/app/models.py +++ b/pulp_python/app/models.py @@ -314,6 +314,7 @@ class PythonRemote(Remote, AutoAddObjPermsMixin): exclude_platforms = ArrayField( models.CharField(max_length=10, blank=True), choices=PLATFORMS, default=list ) + provenance = models.BooleanField(default=False) def get_remote_artifact_url(self, relative_path=None, request=None): """Get url for remote_artifact""" diff --git a/pulp_python/app/pypi/views.py b/pulp_python/app/pypi/views.py index e2d76ec9..73faea37 100644 --- a/pulp_python/app/pypi/views.py +++ b/pulp_python/app/pypi/views.py @@ -1,7 +1,5 @@ -import json import logging -from aiohttp.client_exceptions import ClientError from rest_framework.viewsets import ViewSet from rest_framework.renderers import BrowsableAPIRenderer, JSONRenderer, TemplateHTMLRenderer from rest_framework.response import Response @@ -27,12 +25,10 @@ from packaging.utils import canonicalize_name from urllib.parse import urljoin, urlparse, urlunsplit from pathlib import PurePath -from pypi_simple import ACCEPT_JSON_PREFERRED, ProjectPage from pulpcore.plugin.viewsets import OperationPostponedResponse from pulpcore.plugin.tasking import dispatch from pulpcore.plugin.util import get_domain, get_url -from pulpcore.plugin.exceptions import TimeoutException from pulp_python.app.models import ( PythonDistribution, PythonPackageContent, @@ -54,6 +50,7 @@ PYPI_LAST_SERIAL, PYPI_SERIAL_CONSTANT, get_remote_package_filter, + get_remote_simple_page, ) from pulp_python.app import tasks @@ -332,20 +329,10 @@ def parse_package(release_package): if not rfilter.filter_project(package): return {} - url = remote.get_remote_artifact_url(f"simple/{package}/") - remote.headers = remote.headers or [] - remote.headers.append({"Accept": ACCEPT_JSON_PREFERRED}) - downloader = remote.get_downloader(url=url, max_retries=1) - try: - d = downloader.fetch() - except (ClientError, TimeoutException): + page = get_remote_simple_page(package, remote) + if not page: log.info(f"Failed to fetch {package} simple page from {remote.url}") return {} - - if d.headers["content-type"] == PYPI_SIMPLE_V1_JSON: - page = ProjectPage.from_json_data(json.load(open(d.path, "rb")), base_url=url) - else: - page = ProjectPage.from_html(package, open(d.path, "rb").read(), base_url=url) return { p.filename: parse_package(p) for p in page.packages diff --git a/pulp_python/app/serializers.py b/pulp_python/app/serializers.py index a16310fe..a59e9932 100644 --- a/pulp_python/app/serializers.py +++ b/pulp_python/app/serializers.py @@ -595,6 +595,11 @@ class PythonRemoteSerializer(core_serializers.RemoteSerializer): choices=python_models.PLATFORMS, default=list, ) + provenance = serializers.BooleanField( + required=False, + help_text=_("Whether to sync available provenances for Python packages."), + default=False, + ) def validate_includes(self, value): """Validates the includes""" @@ -626,6 +631,7 @@ class Meta: "package_types", "keep_latest_packages", "exclude_platforms", + "provenance", ) model = python_models.PythonRemote diff --git a/pulp_python/app/tasks/sync.py b/pulp_python/app/tasks/sync.py index e3ecb108..d7058e8e 100644 --- a/pulp_python/app/tasks/sync.py +++ b/pulp_python/app/tasks/sync.py @@ -1,4 +1,5 @@ import logging +import asyncio from aiohttp import ClientResponseError, ClientError from lxml.etree import LxmlError @@ -19,9 +20,11 @@ from pulp_python.app.models import ( PythonPackageContent, PythonRemote, + PackageProvenance, ) -from pulp_python.app.utils import parse_metadata, PYPI_LAST_SERIAL +from pulp_python.app.utils import parse_metadata, PYPI_LAST_SERIAL, aget_remote_simple_page from pypi_simple import IndexPage +from pypi_attestations import Provenance from bandersnatch.mirror import Mirror from bandersnatch.master import Master @@ -163,6 +166,7 @@ def __init__(self, serial, master, workers, deferred_download, python_stage, pro self.python_stage = python_stage self.progress_report = progress_report self.deferred_download = deferred_download + self.remote = self.python_stage.remote async def determine_packages_to_sync(self): """ @@ -194,8 +198,8 @@ async def determine_packages_to_sync(self): continue else: logger.info("Failed to get package list using XMLRPC, trying parse simple page.") - url = urljoin(self.python_stage.remote.url, "simple/") - downloader = self.python_stage.remote.get_downloader(url=url) + url = urljoin(self.remote.url, "simple/") + downloader = self.remote.get_downloader(url=url) result = await downloader.run() with open(result.path) as f: index = IndexPage.from_html(f.read()) @@ -224,6 +228,7 @@ async def create_content(self, pkg): Take the filtered package, separate into releases and create a Content Unit to put into the pipeline """ + declared_contents = {} for version, dists in pkg.releases.items(): for package in dists: entry = parse_metadata(pkg.info, version, package) @@ -237,13 +242,44 @@ async def create_content(self, pkg): artifact=artifact, url=url, relative_path=entry["filename"], - remote=self.python_stage.remote, + remote=self.remote, deferred_download=self.deferred_download, ) dc = DeclarativeContent(content=package, d_artifacts=[da]) - + declared_contents[entry["filename"]] = dc await self.python_stage.put(dc) + if pkg.releases and (page := await aget_remote_simple_page(pkg.name, self.remote)): + if self.remote.provenance: + await self.sync_provenance(page, declared_contents) + + async def sync_provenance(self, page, declared_contents): + """Sync the provenance for the package""" + + async def _create_provenance(filename, provenance_url): + downloader = self.remote.get_downloader( + url=provenance_url, silence_errors_for_response_codes={404} + ) + try: + result = await downloader.run() + except FileNotFoundError: + pass + else: + package_content = await declared_contents[filename].resolution() + with open(result.path) as f: + provenance = Provenance.model_validate_json(f.read()) + prov_content = PackageProvenance( + package=package_content, provenance=provenance.model_dump(mode="json") + ) + prov_content.set_sha256_hook() + await self.python_stage.put(DeclarativeContent(content=prov_content)) + + tasks = [] + for package in page.packages: + if package.filename in declared_contents and package.provenance_url: + tasks.append(_create_provenance(package.filename, package.provenance_url)) + await asyncio.gather(*tasks) + def finalize_sync(self, *args, **kwargs): """No work to be done currently""" pass diff --git a/pulp_python/app/utils.py b/pulp_python/app/utils.py index cb918001..50aa9cae 100644 --- a/pulp_python/app/utils.py +++ b/pulp_python/app/utils.py @@ -5,6 +5,7 @@ import tempfile import zipfile import json +from aiohttp.client_exceptions import ClientError from collections import defaultdict from django.conf import settings from django.utils import timezone @@ -12,7 +13,9 @@ from packaging.utils import canonicalize_name from packaging.requirements import Requirement from packaging.version import parse, InvalidVersion +from pypi_simple import ACCEPT_JSON_PREFERRED, ProjectPage from pulpcore.plugin.models import Remote +from pulpcore.plugin.exceptions import TimeoutException PYPI_LAST_SERIAL = "X-PYPI-LAST-SERIAL" @@ -20,6 +23,8 @@ PYPI_SERIAL_CONSTANT = 1000000000 SIMPLE_API_VERSION = "1.1" +PYPI_SIMPLE_V1_HTML = "application/vnd.pypi.simple.v1+html" +PYPI_SIMPLE_V1_JSON = "application/vnd.pypi.simple.v1+json" simple_index_template = """ @@ -576,3 +581,39 @@ def get_remote_package_filter(remote): rfilter = PackageIncludeFilter(remote) _remote_filters[remote.pulp_id] = (remote.pulp_last_updated, rfilter) return rfilter + + +def get_remote_simple_page(package, remote, max_retries=1): + """Gets the simple page for a package from a remote.""" + url = remote.get_remote_artifact_url(f"simple/{package}/") + remote.headers = remote.headers or [] + remote.headers.append({"Accept": ACCEPT_JSON_PREFERRED}) + downloader = remote.get_downloader(url=url, max_retries=max_retries) + try: + d = downloader.fetch() + except (ClientError, TimeoutException): + return None + + if d.headers["content-type"] == PYPI_SIMPLE_V1_JSON: + page = ProjectPage.from_json_data(json.load(open(d.path, "rb")), base_url=url) + else: + page = ProjectPage.from_html(package, open(d.path, "rb").read(), base_url=url) + return page + + +async def aget_remote_simple_page(package, remote, max_retries=1): + """Gets the simple page for a package from a remote.""" + url = remote.get_remote_artifact_url(f"simple/{package}/") + remote.headers = remote.headers or [] + remote.headers.append({"Accept": ACCEPT_JSON_PREFERRED}) + downloader = remote.get_downloader(url=url, max_retries=max_retries) + try: + d = await downloader.run() + except (ClientError, TimeoutException): + return None + + if d.headers["content-type"] == PYPI_SIMPLE_V1_JSON: + page = ProjectPage.from_json_data(json.load(open(d.path, "rb")), base_url=url) + else: + page = ProjectPage.from_html(package, open(d.path, "rb").read(), base_url=url) + return page diff --git a/pulp_python/tests/functional/api/test_sync.py b/pulp_python/tests/functional/api/test_sync.py index 5069b108..c8030b8a 100644 --- a/pulp_python/tests/functional/api/test_sync.py +++ b/pulp_python/tests/functional/api/test_sync.py @@ -324,3 +324,15 @@ def test_proxy_auth_sync( content = python_bindings.ContentPackagesApi.list(repository_version=repo.latest_version_href) assert content.count == 2 + + +@pytest.mark.parallel +def test_sync_provenance(python_repo_with_sync, python_remote_factory, python_content_summary): + """Test syncing with provenance.""" + remote = python_remote_factory(provenance=True, includes=["twine==6.0.0"]) + repo = python_repo_with_sync(remote) + assert repo.latest_version_href[-2] == "1" + + summary = python_content_summary(repository_version=repo.latest_version_href) + assert summary.present["python.python"]["count"] == 2 + assert summary.present["python.provenance"]["count"] == 2