Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion pulp_python/app/migrations/0018_packageprovenance.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by Django 4.2.26 on 2025-11-10 09:11
# Generated by Django 4.2.26 on 2025-12-01 19:49

from django.db import migrations, models
import django.db.models.deletion
Expand Down Expand Up @@ -51,4 +51,9 @@ class Migration(migrations.Migration):
},
bases=("core.content",),
),
migrations.AddField(
model_name="pythonremote",
name="provenance",
field=models.BooleanField(default=False),
),
]
1 change: 1 addition & 0 deletions pulp_python/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,7 @@ class PythonRemote(Remote, AutoAddObjPermsMixin):
exclude_platforms = ArrayField(
models.CharField(max_length=10, blank=True), choices=PLATFORMS, default=list
)
provenance = models.BooleanField(default=False)

def get_remote_artifact_url(self, relative_path=None, request=None):
"""Get url for remote_artifact"""
Expand Down
19 changes: 3 additions & 16 deletions pulp_python/app/pypi/views.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import json
import logging

from aiohttp.client_exceptions import ClientError
from rest_framework.viewsets import ViewSet
from rest_framework.renderers import BrowsableAPIRenderer, JSONRenderer, TemplateHTMLRenderer
from rest_framework.response import Response
Expand All @@ -27,12 +25,10 @@
from packaging.utils import canonicalize_name
from urllib.parse import urljoin, urlparse, urlunsplit
from pathlib import PurePath
from pypi_simple import ACCEPT_JSON_PREFERRED, ProjectPage

from pulpcore.plugin.viewsets import OperationPostponedResponse
from pulpcore.plugin.tasking import dispatch
from pulpcore.plugin.util import get_domain, get_url
from pulpcore.plugin.exceptions import TimeoutException
from pulp_python.app.models import (
PythonDistribution,
PythonPackageContent,
Expand All @@ -54,6 +50,7 @@
PYPI_LAST_SERIAL,
PYPI_SERIAL_CONSTANT,
get_remote_package_filter,
get_remote_simple_page,
)

from pulp_python.app import tasks
Expand Down Expand Up @@ -332,20 +329,10 @@ def parse_package(release_package):
if not rfilter.filter_project(package):
return {}

url = remote.get_remote_artifact_url(f"simple/{package}/")
remote.headers = remote.headers or []
remote.headers.append({"Accept": ACCEPT_JSON_PREFERRED})
downloader = remote.get_downloader(url=url, max_retries=1)
try:
d = downloader.fetch()
except (ClientError, TimeoutException):
page = get_remote_simple_page(package, remote)
if not page:
log.info(f"Failed to fetch {package} simple page from {remote.url}")
return {}

if d.headers["content-type"] == PYPI_SIMPLE_V1_JSON:
page = ProjectPage.from_json_data(json.load(open(d.path, "rb")), base_url=url)
else:
page = ProjectPage.from_html(package, open(d.path, "rb").read(), base_url=url)
return {
p.filename: parse_package(p)
for p in page.packages
Expand Down
6 changes: 6 additions & 0 deletions pulp_python/app/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,11 @@ class PythonRemoteSerializer(core_serializers.RemoteSerializer):
choices=python_models.PLATFORMS,
default=list,
)
provenance = serializers.BooleanField(
required=False,
help_text=_("Whether to sync available provenances for Python packages."),
default=False,
)

def validate_includes(self, value):
"""Validates the includes"""
Expand Down Expand Up @@ -626,6 +631,7 @@ class Meta:
"package_types",
"keep_latest_packages",
"exclude_platforms",
"provenance",
)
model = python_models.PythonRemote

Expand Down
46 changes: 41 additions & 5 deletions pulp_python/app/tasks/sync.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import asyncio

from aiohttp import ClientResponseError, ClientError
from lxml.etree import LxmlError
Expand All @@ -19,9 +20,11 @@
from pulp_python.app.models import (
PythonPackageContent,
PythonRemote,
PackageProvenance,
)
from pulp_python.app.utils import parse_metadata, PYPI_LAST_SERIAL
from pulp_python.app.utils import parse_metadata, PYPI_LAST_SERIAL, aget_remote_simple_page
from pypi_simple import IndexPage
from pypi_attestations import Provenance

from bandersnatch.mirror import Mirror
from bandersnatch.master import Master
Expand Down Expand Up @@ -163,6 +166,7 @@ def __init__(self, serial, master, workers, deferred_download, python_stage, pro
self.python_stage = python_stage
self.progress_report = progress_report
self.deferred_download = deferred_download
self.remote = self.python_stage.remote

async def determine_packages_to_sync(self):
"""
Expand Down Expand Up @@ -194,8 +198,8 @@ async def determine_packages_to_sync(self):
continue
else:
logger.info("Failed to get package list using XMLRPC, trying parse simple page.")
url = urljoin(self.python_stage.remote.url, "simple/")
downloader = self.python_stage.remote.get_downloader(url=url)
url = urljoin(self.remote.url, "simple/")
downloader = self.remote.get_downloader(url=url)
result = await downloader.run()
with open(result.path) as f:
index = IndexPage.from_html(f.read())
Expand Down Expand Up @@ -224,6 +228,7 @@ async def create_content(self, pkg):
Take the filtered package, separate into releases and
create a Content Unit to put into the pipeline
"""
declared_contents = {}
for version, dists in pkg.releases.items():
for package in dists:
entry = parse_metadata(pkg.info, version, package)
Expand All @@ -237,13 +242,44 @@ async def create_content(self, pkg):
artifact=artifact,
url=url,
relative_path=entry["filename"],
remote=self.python_stage.remote,
remote=self.remote,
deferred_download=self.deferred_download,
)
dc = DeclarativeContent(content=package, d_artifacts=[da])

declared_contents[entry["filename"]] = dc
await self.python_stage.put(dc)

if pkg.releases and (page := await aget_remote_simple_page(pkg.name, self.remote)):
if self.remote.provenance:
await self.sync_provenance(page, declared_contents)

async def sync_provenance(self, page, declared_contents):
    """
    Sync the provenance for the package.

    For every file listed on ``page`` that was declared during this sync and that
    advertises a provenance URL, download the provenance document and put a
    ``PackageProvenance`` content unit into the pipeline.

    Args:
        page: The project's simple page; each entry of ``page.packages`` is read for
            its ``filename`` and ``provenance_url``.
        declared_contents: Mapping of filename to the ``DeclarativeContent`` that was
            already put into the pipeline for that file.
    """

    async def _create_provenance(filename, provenance_url):
        """Download one provenance document and emit it as content for its package."""
        downloader = self.remote.get_downloader(
            url=provenance_url, silence_errors_for_response_codes={404}
        )
        try:
            result = await downloader.run()
        except FileNotFoundError:
            # Best effort: presumably raised when the provenance does not exist on the
            # remote (404 is silenced above); skip it instead of failing the sync.
            return

        # Wait for the package itself to be saved so the provenance can reference it.
        package_content = await declared_contents[filename].resolution()
        # Provenance documents are JSON; decode explicitly as UTF-8 rather than relying
        # on the locale's default encoding.
        with open(result.path, encoding="utf-8") as f:
            provenance = Provenance.model_validate_json(f.read())
        prov_content = PackageProvenance(
            package=package_content, provenance=provenance.model_dump(mode="json")
        )
        prov_content.set_sha256_hook()
        await self.python_stage.put(DeclarativeContent(content=prov_content))

    tasks = [
        _create_provenance(package.filename, package.provenance_url)
        for package in page.packages
        if package.filename in declared_contents and package.provenance_url
    ]
    await asyncio.gather(*tasks)

def finalize_sync(self, *args, **kwargs):
"""No work to be done currently"""
pass
Expand Down
41 changes: 41 additions & 0 deletions pulp_python/app/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,26 @@
import tempfile
import zipfile
import json
from aiohttp.client_exceptions import ClientError
from collections import defaultdict
from django.conf import settings
from django.utils import timezone
from jinja2 import Template
from packaging.utils import canonicalize_name
from packaging.requirements import Requirement
from packaging.version import parse, InvalidVersion
from pypi_simple import ACCEPT_JSON_PREFERRED, ProjectPage
from pulpcore.plugin.models import Remote
from pulpcore.plugin.exceptions import TimeoutException


PYPI_LAST_SERIAL = "X-PYPI-LAST-SERIAL"
"""TODO This serial constant is temporary until Python repositories implement serials"""
PYPI_SERIAL_CONSTANT = 1000000000

SIMPLE_API_VERSION = "1.1"
PYPI_SIMPLE_V1_HTML = "application/vnd.pypi.simple.v1+html"
PYPI_SIMPLE_V1_JSON = "application/vnd.pypi.simple.v1+json"

simple_index_template = """<!DOCTYPE html>
<html>
Expand Down Expand Up @@ -576,3 +581,39 @@ def get_remote_package_filter(remote):
rfilter = PackageIncludeFilter(remote)
_remote_filters[remote.pulp_id] = (remote.pulp_last_updated, rfilter)
return rfilter


def _ensure_json_accept_header(remote):
    """Add the JSON-preferred ``Accept`` header to ``remote.headers`` if it is not there yet."""
    accept = {"Accept": ACCEPT_JSON_PREFERRED}
    remote.headers = remote.headers or []
    # The same remote is reused for many packages; appending unconditionally would grow
    # the header list by one identical entry per call.
    if accept not in remote.headers:
        remote.headers.append(accept)


def _parse_simple_page(package, download_result, url):
    """Build a ``ProjectPage`` from a downloaded simple page, choosing JSON or HTML parsing."""
    headers = download_result.headers or {}
    # Compare only the media type so parameters such as "; charset=utf-8" do not defeat
    # the JSON detection. A missing header falls through to HTML parsing.
    media_type = headers.get("content-type", "").split(";", 1)[0].strip().lower()
    with open(download_result.path, "rb") as f:
        if media_type == PYPI_SIMPLE_V1_JSON:
            return ProjectPage.from_json_data(json.load(f), base_url=url)
        return ProjectPage.from_html(package, f.read(), base_url=url)


def get_remote_simple_page(package, remote, max_retries=1):
    """
    Gets the simple page for a package from a remote.

    Args:
        package: Project name used to build the ``simple/{package}/`` URL.
        remote: Remote providing ``get_remote_artifact_url`` and ``get_downloader``; its
            ``headers`` gain the JSON-preferred ``Accept`` entry if it is not present.
        max_retries: Passed through to ``remote.get_downloader``.

    Returns:
        The parsed ``ProjectPage``, or ``None`` when the download raised ``ClientError``
        or ``TimeoutException``.
    """
    url = remote.get_remote_artifact_url(f"simple/{package}/")
    _ensure_json_accept_header(remote)
    downloader = remote.get_downloader(url=url, max_retries=max_retries)
    try:
        result = downloader.fetch()
    except (ClientError, TimeoutException):
        return None
    return _parse_simple_page(package, result, url)


async def aget_remote_simple_page(package, remote, max_retries=1):
    """
    Gets the simple page for a package from a remote (async variant).

    Same arguments and return value as ``get_remote_simple_page``; the download is
    awaited via ``downloader.run()`` instead of the blocking ``downloader.fetch()``.
    """
    url = remote.get_remote_artifact_url(f"simple/{package}/")
    _ensure_json_accept_header(remote)
    downloader = remote.get_downloader(url=url, max_retries=max_retries)
    try:
        result = await downloader.run()
    except (ClientError, TimeoutException):
        return None
    return _parse_simple_page(package, result, url)
12 changes: 12 additions & 0 deletions pulp_python/tests/functional/api/test_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,3 +324,15 @@ def test_proxy_auth_sync(

content = python_bindings.ContentPackagesApi.list(repository_version=repo.latest_version_href)
assert content.count == 2


@pytest.mark.parallel
def test_sync_provenance(python_repo_with_sync, python_remote_factory, python_content_summary):
    """Sync one pinned release with provenance enabled and check both content types arrive."""
    provenance_remote = python_remote_factory(provenance=True, includes=["twine==6.0.0"])
    synced_repo = python_repo_with_sync(provenance_remote)

    # The second-to-last character of the version href is expected to be the version number.
    version_href = synced_repo.latest_version_href
    assert version_href[-2] == "1"

    present = python_content_summary(repository_version=version_href).present
    assert present["python.python"]["count"] == 2
    assert present["python.provenance"]["count"] == 2