diff --git a/CHANGES/625.feature b/CHANGES/625.feature new file mode 100644 index 00000000..38928e60 --- /dev/null +++ b/CHANGES/625.feature @@ -0,0 +1 @@ +Added JSON-based Simple API (PEP 691). diff --git a/pulp_python/app/migrations/0016_pythonpackagecontent_metadata_sha256.py b/pulp_python/app/migrations/0016_pythonpackagecontent_metadata_sha256.py new file mode 100644 index 00000000..0dc68d31 --- /dev/null +++ b/pulp_python/app/migrations/0016_pythonpackagecontent_metadata_sha256.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.25 on 2025-11-04 07:34 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("python", "0015_alter_pythonpackagecontent_options"), + ] + + operations = [ + migrations.AddField( + model_name="pythonpackagecontent", + name="metadata_sha256", + field=models.CharField(max_length=64, null=True), + ), + ] diff --git a/pulp_python/app/models.py b/pulp_python/app/models.py index 3bd9d605..14059397 100644 --- a/pulp_python/app/models.py +++ b/pulp_python/app/models.py @@ -192,6 +192,8 @@ class PythonPackageContent(Content): packagetype = models.TextField(choices=PACKAGE_TYPES) python_version = models.TextField() sha256 = models.CharField(db_index=True, max_length=64) + metadata_sha256 = models.CharField(max_length=64, null=True) + # yanked and yanked_reason are not implemented because they are mutable # From pulpcore PROTECTED_FROM_RECLAIM = False diff --git a/pulp_python/app/pypi/views.py b/pulp_python/app/pypi/views.py index bd8bc2af..c0b636bb 100644 --- a/pulp_python/app/pypi/views.py +++ b/pulp_python/app/pypi/views.py @@ -3,7 +3,9 @@ from aiohttp.client_exceptions import ClientError from rest_framework.viewsets import ViewSet +from rest_framework.renderers import BrowsableAPIRenderer, JSONRenderer, TemplateHTMLRenderer from rest_framework.response import Response +from rest_framework.exceptions import NotAcceptable from django.core.exceptions import ObjectDoesNotExist from 
django.shortcuts import redirect from datetime import datetime, timezone, timedelta @@ -43,7 +45,9 @@ ) from pulp_python.app.utils import ( write_simple_index, + write_simple_index_json, write_simple_detail, + write_simple_detail_json, python_content_to_json, PYPI_LAST_SERIAL, PYPI_SERIAL_CONSTANT, @@ -57,6 +61,17 @@ ORIGIN_HOST = settings.CONTENT_ORIGIN if settings.CONTENT_ORIGIN else settings.PYPI_API_HOSTNAME BASE_CONTENT_URL = urljoin(ORIGIN_HOST, settings.CONTENT_PATH_PREFIX) +PYPI_SIMPLE_V1_HTML = "application/vnd.pypi.simple.v1+html" +PYPI_SIMPLE_V1_JSON = "application/vnd.pypi.simple.v1+json" + + +class PyPISimpleHTMLRenderer(TemplateHTMLRenderer): + media_type = PYPI_SIMPLE_V1_HTML + + +class PyPISimpleJSONRenderer(JSONRenderer): + media_type = PYPI_SIMPLE_V1_JSON + class PyPIMixin: """Mixin to get index specific info.""" @@ -235,6 +250,25 @@ class SimpleView(PackageUploadMixin, ViewSet): ], } + def perform_content_negotiation(self, request, force=False): + """ + Uses standard content negotiation, defaulting to HTML if no acceptable renderer is found. + """ + try: + return super().perform_content_negotiation(request, force) + except NotAcceptable: + return TemplateHTMLRenderer(), TemplateHTMLRenderer.media_type # text/html + + def get_renderers(self): + """ + Uses custom renderers for PyPI Simple API endpoints, defaulting to standard ones. 
+ """ + if self.action in ["list", "retrieve"]: + # Ordered by priority if multiple content types are present + return [TemplateHTMLRenderer(), PyPISimpleHTMLRenderer(), PyPISimpleJSONRenderer()] + else: + return [JSONRenderer(), BrowsableAPIRenderer()] + @extend_schema(summary="Get index simple page") def list(self, request, path): """Gets the simple api html page for the index.""" @@ -242,9 +276,18 @@ def list(self, request, path): if self.should_redirect(repo_version=repo_version): return redirect(urljoin(self.base_content_url, f"{path}/simple/")) names = content.order_by("name").values_list("name", flat=True).distinct().iterator() - return StreamingHttpResponse(write_simple_index(names, streamed=True)) + media_type = request.accepted_renderer.media_type + headers = {"X-PyPI-Last-Serial": str(PYPI_SERIAL_CONSTANT)} + + if media_type == PYPI_SIMPLE_V1_JSON: + index_data = write_simple_index_json(names) + return Response(index_data, headers=headers) + else: + index_data = write_simple_index(names, streamed=True) + kwargs = {"content_type": media_type, "headers": headers} + return StreamingHttpResponse(index_data, **kwargs) - def pull_through_package_simple(self, package, path, remote): + def pull_through_package_simple(self, package, path, remote, media_type): """Gets the package's simple page from remote.""" def parse_package(release_package): @@ -252,7 +295,13 @@ def parse_package(release_package): stripped_url = urlunsplit(chain(parsed[:3], ("", ""))) redirect_path = f"{path}/{release_package.filename}?redirect={stripped_url}" d_url = urljoin(self.base_content_url, redirect_path) - return release_package.filename, d_url, release_package.digests.get("sha256", "") + return { + "filename": release_package.filename, + "url": d_url, + "sha256": release_package.digests.get("sha256", ""), + "requires_python": release_package.requires_python, + "metadata_sha256": (release_package.metadata_digests or {}).get("sha256"), + } rfilter = get_remote_package_filter(remote) if 
not rfilter.filter_project(package): @@ -269,28 +318,40 @@ def parse_package(release_package): except TimeoutException: return HttpResponse(f"{remote.url} timed out while fetching {package}.", status=504) - if d.headers["content-type"] == "application/vnd.pypi.simple.v1+json": + if d.headers["content-type"] == PYPI_SIMPLE_V1_JSON: page = ProjectPage.from_json_data(json.load(open(d.path, "rb")), base_url=url) else: page = ProjectPage.from_html(package, open(d.path, "rb").read(), base_url=url) packages = [ parse_package(p) for p in page.packages if rfilter.filter_release(package, p.version) ] - return HttpResponse(write_simple_detail(package, packages)) + headers = {"X-PyPI-Last-Serial": str(PYPI_SERIAL_CONSTANT)} + + if media_type == PYPI_SIMPLE_V1_JSON: + detail_data = write_simple_detail_json(package, packages) + return Response(detail_data, headers=headers) + else: + detail_data = write_simple_detail(package, packages) + kwargs = {"content_type": media_type, "headers": headers} + return HttpResponse(detail_data, **kwargs) @extend_schema(operation_id="pypi_simple_package_read", summary="Get package simple page") def retrieve(self, request, path, package): - """Retrieves the simple api html page for a package.""" + """Retrieves the simple api html/json page for a package.""" + media_type = request.accepted_renderer.media_type + repo_ver, content = self.get_rvc() # Should I redirect if the normalized name is different? 
normalized = canonicalize_name(package) if self.distribution.remote: - return self.pull_through_package_simple(normalized, path, self.distribution.remote) + return self.pull_through_package_simple( + normalized, path, self.distribution.remote, media_type + ) if self.should_redirect(repo_version=repo_ver): return redirect(urljoin(self.base_content_url, f"{path}/simple/{normalized}/")) packages = ( content.filter(name__normalize=normalized) - .values_list("filename", "sha256", "name") + .values_list("filename", "sha256", "name", "metadata_sha256", "requires_python") .iterator() ) try: @@ -300,8 +361,26 @@ def retrieve(self, request, path, package): else: packages = chain([present], packages) name = present[2] - releases = ((f, urljoin(self.base_content_url, f"{path}/{f}"), d) for f, d, _ in packages) - return StreamingHttpResponse(write_simple_detail(name, releases, streamed=True)) + releases = ( + { + "filename": filename, + "url": urljoin(self.base_content_url, f"{path}/{filename}"), + "sha256": sha256, + "metadata_sha256": metadata_sha256, + "requires_python": requires_python, + } + for filename, sha256, _, metadata_sha256, requires_python in packages + ) + media_type = request.accepted_renderer.media_type + headers = {"X-PyPI-Last-Serial": str(PYPI_SERIAL_CONSTANT)} + + if media_type == PYPI_SIMPLE_V1_JSON: + detail_data = write_simple_detail_json(name, releases) + return Response(detail_data, headers=headers) + else: + detail_data = write_simple_detail(name, releases, streamed=True) + kwargs = {"content_type": media_type, "headers": headers} + return StreamingHttpResponse(detail_data, **kwargs) @extend_schema( request=PackageUploadSerializer, diff --git a/pulp_python/app/serializers.py b/pulp_python/app/serializers.py index d8387adf..2ba95ba4 100644 --- a/pulp_python/app/serializers.py +++ b/pulp_python/app/serializers.py @@ -281,6 +281,11 @@ class PythonPackageContentSerializer(core_serializers.SingleArtifactContentUploa default="", help_text=_("The SHA256 
digest of this package."), ) + metadata_sha256 = serializers.CharField( + required=False, + allow_null=True, + help_text=_("The SHA256 digest of the package's METADATA file."), + ) def deferred_validate(self, data): """ @@ -364,6 +369,7 @@ class Meta: "packagetype", "python_version", "sha256", + "metadata_sha256", ) model = python_models.PythonPackageContent diff --git a/pulp_python/app/tasks/publish.py b/pulp_python/app/tasks/publish.py index 3ab44501..39102eb5 100644 --- a/pulp_python/app/tasks/publish.py +++ b/pulp_python/app/tasks/publish.py @@ -101,7 +101,7 @@ def write_simple_api(publication): relative_path = release["filename"] path = f"../../{relative_path}" checksum = release["sha256"] - package_releases.append((relative_path, path, checksum)) + package_releases.append({"filename": relative_path, "url": path, "sha256": checksum}) # Write the final project's page write_project_page( name=canonicalize_name(current_name), diff --git a/pulp_python/app/utils.py b/pulp_python/app/utils.py index 533caba8..365de503 100644 --- a/pulp_python/app/utils.py +++ b/pulp_python/app/utils.py @@ -1,7 +1,9 @@ +import hashlib import pkginfo import re import shutil import tempfile +import zipfile import json from collections import defaultdict from django.conf import settings @@ -16,15 +18,17 @@ """TODO This serial constant is temporary until Python repositories implements serials""" PYPI_SERIAL_CONSTANT = 1000000000 +SIMPLE_API_VERSION = "1.0" + simple_index_template = """ Simple Index - + {% for name, canonical_name in projects %} - {{ name }}
+ {{ name }}
{% endfor %} @@ -32,16 +36,16 @@ simple_detail_template = """ - - Links for {{ project_name }} - - - + + Links for {{ project_name }} + + +

Links for {{ project_name }}

- {% for name, path, sha256 in project_packages %} - {{ name }}
+ {% for pkg in project_packages %} + {{ pkg.filename }}
{% endfor %} - + """ @@ -128,6 +132,7 @@ def parse_project_metadata(project): # Release metadata "packagetype": project.get("packagetype") or "", "python_version": project.get("python_version") or "", + "metadata_sha256": project.get("metadata_sha256"), } @@ -154,10 +159,8 @@ def parse_metadata(project, version, distribution): package["version"] = version package["url"] = distribution.get("url") or "" package["sha256"] = distribution.get("digests", {}).get("sha256") or "" - package["python_version"] = distribution.get("python_version") or package.get("python_version") - package["requires_python"] = distribution.get("requires_python") or package.get( - "requires_python" - ) # noqa: E501 + package["python_version"] = distribution.get("python_version") or "" + package["requires_python"] = distribution.get("requires_python") or "" return package @@ -175,6 +178,7 @@ def get_project_metadata_from_file(filename): packagetype = DIST_EXTENSIONS[extensions[pkg_type_index]] metadata = DIST_TYPES[packagetype](filename) + metadata.metadata_sha256 = compute_metadata_sha256(filename) metadata.packagetype = packagetype if packagetype == "sdist": metadata.python_version = "source" @@ -187,6 +191,25 @@ def get_project_metadata_from_file(filename): return metadata +def compute_metadata_sha256(filename: str) -> str | None: + """ + Compute SHA256 hash of the metadata file from a Python package. + + Returns SHA256 hash or None if metadata cannot be extracted. + """ + if not filename.endswith(".whl"): + return None + try: + with zipfile.ZipFile(filename, "r") as f: + for file_path in f.namelist(): + if file_path.endswith(".dist-info/METADATA"): + metadata_content = f.read(file_path) + return hashlib.sha256(metadata_content).hexdigest() + except (zipfile.BadZipFile, KeyError, OSError): + pass + return None + + def artifact_to_python_content_data(filename, artifact, domain=None): """ Takes the artifact/filename and returns the metadata needed to create a PythonPackageContent. 
@@ -403,17 +426,65 @@ def find_artifact(): def write_simple_index(project_names, streamed=False): """Writes the simple index.""" simple = Template(simple_index_template) - context = {"projects": ((x, canonicalize_name(x)) for x in project_names)} + context = { + "SIMPLE_API_VERSION": SIMPLE_API_VERSION, + "projects": ((x, canonicalize_name(x)) for x in project_names), + } return simple.stream(**context) if streamed else simple.render(**context) def write_simple_detail(project_name, project_packages, streamed=False): """Writes the simple detail page of a package.""" detail = Template(simple_detail_template) - context = {"project_name": project_name, "project_packages": project_packages} + context = { + "SIMPLE_API_VERSION": SIMPLE_API_VERSION, + "project_name": project_name, + "project_packages": project_packages, + } return detail.stream(**context) if streamed else detail.render(**context) +def write_simple_index_json(project_names): + """Writes the simple index in JSON format.""" + return { + "meta": {"api-version": SIMPLE_API_VERSION, "_last-serial": PYPI_SERIAL_CONSTANT}, + "projects": [ + {"name": name, "_last-serial": PYPI_SERIAL_CONSTANT} for name in project_names + ], + } + + +def write_simple_detail_json(project_name, project_packages): + """Writes the simple detail page in JSON format.""" + return { + "meta": {"api-version": SIMPLE_API_VERSION, "_last-serial": PYPI_SERIAL_CONSTANT}, + "name": canonicalize_name(project_name), + "files": [ + { + # v1.0, PEP 691 + "filename": package["filename"], + "url": package["url"], + "hashes": {"sha256": package["sha256"]}, + "requires-python": package["requires_python"] or None, + # data-dist-info-metadata is a deprecated alias for core-metadata + "data-dist-info-metadata": ( + {"sha256": package["metadata_sha256"]} if package["metadata_sha256"] else False + ), + # yanked and yanked_reason are not implemented because they are mutable + # TODO in the future: + # size, upload-time (v1.1, PEP 700) + # core-metadata (PEP
714) + # provenance (v1.3, PEP 740) + } + for package in project_packages + ], + # TODO in the future: + # versions (v1.1, PEP 700) + # alternate-locations (v1.2, PEP 708) + # project-status (v1.4, PEP 792 - pypi and docs differ) + } + + class PackageIncludeFilter: """A special class to help filter Package's based on a remote's include/exclude""" diff --git a/pulp_python/tests/functional/api/test_full_mirror.py b/pulp_python/tests/functional/api/test_full_mirror.py index b2e9b404..f03ae475 100644 --- a/pulp_python/tests/functional/api/test_full_mirror.py +++ b/pulp_python/tests/functional/api/test_full_mirror.py @@ -58,6 +58,24 @@ def test_pull_through_simple(python_remote_factory, python_distribution_factory, assert PYTHON_XS_FIXTURE_CHECKSUMS[package.filename] == package.digests["sha256"] +@pytest.mark.parallel +@pytest.mark.parametrize("media_type", ["application/vnd.pypi.simple.v1+json", "text/html"]) +def test_pull_through_simple_media_types( + media_type, python_remote_factory, python_distribution_factory +): + """Tests pull-through with different media types (JSON and HTML).""" + remote = python_remote_factory(url=PYPI_URL, includes=["shelf-reader"]) + distro = python_distribution_factory(remote=remote.pulp_href) + + url = f"{distro.base_url}simple/shelf-reader/" + headers = {"Accept": media_type} + response = requests.get(url, headers=headers) + + assert response.status_code == 200 + assert media_type in response.headers["Content-Type"] + assert "X-PyPI-Last-Serial" in response.headers + + @pytest.mark.parallel def test_pull_through_filter(python_remote_factory, python_distribution_factory): """Tests that pull-through respects the includes/excludes filter on the remote.""" @@ -66,7 +84,7 @@ def test_pull_through_filter(python_remote_factory, python_distribution_factory) r = requests.get(f"{distro.base_url}simple/pulpcore/") assert r.status_code == 404 - assert r.json() == {"detail": "pulpcore does not exist."} + assert r.text == "404 Not Found" r =
requests.get(f"{distro.base_url}simple/shelf-reader/") assert r.status_code == 200 @@ -86,7 +104,7 @@ def test_pull_through_filter(python_remote_factory, python_distribution_factory) r = requests.get(f"{distro.base_url}simple/django/") assert r.status_code == 404 - assert r.json() == {"detail": "django does not exist."} + assert r.text == "404 Not Found" r = requests.get(f"{distro.base_url}simple/pulpcore/") assert r.status_code == 502 diff --git a/pulp_python/tests/functional/api/test_pypi_simple_json_api.py b/pulp_python/tests/functional/api/test_pypi_simple_json_api.py new file mode 100644 index 00000000..b8c7aa3a --- /dev/null +++ b/pulp_python/tests/functional/api/test_pypi_simple_json_api.py @@ -0,0 +1,126 @@ +from urllib.parse import urljoin + +import pytest +import requests + +from pulp_python.tests.functional.constants import ( + PYTHON_EGG_FILENAME, + PYTHON_EGG_URL, + PYTHON_SM_PROJECT_SPECIFIER, + PYTHON_WHEEL_FILENAME, + PYTHON_WHEEL_URL, +) + +API_VERSION = "1.0" +PYPI_SERIAL_CONSTANT = 1000000000 + +PYPI_TEXT_HTML = "text/html" +PYPI_SIMPLE_V1_HTML = "application/vnd.pypi.simple.v1+html" +PYPI_SIMPLE_V1_JSON = "application/vnd.pypi.simple.v1+json" + + +@pytest.mark.parallel +def test_simple_json_index_api( + python_remote_factory, python_repo_with_sync, python_distribution_factory +): + remote = python_remote_factory(includes=PYTHON_SM_PROJECT_SPECIFIER) + repo = python_repo_with_sync(remote) + distro = python_distribution_factory(repository=repo) + + url = urljoin(distro.base_url, "simple/") + headers = {"Accept": PYPI_SIMPLE_V1_JSON} + + response = requests.get(url, headers=headers) + assert response.headers["Content-Type"] == PYPI_SIMPLE_V1_JSON + assert response.headers["X-PyPI-Last-Serial"] == str(PYPI_SERIAL_CONSTANT) + + data = response.json() + assert data["meta"] == {"api-version": API_VERSION, "_last-serial": PYPI_SERIAL_CONSTANT} + assert data["projects"] + for project in data["projects"]: + for i in ["_last-serial", "name"]: + assert i in 
project + + +def test_simple_json_detail_api( + delete_orphans_pre, + monitor_task, + python_bindings, + python_content_factory, + python_distribution_factory, + python_repo_factory, +): + content_1 = python_content_factory(PYTHON_WHEEL_FILENAME, url=PYTHON_WHEEL_URL) + content_2 = python_content_factory(PYTHON_EGG_FILENAME, url=PYTHON_EGG_URL) + body = {"add_content_units": [content_1.pulp_href, content_2.pulp_href]} + + repo = python_repo_factory() + monitor_task(python_bindings.RepositoriesPythonApi.modify(repo.pulp_href, body).task) + distro = python_distribution_factory(repository=repo) + + url = f'{urljoin(distro.base_url, "simple/")}shelf-reader' + headers = {"Accept": PYPI_SIMPLE_V1_JSON} + + response = requests.get(url, headers=headers) + assert response.headers["Content-Type"] == PYPI_SIMPLE_V1_JSON + assert response.headers["X-PyPI-Last-Serial"] == str(PYPI_SERIAL_CONSTANT) + + data = response.json() + assert data["meta"] == {"api-version": API_VERSION, "_last-serial": PYPI_SERIAL_CONSTANT} + assert data["name"] == "shelf-reader" + assert data["files"] + + # Check data of a wheel + file_whl = next( + (i for i in data["files"] if i["filename"] == "shelf_reader-0.1-py2-none-any.whl"), None + ) + assert file_whl is not None, "wheel file not found" + assert file_whl["url"] + assert file_whl["hashes"] == { + "sha256": "2eceb1643c10c5e4a65970baf63bde43b79cbdac7de81dae853ce47ab05197e9" + } + assert file_whl["requires-python"] is None + assert file_whl["data-dist-info-metadata"] == { + "sha256": "ed333f0db05d77e933a157b7225b403ada9a2f93318d77b41b662eba78bac350" + } + + # Check data of a tarball + file_tar = next((i for i in data["files"] if i["filename"] == "shelf-reader-0.1.tar.gz"), None) + assert file_tar is not None, "tar file not found" + assert file_tar["url"] + assert file_tar["hashes"] == { + "sha256": "04cfd8bb4f843e35d51bfdef2035109bdea831b55a57c3e6a154d14be116398c" + } + assert file_tar["requires-python"] is None + assert 
file_tar["data-dist-info-metadata"] is False + + +@pytest.mark.parallel +@pytest.mark.parametrize( + "header, result", + [ + (PYPI_TEXT_HTML, PYPI_TEXT_HTML), + (PYPI_SIMPLE_V1_HTML, PYPI_SIMPLE_V1_HTML), + (PYPI_SIMPLE_V1_JSON, PYPI_SIMPLE_V1_JSON), + # Follows defined ordering (html, pypi html, pypi json) + (f"{PYPI_SIMPLE_V1_JSON}, {PYPI_SIMPLE_V1_HTML}", PYPI_SIMPLE_V1_HTML), + # Everything else should be html + ("", PYPI_TEXT_HTML), + ("application/json", PYPI_TEXT_HTML), + ("sth/else", PYPI_TEXT_HTML), + ], +) +def test_simple_api_content_headers( + python_remote_factory, python_repo_with_sync, python_distribution_factory, header, result +): + remote = python_remote_factory(includes=PYTHON_SM_PROJECT_SPECIFIER) + repo = python_repo_with_sync(remote) + distro = python_distribution_factory(repository=repo) + + index_url = urljoin(distro.base_url, "simple/") + detail_url = f"{index_url}aiohttp" + + for url in [index_url, detail_url]: + response = requests.get(url, headers={"Accept": header}) + assert response.status_code == 200 + assert result in response.headers["Content-Type"]