diff --git a/CHANGES/849.feature b/CHANGES/849.feature
new file mode 100644
index 00000000..f426086e
--- /dev/null
+++ b/CHANGES/849.feature
@@ -0,0 +1 @@
+Added support for on-demand content to the `repair_metadata` endpoint.
diff --git a/pulp_python/app/tasks/repair.py b/pulp_python/app/tasks/repair.py
index c1fa6a71..7ba26075 100644
--- a/pulp_python/app/tasks/repair.py
+++ b/pulp_python/app/tasks/repair.py
@@ -1,23 +1,32 @@
 import logging
-import uuid
+from collections import defaultdict
 from gettext import gettext as _
+from itertools import groupby
+from uuid import UUID
 
+from django.db.models import Prefetch
 from django.db.models.query import QuerySet
 
-from pulpcore.plugin.models import ProgressReport
-from pulpcore.plugin.util import get_domain
-
 from pulp_python.app.models import PythonPackageContent, PythonRepository
-from pulp_python.app.utils import artifact_to_python_content_data
+from pulp_python.app.utils import (
+    artifact_to_python_content_data,
+    fetch_json_release_metadata,
+    parse_metadata,
+)
+from pulpcore.plugin.models import ContentArtifact, ProgressReport
+from pulpcore.plugin.util import get_domain
 
 log = logging.getLogger(__name__)
 
 
-def repair(repository_pk: uuid.UUID) -> None:
+BULK_SIZE = 1000
+
+
+def repair(repository_pk: UUID) -> None:
     """
     Repairs metadata of all packages for the specified repository.
 
     Args:
-        repository_pk (uuid.UUID): The primary key of the repository to repair.
+        repository_pk (UUID): The primary key of the repository to repair.
 
     Returns:
         None
@@ -32,11 +41,16 @@ def repair(repository_pk: uuid.UUID) -> None:
     content_set = repository.latest_version().content.values_list("pk", flat=True)
     content = PythonPackageContent.objects.filter(pk__in=content_set)
 
-    num_repaired = repair_metadata(content)
-    log.info(_("{} packages' metadata repaired.").format(num_repaired))
+    num_repaired, pkgs_not_repaired = repair_metadata(content)
+    log.info(
+        _(
+            "{} packages' metadata repaired. Packages not repaired due to an "
+            "inaccessible URL or a mismatched sha256: {}."
+        ).format(num_repaired, pkgs_not_repaired)
+    )
 
 
-def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
+def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[UUID]]:
     """
     Repairs metadata for a queryset of PythonPackageContent objects and
     updates the progress report.
 
@@ -45,45 +59,140 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
         content (QuerySet[PythonPackageContent]): The queryset of items to repair.
 
     Returns:
-        int: The number of packages that were repaired.
+        tuple[int, set[UUID]]: A tuple containing:
+            - The number of packages that were repaired.
+            - A set of PKs of packages that were not repaired.
""" - # TODO: Add on_demand content repair - immediate_content = content.filter(contentartifact__artifact__isnull=False) + immediate_content = ( + content.filter(contentartifact__artifact__isnull=False) + .distinct() + .prefetch_related("_artifacts") + ) + on_demand_content = ( + content.filter(contentartifact__artifact__isnull=True) + .distinct() + .prefetch_related( + Prefetch( + "contentartifact_set", + queryset=ContentArtifact.objects.prefetch_related("remoteartifact_set"), + ) + ) + .order_by("name", "version") + ) domain = get_domain() batch = [] set_of_update_fields = set() total_repaired = 0 + # Keep track of on-demand packages that were not repaired + pkgs_not_repaired = set() progress_report = ProgressReport( message="Repairing packages' metadata", code="repair.metadata", - total=immediate_content.count(), + total=content.count(), ) progress_report.save() with progress_report: for package in progress_report.iter( - immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000) + immediate_content.iterator(chunk_size=BULK_SIZE) ): new_data = artifact_to_python_content_data( package.filename, package._artifacts.get(), domain ) - changed = False - for field, value in new_data.items(): - if getattr(package, field) != value: - setattr(package, field, value) - set_of_update_fields.add(field) - changed = True - if changed: - batch.append(package) - if len(batch) == 1000: - total_repaired += len(batch) - PythonPackageContent.objects.bulk_update(batch, set_of_update_fields) - batch = [] - set_of_update_fields.clear() + total_repaired += update_package_if_needed( + package, new_data, batch, set_of_update_fields + ) + + # For on-demand content, we expect that: + # 1. PythonPackageContent always has correct name and version + # 2. RemoteArtifact always has correct sha256 + for (name, version), group in groupby( + on_demand_content.iterator(chunk_size=BULK_SIZE), + key=lambda x: (x.name, x.version), + ): + group_set = set(group) + grouped_by_url = defaultdict(list) + + for package in group_set: + for ra in package.contentartifact_set.get().remoteartifact_set.all(): + grouped_by_url[ra.remote.url].append((package, ra)) + + # Prioritize the URL that can serve the most packages + for url, pkg_ra_pairs in sorted( + grouped_by_url.items(), key=lambda x: len(x[1]), reverse=True + ): + if not group_set: + break # No packages left to repair, move onto the next group + remotes = set([pkg_ra[1].remote for pkg_ra in pkg_ra_pairs]) + try: + json_data = fetch_json_release_metadata(name, version, remotes) + except Exception: + continue + + for package, ra in pkg_ra_pairs: + if package not in group_set: + continue # Package was already repaired + # Extract data only for the specific distribution being checked + dist_data = None + for dist in json_data["urls"]: + if ra.sha256 == dist["digests"]["sha256"]: + dist_data = dist + break + if not dist_data: + continue + + new_data = parse_metadata(json_data["info"], version, dist_data) + new_data.pop("url") # url belongs to RemoteArtifact + total_repaired += update_package_if_needed( + package, new_data, batch, set_of_update_fields + ) + group_set.remove(package) + progress_report.increment() + # Store and track the unrepaired packages after all URLs are processed + pkgs_not_repaired.update([p.pk for p in group_set]) + progress_report.increase_by(len(group_set)) if batch: total_repaired += len(batch) PythonPackageContent.objects.bulk_update(batch, set_of_update_fields) + return total_repaired, pkgs_not_repaired + + +def update_package_if_needed( + 
+    package: PythonPackageContent,
+    new_data: dict,
+    batch: list[PythonPackageContent],
+    set_of_update_fields: set[str],
+) -> int:
+    """
+    Compares the current package data with new data and updates the package
+    if needed ("batch" and "set_of_update_fields" are updated in-place).
+
+    Args:
+        package: Package to check and update.
+        new_data: A dict of new field values to compare against the package.
+        batch: A list of packages that were updated.
+        set_of_update_fields: A set of package field names that were updated.
+
+    Returns:
+        The number of packages flushed to the database by this call: BULK_SIZE when
+        the batch fills up and is bulk-updated, 0 otherwise.
+    """
+    total_repaired = 0
+    changed = False
+    for field, value in new_data.items():
+        if getattr(package, field) != value:
+            setattr(package, field, value)
+            set_of_update_fields.add(field)
+            changed = True
+    if changed:
+        batch.append(package)
+
+    if len(batch) == BULK_SIZE:
+        PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
+        total_repaired += BULK_SIZE
+        batch.clear()
+        set_of_update_fields.clear()
+    return total_repaired
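Reviewer note: the on-demand branch of repair_metadata() above groups (package, RemoteArtifact) pairs by remote URL and then tries the URL covering the most packages first. A minimal standalone sketch of that prioritization, using hypothetical stand-in tuples instead of Django models (not part of this patch):

from collections import defaultdict

def order_urls_by_coverage(grouped_by_url):
    # Mirror the sorted(..., key=lambda x: len(x[1]), reverse=True) loop above:
    # the URL that can serve the most packages is tried first.
    return sorted(grouped_by_url.items(), key=lambda kv: len(kv[1]), reverse=True)

# Hypothetical example: two packages resolvable via pypi.org, one via a mirror.
grouped_by_url = defaultdict(list)
grouped_by_url["https://pypi.org/"] += [("scipy-1.1.0.tar.gz", "ra1"), ("scipy-1.1.0-win32.whl", "ra2")]
grouped_by_url["https://mirror.example/"] += [("celery-2.4.1.tar.gz", "ra3")]

for url, pkg_ra_pairs in order_urls_by_coverage(grouped_by_url):
    print(url, len(pkg_ra_pairs))  # pypi.org (2) is tried before mirror.example (1)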
+ """ + remote = next(iter(remotes)) + url = remote.get_remote_artifact_url(f"pypi/{name}/{version}/json") + + result = None + for remote in remotes: + downloader = remote.get_downloader(url=url, max_retries=1) + try: + result = downloader.fetch() + break + except Exception: + continue + + if result: + with open(result.path, "r") as file: + json_data = json.load(file) + return json_data + else: + raise Exception(f"Failed to fetch {url} from any remote.") + + def python_content_to_json(base_path, content_query, version=None, domain=None): """ Converts a QuerySet of PythonPackageContent into the PyPi JSON format diff --git a/pulp_python/tests/functional/api/test_repair.py b/pulp_python/tests/functional/api/test_repair.py index 4b2bce55..5a6930f5 100644 --- a/pulp_python/tests/functional/api/test_repair.py +++ b/pulp_python/tests/functional/api/test_repair.py @@ -10,16 +10,16 @@ @pytest.fixture def create_content_direct(python_bindings): - def _create(artifact_filename, filename, content_data): + def _create(artifact_filename, content_data): commands = ( "from pulpcore.plugin.models import Artifact, ContentArtifact; " "from pulpcore.plugin.util import get_url; " "from pulp_python.app.models import PythonPackageContent; " f"a = Artifact.init_and_validate('{artifact_filename}'); " "a.save(); " - f"c = PythonPackageContent(sha256=a.sha256, filename={filename!r}, **{content_data!r}); " # noqa: E501 + f"c = PythonPackageContent(sha256=a.sha256, **{content_data!r}); " "c.save(); " - f"ca = ContentArtifact(artifact=a, content=c, relative_path={filename!r}); " + f"ca = ContentArtifact(artifact=a, content=c, relative_path=c.filename); " "ca.save(); " "print(get_url(c))" ) @@ -32,6 +32,39 @@ def _create(artifact_filename, filename, content_data): return _create +@pytest.fixture +def create_content_remote(python_bindings): + def _create(content, remote, remote_2=None): + commands = ( + "from pulpcore.plugin.models import ContentArtifact, RemoteArtifact; " + "from pulpcore.plugin.util import extract_pk, get_url; " + "from pulp_python.app.models import PythonPackageContent, PythonRemote; " + f"c = PythonPackageContent(**{content!r}); " + "c.save(); " + f"ca = ContentArtifact(content=c, relative_path=c.filename); " + "ca.save(); " + f"r = PythonRemote.objects.get(pk=extract_pk({remote.pulp_href!r})); " + f"ra = RemoteArtifact(content_artifact=ca, remote=r, sha256=c.sha256); " + "ra.save(); " + ) + if remote_2: + commands += ( + f"r2 = PythonRemote.objects.get(pk=extract_pk({remote_2.pulp_href!r})); " + f"ra2 = RemoteArtifact(content_artifact=ca, remote=r2, sha256=c.sha256); " + "ra2.save(); " + ) + commands += "print(get_url(c))" + process = subprocess.run( + ["pulpcore-manager", "shell", "-c", commands], capture_output=True + ) + + assert process.returncode == 0 + content_href = process.stdout.decode().strip() + return python_bindings.ContentPackagesApi.read(content_href) + + return _create + + @pytest.fixture def move_to_repository(python_bindings, monitor_task): def _move(repo_href, content_hrefs): @@ -54,13 +87,14 @@ def test_metadata_repair_command( """Test pulpcore-manager repair-python-metadata command.""" data = { "name": "shelf-reader", + "filename": PYTHON_EGG_FILENAME, # Wrong metadata "version": "0.2", "packagetype": "bdist", "requires_python": ">=3.8", "author": "ME", } - content = create_content_direct(python_file, PYTHON_EGG_FILENAME, data) + content = create_content_direct(python_file, data) for field, wrong_value in data.items(): if field == "python_version": continue @@ -84,43 +118,111 @@ 
diff --git a/pulp_python/tests/functional/api/test_repair.py b/pulp_python/tests/functional/api/test_repair.py
index 4b2bce55..5a6930f5 100644
--- a/pulp_python/tests/functional/api/test_repair.py
+++ b/pulp_python/tests/functional/api/test_repair.py
@@ -10,16 +10,16 @@
 
 @pytest.fixture
 def create_content_direct(python_bindings):
-    def _create(artifact_filename, filename, content_data):
+    def _create(artifact_filename, content_data):
         commands = (
             "from pulpcore.plugin.models import Artifact, ContentArtifact; "
             "from pulpcore.plugin.util import get_url; "
             "from pulp_python.app.models import PythonPackageContent; "
             f"a = Artifact.init_and_validate('{artifact_filename}'); "
             "a.save(); "
-            f"c = PythonPackageContent(sha256=a.sha256, filename={filename!r}, **{content_data!r}); "  # noqa: E501
+            f"c = PythonPackageContent(sha256=a.sha256, **{content_data!r}); "
             "c.save(); "
-            f"ca = ContentArtifact(artifact=a, content=c, relative_path={filename!r}); "
+            "ca = ContentArtifact(artifact=a, content=c, relative_path=c.filename); "
             "ca.save(); "
             "print(get_url(c))"
         )
@@ -32,6 +32,39 @@ def _create(artifact_filename, filename, content_data):
     return _create
 
 
+@pytest.fixture
+def create_content_remote(python_bindings):
+    def _create(content, remote, remote_2=None):
+        commands = (
+            "from pulpcore.plugin.models import ContentArtifact, RemoteArtifact; "
+            "from pulpcore.plugin.util import extract_pk, get_url; "
+            "from pulp_python.app.models import PythonPackageContent, PythonRemote; "
+            f"c = PythonPackageContent(**{content!r}); "
+            "c.save(); "
+            "ca = ContentArtifact(content=c, relative_path=c.filename); "
+            "ca.save(); "
+            f"r = PythonRemote.objects.get(pk=extract_pk({remote.pulp_href!r})); "
+            "ra = RemoteArtifact(content_artifact=ca, remote=r, sha256=c.sha256); "
+            "ra.save(); "
+        )
+        if remote_2:
+            commands += (
+                f"r2 = PythonRemote.objects.get(pk=extract_pk({remote_2.pulp_href!r})); "
+                "ra2 = RemoteArtifact(content_artifact=ca, remote=r2, sha256=c.sha256); "
+                "ra2.save(); "
+            )
+        commands += "print(get_url(c))"
+        process = subprocess.run(
+            ["pulpcore-manager", "shell", "-c", commands], capture_output=True
+        )
+
+        assert process.returncode == 0
+        content_href = process.stdout.decode().strip()
+        return python_bindings.ContentPackagesApi.read(content_href)
+
+    return _create
+
+
 @pytest.fixture
 def move_to_repository(python_bindings, monitor_task):
     def _move(repo_href, content_hrefs):
@@ -54,13 +87,14 @@ def test_metadata_repair_command(
     """Test pulpcore-manager repair-python-metadata command."""
     data = {
         "name": "shelf-reader",
+        "filename": PYTHON_EGG_FILENAME,
         # Wrong metadata
         "version": "0.2",
         "packagetype": "bdist",
         "requires_python": ">=3.8",
         "author": "ME",
     }
-    content = create_content_direct(python_file, PYTHON_EGG_FILENAME, data)
+    content = create_content_direct(python_file, data)
     for field, wrong_value in data.items():
         if field == "python_version":
             continue
@@ -84,43 +118,111 @@ def test_metadata_repair_command(
 def test_metadata_repair_endpoint(
     create_content_direct,
+    create_content_remote,
+    delete_orphans_pre,
     download_python_file,
     monitor_task,
     move_to_repository,
     python_bindings,
-    python_repo,
+    python_remote_factory,
+    python_repo_factory,
 ):
     """
     Test repairing of package metadata via `Repositories.repair_metadata` endpoint.
     """
-    python_egg_filename = "scipy-1.1.0.tar.gz"
-    python_egg_url = urljoin(
-        urljoin(PYTHON_FIXTURES_URL, "packages/"), python_egg_filename
+    # 1. Set up the test data
+    # Shared data
+    python_remote = python_remote_factory()
+    python_remote_bad = python_remote_factory(url="https://fixtures.pulpproject.org/")
+    python_repo = python_repo_factory(remote=python_remote)
+
+    # Immediate content
+    scipy_egg_filename = "scipy-1.1.0-cp27-none-win32.whl"
+    scipy_egg_url = urljoin(
+        urljoin(PYTHON_FIXTURES_URL, "packages/"), scipy_egg_filename
     )
-    python_file = download_python_file(python_egg_filename, python_egg_url)
+    scipy_file = download_python_file(scipy_egg_filename, scipy_egg_url)
+    scipy_data_0 = {
+        "filename": scipy_egg_filename,
+        "name": "scipy",
+        "version": "1.1.0",
+        # Wrong metadata
+        "author": "ME",
+        "packagetype": "bdist",
+        "requires_python": ">=3.8",
+    }
 
-    data = {
+    # On-demand content
+    celery_data = {
+        "filename": "celery-2.4.1.tar.gz",
+        "name": "celery",
+        "version": "2.4.1",
+        "sha256": "c77652ca179d14473975822dbfb1b5dab950c88c171ef6bc2257ddb9066e6790",
+        # Wrong metadata
+        "author": "ME",
+        "packagetype": "bdist",
+        "requires_python": ">=3.8",
+    }
+
+    scipy_data_1 = {
+        "filename": "scipy-1.1.0.tar.gz",
         "name": "scipy",
+        "version": "1.1.0",
+        "sha256": "878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1",
         # Wrong metadata
         "author": "ME",
         "packagetype": "bdist",
         "requires_python": ">=3.8",
-        "version": "0.2",
     }
-    content = create_content_direct(python_file, python_egg_filename, data)
-    for field, wrong_value in data.items():
-        if field == "python_version":
-            continue
-        assert getattr(content, field) == wrong_value
-    move_to_repository(python_repo.pulp_href, [content.pulp_href])
 
+    scipy_data_2 = scipy_data_1.copy()
+    scipy_data_2["filename"] = "scipy-1.1.0-cp36-none-win32.whl"
+    scipy_data_2["sha256"] = (
+        "0e9bb7efe5f051ea7212555b290e784b82f21ffd0f655405ac4f87e288b730b3"
+    )
+
+    # 2. Create content
+    celery_content = create_content_remote(celery_data, python_remote)
+    scipy_content_0 = create_content_direct(scipy_file, scipy_data_0)
+    scipy_content_1 = create_content_remote(
+        scipy_data_1, python_remote, python_remote_bad
+    )
+    scipy_content_2 = create_content_remote(scipy_data_2, python_remote_bad)
+
+    content_hrefs = {}
+    for data, content in [
+        (celery_data, celery_content),
+        (scipy_data_0, scipy_content_0),
+        (scipy_data_1, scipy_content_1),
+        (scipy_data_2, scipy_content_2),
+    ]:
+        for field, test_value in data.items():
+            assert getattr(content, field) == test_value
+        content_hrefs[data["filename"]] = content.pulp_href
+    move_to_repository(python_repo.pulp_href, list(content_hrefs.values()))
+
+    # 3. Repair metadata
     response = python_bindings.RepositoriesPythonApi.repair_metadata(
         python_repo.pulp_href
     )
     monitor_task(response.task)
 
-    content = python_bindings.ContentPackagesApi.read(content.pulp_href)
-    assert content.version == "1.1.0"
-    assert content.packagetype == "sdist"
-    assert content.requires_python == ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"
-    assert content.author == ""
+    # 4. Check new metadata
+    new_metadata = [
+        # repaired
+        ("celery-2.4.1.tar.gz", "Ask Solem", "sdist", ""),
+        (
+            "scipy-1.1.0-cp27-none-win32.whl",
+            "",
+            "bdist_wheel",
+            ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*",
+        ),
+        ("scipy-1.1.0.tar.gz", "", "sdist", ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*"),
+        # not repaired
+        ("scipy-1.1.0-cp36-none-win32.whl", "ME", "bdist", ">=3.8"),
+    ]
+    for filename, author, packagetype, requires_python in new_metadata:
+        new_content = python_bindings.ContentPackagesApi.read(content_hrefs[filename])
+        assert new_content.author == author
+        assert new_content.packagetype == packagetype
+        assert new_content.requires_python == requires_python
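Reviewer note: outside the test suite, the new behavior can be exercised through the client bindings in the same way the endpoint test does. A rough sketch, assuming `python_bindings` and `monitor_task` are configured as in the fixtures above and `repo_href`/`content_href` point at existing objects (a hypothetical helper, not part of this patch):

def repair_and_inspect(python_bindings, monitor_task, repo_href, content_href):
    # Trigger the repair task; with this patch it also covers on-demand content.
    response = python_bindings.RepositoriesPythonApi.repair_metadata(repo_href)
    monitor_task(response.task)
    # Re-read a package to observe the repaired fields.
    package = python_bindings.ContentPackagesApi.read(content_href)
    return package.author, package.packagetype, package.requires_python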