11import logging
2+ from collections import defaultdict
23from gettext import gettext as _
4+ from itertools import groupby
35from uuid import UUID
46
7+ from django .db .models import Prefetch
58from django .db .models .query import QuerySet
69from pulp_python .app .models import PythonPackageContent , PythonRepository
7- from pulp_python .app .utils import artifact_to_python_content_data
8- from pulpcore .plugin .models import ProgressReport
9- from pulpcore .plugin .util import get_domain
10+ from pulp_python .app .utils import (
11+ artifact_to_python_content_data ,
12+ fetch_json_release_metadata ,
13+ parse_metadata ,
14+ )
15+ from pulpcore .plugin .models import ContentArtifact , ProgressReport
16+ from pulpcore .plugin .util import get_domain , get_prn
1017
1118log = logging .getLogger (__name__ )
1219
@@ -34,11 +41,16 @@ def repair(repository_pk: UUID) -> None:
3441 content_set = repository .latest_version ().content .values_list ("pk" , flat = True )
3542 content = PythonPackageContent .objects .filter (pk__in = content_set )
3643
37- num_repaired = repair_metadata (content )
38- log .info (_ ("{} packages' metadata repaired." ).format (num_repaired ))
44+ num_repaired , pkgs_not_repaired = repair_metadata (content )
45+ log .info (
46+ _ (
47+ "{} packages' metadata repaired. Not repaired packages due to either "
48+ "inaccessible URL or mismatched sha256: {}."
49+ ).format (num_repaired , pkgs_not_repaired )
50+ )
3951
4052
41- def repair_metadata (content : QuerySet [PythonPackageContent ]) -> int :
53+ def repair_metadata (content : QuerySet [PythonPackageContent ]) -> tuple [ int , set [ str ]] :
4254 """
4355 Repairs metadata for a queryset of PythonPackageContent objects
4456 and updates the progress report.
@@ -47,24 +59,38 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
4759 content (QuerySet[PythonPackageContent]): The queryset of items to repair.
4860
4961 Returns:
50- int: The number of packages that were repaired.
62+ tuple[int, set[str]]: A tuple containing:
63+ - The number of packages that were repaired.
64+ - A set of packages' PRNs that were not repaired.
5165 """
52- # TODO: Add on_demand content repair
5366 immediate_content = (
5467 content .filter (contentartifact__artifact__isnull = False )
5568 .distinct ()
5669 .prefetch_related ("_artifacts" )
5770 )
71+ on_demand_content = (
72+ content .filter (contentartifact__artifact__isnull = True )
73+ .distinct ()
74+ .prefetch_related (
75+ Prefetch (
76+ "contentartifact_set" ,
77+ queryset = ContentArtifact .objects .prefetch_related ("remoteartifact_set" ),
78+ )
79+ )
80+ .order_by ("name" , "version" )
81+ )
5882 domain = get_domain ()
5983
6084 batch = []
6185 set_of_update_fields = set ()
6286 total_repaired = 0
87+ # Keep track of on-demand packages that were not repaired
88+ pkgs_not_repaired = set ()
6389
6490 progress_report = ProgressReport (
6591 message = "Repairing packages' metadata" ,
6692 code = "repair.metadata" ,
67- total = immediate_content .count (),
93+ total = content .count (),
6894 )
6995 progress_report .save ()
7096 with progress_report :
@@ -78,11 +104,60 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
78104 package , new_data , batch , set_of_update_fields
79105 )
80106
107+ # For on-demand content, we expect that:
108+ # 1. PythonPackageContent always has correct name and version
109+ # 2. RemoteArtifact always has correct sha256
110+ for (name , version ), group in groupby (
111+ on_demand_content .iterator (chunk_size = BULK_SIZE ),
112+ key = lambda x : (x .name , x .version ),
113+ ):
114+ group_set = set (group )
115+ grouped_by_url = defaultdict (list )
116+
117+ for package in group_set :
118+ for ra in package .contentartifact_set .get ().remoteartifact_set .all ():
119+ grouped_by_url [ra .remote .url ].append ((package , ra ))
120+
121+ # Prioritize the URL that can serve the most packages
122+ for url , pkg_ra_pairs in sorted (
123+ grouped_by_url .items (), key = lambda x : len (x [1 ]), reverse = True
124+ ):
125+ if not group_set :
126+ break # No packages left to repair, move onto the next group
127+ remotes = set ([pkg_ra [1 ].remote for pkg_ra in pkg_ra_pairs ])
128+ try :
129+ json_data = fetch_json_release_metadata (name , version , remotes )
130+ except Exception :
131+ continue
132+
133+ for package , ra in pkg_ra_pairs :
134+ if package not in group_set :
135+ continue # Package was already repaired
136+ # Extract data only for the specific distribution being checked
137+ dist_data = None
138+ for dist in json_data ["urls" ]:
139+ if ra .sha256 == dist ["digests" ]["sha256" ]:
140+ dist_data = dist
141+ break
142+ if not dist_data :
143+ continue
144+
145+ new_data = parse_metadata (json_data ["info" ], version , dist_data )
146+ new_data .pop ("url" ) # url belongs to RemoteArtifact
147+ total_repaired += update_package_if_needed (
148+ package , new_data , batch , set_of_update_fields
149+ )
150+ group_set .remove (package )
151+ progress_report .increment ()
152+ # Store and track the unrepaired packages after all URLs are processed
153+ pkgs_not_repaired .update ([get_prn (p ) for p in group_set ])
154+ progress_report .increase_by (len (group_set ))
155+
81156 if batch :
82157 total_repaired += len (batch )
83158 PythonPackageContent .objects .bulk_update (batch , set_of_update_fields )
84159
85- return total_repaired
160+ return total_repaired , pkgs_not_repaired
86161
87162
88163def update_package_if_needed (
0 commit comments