@@ -1,13 +1,20 @@
 import logging
 import uuid
+from collections import defaultdict
 from gettext import gettext as _
+from itertools import groupby
 
 from django.db.models.query import QuerySet
 from pulpcore.plugin.models import ProgressReport
 from pulpcore.plugin.util import get_domain
+from requests.exceptions import RequestException
 
 from pulp_python.app.models import PythonPackageContent, PythonRepository
-from pulp_python.app.utils import artifact_to_python_content_data
+from pulp_python.app.utils import (
+    artifact_to_python_content_data,
+    fetch_json_release_metadata,
+    parse_metadata,
+)
 
 log = logging.getLogger(__name__)
 
@@ -32,11 +39,16 @@ def repair(repository_pk: uuid.UUID) -> None:
     content_set = repository.latest_version().content.values_list("pk", flat=True)
     content = PythonPackageContent.objects.filter(pk__in=content_set)
 
-    num_repaired = repair_metadata(content)
-    log.info(_("{} packages' metadata repaired.").format(num_repaired))
+    num_repaired, pkgs_not_repaired = repair_metadata(content)
+    log.info(
+        _(
+            "{} packages' metadata repaired. Not repaired packages due to either "
+            "inaccessible URL or mismatched sha256: {}."
+        ).format(num_repaired, pkgs_not_repaired)
+    )
 
 
-def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
+def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[str]]:
     """
     Repairs metadata for a queryset of PythonPackageContent objects
     and updates the progress report.
@@ -45,25 +57,39 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> int:
         content (QuerySet[PythonPackageContent]): The queryset of items to repair.
 
     Returns:
-        int: The number of packages that were repaired.
+        tuple[int, set[str]]: A tuple containing:
+            - The number of packages that were repaired.
+            - A set of primary keys of packages that were not repaired.
     """
-    # TODO: Add on_demand content repair
-    immediate_content = content.filter(contentartifact__artifact__isnull=False)
+    immediate_content = (
+        content.filter(contentartifact__artifact__isnull=False)
+        .distinct()
+        .prefetch_related("_artifacts")
+    )
+    on_demand_content = (
+        content.filter(contentartifact__artifact__isnull=True)
+        .distinct()
+        .prefetch_related("contentartifact_set__remoteartifact_set")
+        .order_by("name", "version")
+    )
     domain = get_domain()
 
     batch = []
     set_of_update_fields = set()
     total_repaired = 0
+    # Keep track of on-demand packages that need to be repaired
+    # A package is removed from this variable once it is successfully repaired
+    pkgs_not_repaired = set(on_demand_content.values_list("pk", flat=True))
 
     progress_report = ProgressReport(
         message="Repairing packages' metadata",
         code="repair.metadata",
-        total=immediate_content.count(),
+        total=content.count(),
     )
     progress_report.save()
     with progress_report:
         for package in progress_report.iter(
-            immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000)
+            immediate_content.iterator(chunk_size=1000)
         ):
             new_data = artifact_to_python_content_data(
                 package.filename, package._artifacts.get(), domain
@@ -82,8 +108,65 @@
                 batch = []
                 set_of_update_fields.clear()
 
+        # For on-demand content, we expect that:
+        # 1. PythonPackageContent always has correct name and version, and one ContentArtifact
+        # 2. RemoteArtifact always has correct sha256
+        for (name, version), group in groupby(
+            on_demand_content.iterator(chunk_size=1000),
+            key=lambda x: (x.name, x.version),
+        ):
+            group_list = list(group)
+            grouped_by_url = defaultdict(list)
+
+            for package in group_list:
+                for ra in package.contentartifact_set.get().remoteartifact_set.all():
+                    grouped_by_url[ra.remote.url].append((package, ra))
+
+            # Prioritize the URL that can serve the most packages
+            for url, pkg_ra in sorted(
+                grouped_by_url.items(), key=lambda x: len(x[1]), reverse=True
+            ):
+                # All packages have the same URL, so pick a random remote
+                remote = pkg_ra[0][1].remote
+                try:
+                    json_data = fetch_json_release_metadata(name, version, remote)
+                except RequestException:
+                    continue
+
+                for package, ra in pkg_ra:
+                    if package.pk not in pkgs_not_repaired:
+                        continue
+                    # Extract data only for the specific distribution being checked
+                    dist_data = None
+                    for dist in json_data["urls"]:
+                        if ra.sha256 == dist["digests"]["sha256"]:
+                            dist_data = dist
+                            break
+                    if not dist_data:
+                        continue
+
+                    new_data = parse_metadata(json_data["info"], version, dist_data)
+                    # url belongs to RemoteArtifact
+                    new_data.pop("url")
+                    changed = False
+                    for field, value in new_data.items():
+                        if getattr(package, field) != value:
+                            setattr(package, field, value)
+                            set_of_update_fields.add(field)
+                            changed = True
+                    if changed:
+                        batch.append(package)
+                        if len(batch) == 1000:
+                            total_repaired += len(batch)
+                            PythonPackageContent.objects.bulk_update(
+                                batch, set_of_update_fields
+                            )
+                            batch = []
+                            set_of_update_fields.clear()
+                    pkgs_not_repaired.remove(package.pk)
+
     if batch:
         total_repaired += len(batch)
         PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
 
-    return total_repaired
+    return total_repaired, pkgs_not_repaired