11from __future__ import annotations
22
33import argparse
4+ import email
45import json
56import os
7+ import re
8+ import shutil
69import subprocess
710import sys
11+ import time
12+ import tomllib
13+ from functools import cache
14+ from hashlib import sha256
815from pathlib import Path
916from tempfile import TemporaryDirectory
1017from typing import TypedDict
1118from zipfile import ZipFile
1219
20+ import pathspec
21+ import urllib3
1322from dotenv import dotenv_values
14- from utils import extract_metadata , normalize_project_name
23+ from utils import iter_wheels
24+ from wheel .cli .pack import pack
25+ from wheel .cli .unpack import unpack
1526
1627INDEX_BASE_URL = 'https://agent-int-packages.datadoghq.com'
1728CUSTOM_EXTERNAL_INDEX = f'{ INDEX_BASE_URL } /external'
1829CUSTOM_BUILT_INDEX = f'{ INDEX_BASE_URL } /built'
30+ UNNORMALIZED_PROJECT_NAME_CHARS = re .compile (r'[-_.]+' )
31+
1932
class WheelSizes(TypedDict):
    """Size figures recorded for a single wheel file."""

    # Size of the wheel archive itself.
    compressed: int
    # Size of the wheel's contents once extracted.
    uncompressed: int
2336
37+
2438if sys .platform == 'win32' :
2539 PY3_PATH = Path ('C:\\ py3\\ Scripts\\ python.exe' )
2640 PY2_PATH = Path ('C:\\ py2\\ Scripts\\ python.exe' )
@@ -62,6 +76,161 @@ def check_process(*args, **kwargs) -> subprocess.CompletedProcess:
6276 return process
6377
6478
def extract_metadata(wheel: Path) -> email.message.Message:
    """Parse and return the ``METADATA`` file stored inside a wheel.

    The wheel is opened as a zip archive and the first member whose top-level
    path component ends with ``.dist-info`` identifies the metadata directory.

    Args:
        wheel: Path to the ``.whl`` archive.

    Returns:
        The parsed metadata; fields are read by header name, e.g.
        ``metadata['Name']`` or ``metadata['Version']``.

    Raises:
        RuntimeError: If the archive has no ``.dist-info`` directory, or that
            directory has no ``METADATA`` file.
    """
    with ZipFile(str(wheel)) as zip_archive:
        for path in zip_archive.namelist():
            # Only the first path component matters: `<name>-<ver>.dist-info/...`
            root = path.split('/', 1)[0]
            if root.endswith('.dist-info'):
                dist_info_dir = root
                break
        else:
            # The loop finished without `break`, so no member matched.
            message = f'Could not find the `.dist-info` directory in wheel: {wheel.name}'
            raise RuntimeError(message)

        try:
            with zip_archive.open(f'{dist_info_dir}/METADATA') as zip_file:
                metadata_file_contents = zip_file.read().decode('utf-8')
        except KeyError:
            # ZipFile.open raises KeyError for a missing member; report it as
            # a packaging problem instead of leaking the zip-level detail.
            message = f'Could not find a `METADATA` file in the `{dist_info_dir}` directory'
            raise RuntimeError(message) from None

    return email.message_from_string(metadata_file_contents)
98+
99+
def normalize_project_name(name: str) -> str:
    """Return the normalized form of a project name.

    Every run matched by ``UNNORMALIZED_PROJECT_NAME_CHARS`` is replaced by a
    single ``-`` and the result is lowercased.
    """
    # https://peps.python.org/pep-0503/#normalized-names
    collapsed = UNNORMALIZED_PROJECT_NAME_CHARS.sub('-', name)
    return collapsed.lower()
103+
104+
@cache
def get_wheel_hashes(project: str) -> dict[str, str]:
    """Fetch the SHA-256 hashes of the wheels published on PyPI for a project.

    Results are memoized per project name, so each project is requested at
    most once per process.

    Args:
        project: Project name as used in the PyPI simple index URL.

    Returns:
        Mapping of wheel filename to its hex SHA-256 digest. Files that are
        not wheels, or have no ``sha256`` entry, are omitted. The mapping is
        empty when PyPI answers 404, i.e. the project is not published there.

    Raises:
        RuntimeError: If the request still fails after the last attempt.
    """
    # Bounded so a persistent outage fails the build instead of hanging it.
    max_attempts = 6
    retry_wait = 2

    for attempt in range(1, max_attempts + 1):
        try:
            response = urllib3.request(
                'GET',
                f'https://pypi.org/simple/{project}',
                headers={"Accept": "application/vnd.pypi.simple.v1+json"},
            )
        except urllib3.exceptions.HTTPError as e:
            err_msg = f'Failed to fetch hashes for `{project}`: {e}'
        else:
            if response.status == 200:
                break

            if response.status == 404:
                # The project does not exist on PyPI; retrying cannot change
                # that, and there are no published hashes to compare against.
                return {}

            err_msg = f'Failed to fetch hashes for `{project}`, status code: {response.status}'

        if attempt == max_attempts:
            raise RuntimeError(err_msg)

        print(err_msg)
        print(f'Retrying in {retry_wait} seconds')
        time.sleep(retry_wait)
        # Exponential backoff between attempts.
        retry_wait *= 2

    data = response.json()
    return {
        file['filename']: file['hashes']['sha256']
        for file in data['files']
        if file['filename'].endswith('.whl') and 'sha256' in file['hashes']
    }
135+
136+
def wheel_was_built(wheel: Path) -> bool:
    """Tell whether a wheel differs from what PyPI publishes under its name.

    Returns True when PyPI lists no wheel with this filename for the project,
    or when the local file's SHA-256 differs from the published one. Returns
    False only when the filename is listed and the digests match.
    """
    metadata = extract_metadata(wheel)
    published_hashes = get_wheel_hashes(normalize_project_name(metadata['Name']))

    if wheel.name in published_hashes:
        local_digest = sha256(wheel.read_bytes()).hexdigest()
        return local_digest != published_hashes[wheel.name]

    return True
146+
147+
def remove_test_files(wheel_path: Path) -> bool:
    """Strip excluded files from a wheel and repack it in place.

    The wheel is unpacked, every file or directory for which
    `is_excluded_from_wheel` returns True is deleted, and the tree is packed
    again so that RECORD is regenerated.

    Args:
        wheel_path: Path of the wheel to process.

    Returns:
        False if the wheel contains nothing to exclude (it is left untouched),
        True if it was rewritten.
    """
    # First, check whether the wheel contains any files that should be excluded.
    # If not, leave it untouched and skip the rewrite entirely.
    with ZipFile(wheel_path, 'r') as zf:
        has_excluded_members = any(is_excluded_from_wheel(name) for name in zf.namelist())

    if not has_excluded_members:
        return False

    with TemporaryDirectory() as td:
        td_path = Path(td)

        # Unpack the wheel into the temp dir; the unpacked tree is its only entry.
        unpack(wheel_path, dest=td_path)
        unpacked_dir = next(td_path.iterdir())

        # Walk top-down from the wheel root so that paths are always relative
        # to the wheel, and excluded directories are pruned before descending.
        for root, dirs, files in os.walk(unpacked_dir):
            root_path = Path(root)

            kept_dirs = []
            for d in dirs:
                full_dir = root_path / d
                if is_excluded_from_wheel(full_dir.relative_to(unpacked_dir).as_posix()):
                    shutil.rmtree(full_dir)
                else:
                    kept_dirs.append(d)
            # In-place assignment is what tells os.walk not to visit removed dirs.
            dirs[:] = kept_dirs

            for f in files:
                full_file = root_path / f
                if is_excluded_from_wheel(full_file.relative_to(unpacked_dir).as_posix()):
                    os.remove(full_file)

        print(f'Tests removed from {wheel_path.name}')

        dest_dir = wheel_path.parent
        before = {p.resolve() for p in dest_dir.glob("*.whl")}
        # Repack to same directory, regenerating RECORD
        pack(unpacked_dir, dest_dir=dest_dir, build_number=None)

        # The wheel might not be platform-specific, so repacking restores its original name.
        # We need to move the repacked wheel to wheel_path, which was changed to be platform-specific.
        after = {p.resolve() for p in dest_dir.glob("*.whl")}
        new_files = sorted(after - before, key=lambda p: p.stat().st_mtime, reverse=True)

        # No new file means the repacked wheel was written over an existing name.
        if new_files:
            shutil.move(str(new_files[0]), str(wheel_path))

    return True
194+
195+
@cache
def _load_excluded_spec() -> pathspec.PathSpec:
    """
    Build the path matcher from the `excluded_paths` list in files_to_remove.toml,
    which sits next to this script. A missing key yields an empty pattern list.
    The result is cached, so the file is read only once per process.
    """
    toml_file = Path(__file__).parent / "files_to_remove.toml"
    with toml_file.open("rb") as fh:
        settings = tomllib.load(fh)

    return pathspec.PathSpec.from_lines("gitignore", settings.get("excluded_paths", []))
208+
209+
def is_excluded_from_wheel(path: str | Path) -> bool:
    """
    Return True if `path` (file or directory) matches the exclusion spec loaded
    from files_to_remove.toml.

    The path is tested twice: as given (in POSIX form) and with a trailing '/',
    so that patterns written with a trailing '/' also match a bare directory path.
    """
    spec = _load_excluded_spec()
    posix_path = Path(path).as_posix()
    candidates = (posix_path, posix_path + "/")

    return any(spec.match_file(candidate) for candidate in candidates)
224+
225+
def add_dependency(dependencies: dict[str, str], sizes: dict[str, WheelSizes], wheel: Path) -> None:
    """Record a wheel's version and sizes under its normalized project name.

    Both `dependencies` and `sizes` are updated in place; an existing entry
    for the same project is overwritten.
    """
    metadata = extract_metadata(wheel)
    name = normalize_project_name(metadata['Name'])
    version = metadata['Version']

    dependencies[name] = version

    # The sizes entry carries the version alongside the size figures.
    entry = {'version': version}
    entry.update(calculate_wheel_sizes(wheel))
    sizes[name] = entry
232+
233+
65234def calculate_wheel_sizes (wheel_path : Path ) -> WheelSizes :
66235 compressed_size = wheel_path .stat (follow_symlinks = True ).st_size
67236 with ZipFile (wheel_path ) as zf :
@@ -92,6 +261,13 @@ def main():
92261
93262 with TemporaryDirectory () as d :
94263 staged_wheel_dir = Path (d ).resolve ()
264+ staged_built_wheels_dir = staged_wheel_dir / 'built'
265+ staged_external_wheels_dir = staged_wheel_dir / 'external'
266+
267+ # Create the directories
268+ staged_built_wheels_dir .mkdir (parents = True , exist_ok = True )
269+ staged_external_wheels_dir .mkdir (parents = True , exist_ok = True )
270+
95271 env_vars = dict (os .environ )
96272 env_vars ['PATH' ] = f'{ python_path .parent } { os .pathsep } { env_vars ["PATH" ]} '
97273 env_vars ['PIP_WHEEL_DIR' ] = str (staged_wheel_dir )
@@ -131,25 +307,29 @@ def main():
131307 str (MOUNT_DIR / 'requirements.in' ),
132308 '--wheel-dir' ,
133309 str (staged_wheel_dir ),
134- # Temporarily removing extra index urls. See below.
135- # '--extra-index-url', CUSTOM_EXTERNAL_INDEX,
310+ '-- extra- index-url' ,
311+ CUSTOM_EXTERNAL_INDEX ,
136312 ]
137- # Temporarily disable extra index urls. There are broken wheels in the gcloud bucket
138- # while working on removing tests from them. Adding extra indices causes undefined behavior
139- # and can pull a broken image, preventing the building from running.
140- # if args.use_built_index:
141- # command_args.extend(['--extra-index-url', CUSTOM_BUILT_INDEX])
142313
143314 check_process (command_args , env = env_vars )
144315
316+ # Classify wheels
317+ for wheel in iter_wheels (staged_wheel_dir ):
318+ if wheel_was_built (wheel ):
319+ shutil .move (wheel , staged_built_wheels_dir )
320+ else :
321+ shutil .move (wheel , staged_external_wheels_dir )
322+
145323 # Repair wheels
146324 check_process (
147325 [
148326 sys .executable ,
149327 '-u' ,
150328 str (MOUNT_DIR / 'scripts' / 'repair_wheels.py' ),
151- '--source-dir' ,
152- str (staged_wheel_dir ),
329+ '--source-built-dir' ,
330+ str (staged_built_wheels_dir ),
331+ '--source-external-dir' ,
332+ str (staged_external_wheels_dir ),
153333 '--built-dir' ,
154334 str (built_wheels_dir ),
155335 '--external-dir' ,
@@ -160,15 +340,22 @@ def main():
160340 dependencies : dict [str , tuple [str , str ]] = {}
161341 sizes : dict [str , WheelSizes ] = {}
162342
163- for wheel_dir in wheels_dir .iterdir ():
164- for wheel in wheel_dir .iterdir ():
165- project_metadata = extract_metadata (wheel )
166- project_name = normalize_project_name (project_metadata ['Name' ])
167- project_version = project_metadata ['Version' ]
168- dependencies [project_name ] = project_version
169-
170-
171- sizes [project_name ] = {'version' : project_version , ** calculate_wheel_sizes (wheel )}
343+ # Handle wheels currently in the external directory and move them to the built directory if they were modified
344+ for wheel in iter_wheels (external_wheels_dir ):
345+ was_modified = remove_test_files (wheel )
346+ if was_modified :
347+ # A modified wheel is no longer external → move it to built directory
348+ new_path = built_wheels_dir / wheel .name
349+ wheel .rename (new_path )
350+ wheel = new_path
351+ print (f'Moved { wheel .name } to built directory' )
352+
353+ add_dependency (dependencies , sizes , wheel )
354+
355+ # Handle wheels already in the built directory
356+ for wheel in iter_wheels (built_wheels_dir ):
357+ remove_test_files (wheel )
358+ add_dependency (dependencies , sizes , wheel )
172359
173360 output_path = MOUNT_DIR / 'sizes.json'
174361 with output_path .open ('w' , encoding = 'utf-8' ) as fp :
0 commit comments