@@ -523,9 +523,11 @@ def history(self) -> "pa.Table":
523523
524524 return pa .Table .from_pylist (history , schema = history_schema )
525525
526- def _files_by_manifest (
526+ def _get_files_from_manifest (
527527 self , manifest_list : ManifestFile , data_file_filter : Optional [Set [DataFileContent ]] = None
528- ) -> List [Dict [str , Any ]]:
528+ ) -> "pa.Table" :
529+ import pyarrow as pa
530+
529531 files : list [dict [str , Any ]] = []
530532 schema = self .tbl .metadata .schema ()
531533 io = self .tbl .io
@@ -576,7 +578,10 @@ def _files_by_manifest(
576578 "readable_metrics" : readable_metrics ,
577579 }
578580 )
579- return files
581+ return pa .Table .from_pylist (
582+ files ,
583+ schema = self ._get_files_schema (),
584+ )
580585
581586 def _get_files_schema (self ) -> "pa.Schema" :
582587 import pyarrow as pa
@@ -630,23 +635,20 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
630635 def _files (self , snapshot_id : Optional [int ] = None , data_file_filter : Optional [Set [DataFileContent ]] = None ) -> "pa.Table" :
631636 import pyarrow as pa
632637
633- files : list [dict [ str , Any ] ] = []
638+ files_table : list [pa . Table ] = []
634639
635640 if not snapshot_id and not self .tbl .metadata .current_snapshot ():
636641 return pa .Table .from_pylist (
637- files ,
642+ [] ,
638643 schema = self ._get_files_schema (),
639644 )
640645 snapshot = self ._get_snapshot (snapshot_id )
641646
642647 io = self .tbl .io
643648 for manifest_list in snapshot .manifests (io ):
644- files . extend (self ._files_by_manifest (manifest_list , data_file_filter ))
649+ files_table . append (self ._get_files_from_manifest (manifest_list , data_file_filter ))
645650
646- return pa .Table .from_pylist (
647- files ,
648- schema = self ._get_files_schema (),
649- )
651+ return pa .concat_tables (files_table )
650652
651653 def files (self , snapshot_id : Optional [int ] = None ) -> "pa.Table" :
652654 return self ._files (snapshot_id )
@@ -678,21 +680,16 @@ def _all_files(self, data_file_filter: Optional[Set[DataFileContent]] = None) ->
678680 return pa .Table .from_pylist ([], schema = self ._get_files_schema ())
679681
680682 executor = ExecutorFactory .get_or_create ()
681- all_manifest_files_by_snapshot : Iterator [List [ManifestFile ]] = executor .map (
682- lambda args : args [0 ].manifests (self .tbl .io ), [(snapshot ,) for snapshot in snapshots ]
683- )
684- all_manifest_files = list (
685- {(manifest .manifest_path , manifest ) for manifest_list in all_manifest_files_by_snapshot for manifest in manifest_list }
686- )
687- all_files_by_manifest : Iterator [List [Dict [str , Any ]]] = executor .map (
688- lambda args : self ._files_by_manifest (* args ), [(manifest , data_file_filter ) for _ , manifest in all_manifest_files ]
689- )
690- all_files_list = [file for files in all_files_by_manifest for file in files ]
691- return pa .Table .from_pylist (
692- all_files_list ,
693- schema = self ._get_files_schema (),
683+ manifest_lists = executor .map (lambda snapshot : snapshot .manifests (self .tbl .io ), snapshots )
684+
685+ unique_manifests = {(manifest .manifest_path , manifest ) for manifest_list in manifest_lists for manifest in manifest_list }
686+
687+ file_lists = executor .map (
688+ lambda args : self ._get_files_from_manifest (* args ), [(manifest , data_file_filter ) for _ , manifest in unique_manifests ]
694689 )
695690
691+ return pa .concat_tables (file_lists )
692+
696693 def all_files (self ) -> "pa.Table" :
697694 return self ._all_files ()
698695
0 commit comments