@@ -2410,9 +2410,12 @@ def data_file_statistics_from_parquet_metadata(
24102410 continue
24112411
24122412 if field_id not in col_aggs :
2413- col_aggs [field_id ] = StatsAggregator (
2414- stats_col .iceberg_type , statistics .physical_type , stats_col .mode .length
2415- )
2413+ try :
2414+ col_aggs [field_id ] = StatsAggregator (
2415+ stats_col .iceberg_type , statistics .physical_type , stats_col .mode .length
2416+ )
2417+ except ValueError as e :
2418+ raise ValueError (f"{ e } for column '{ stats_col .column_name } '" ) from e
24162419
24172420 if isinstance (stats_col .iceberg_type , DecimalType ) and statistics .physical_type != "FIXED_LEN_BYTE_ARRAY" :
24182421 scale = stats_col .iceberg_type .scale
@@ -2728,9 +2731,11 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T
27282731
27292732 for partition , name in zip (spec .fields , partition_fields ):
27302733 source_field = schema .find_field (partition .source_id )
2731- arrow_table = arrow_table .append_column (
2732- name , partition .transform .pyarrow_transform (source_field .field_type )(arrow_table [source_field .name ])
2733- )
2734+ full_field_name = schema .find_column_name (partition .source_id )
2735+ if full_field_name is None :
2736+ raise ValueError (f"Could not find column name for field ID: { partition .source_id } " )
2737+ field_array = _get_field_from_arrow_table (arrow_table , full_field_name )
2738+ arrow_table = arrow_table .append_column (name , partition .transform .pyarrow_transform (source_field .field_type )(field_array ))
27342739
27352740 unique_partition_fields = arrow_table .select (partition_fields ).group_by (partition_fields ).aggregate ([])
27362741
@@ -2765,3 +2770,32 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T
27652770 )
27662771
27672772 return table_partitions
2773+
2774+
2775+ def _get_field_from_arrow_table (arrow_table : pa .Table , field_path : str ) -> pa .Array :
2776+ """Get a field from an Arrow table, supporting both literal field names and nested field paths.
2777+
2778+ This function handles two cases:
2779+ 1. Literal field names that may contain dots (e.g., "some.id")
2780+ 2. Nested field paths using dot notation (e.g., "bar.baz" for nested access)
2781+
2782+ Args:
2783+ arrow_table: The Arrow table containing the field
2784+ field_path: Field name or dot-separated path
2785+
2786+ Returns:
2787+ The field as a PyArrow Array
2788+
2789+ Raises:
2790+ KeyError: If the field path cannot be resolved
2791+ """
2792+ # Try exact column name match (handles field names containing literal dots)
2793+ if field_path in arrow_table .column_names :
2794+ return arrow_table [field_path ]
2795+
2796+ # If not found as exact name, treat as nested field path
2797+ path_parts = field_path .split ("." )
2798+ # Get the struct column from the table (e.g., "bar" from "bar.baz")
2799+ field_array = arrow_table [path_parts [0 ]]
2800+ # Navigate into the struct using the remaining path parts
2801+ return pc .struct_field (field_array , path_parts [1 :])
0 commit comments