
Commit 531e19c

Merge branch 'main' of github.com:apache/iceberg-python into fd-rust
2 parents: 974e2e3 + 00222a4

File tree

24 files changed: +1174, -6725 lines


.github/workflows/pypi-build-artifacts.yml

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ jobs:
         if: startsWith(matrix.os, 'ubuntu')
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v3.0.1
+        uses: pypa/cibuildwheel@v3.1.3
         with:
           output-dir: wheelhouse
           config-file: "pyproject.toml"

.github/workflows/python-ci.yml

Lines changed: 0 additions & 2 deletions
@@ -56,8 +56,6 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python }}
-          cache: poetry
-          cache-dependency-path: ./poetry.lock
       - name: Install system dependencies
         run: sudo apt-get update && sudo apt-get install -y libkrb5-dev # for kerberos
       - name: Install

.github/workflows/svn-build-artifacts.yml

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ jobs:
         if: startsWith(matrix.os, 'ubuntu')
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v3.0.1
+        uses: pypa/cibuildwheel@v3.1.3
         with:
           output-dir: wheelhouse
           config-file: "pyproject.toml"

Makefile

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ else
 TEST_RUNNER = poetry run
 endif
 
-POETRY_VERSION = 2.1.1
+POETRY_VERSION = 2.1.3
 
 # ============
 # Help Section

mkdocs/docs/configuration.md

Lines changed: 2 additions & 0 deletions
@@ -389,6 +389,7 @@ The RESTCatalog supports pluggable authentication via the `auth` configuration b
 - `noop`: No authentication (no Authorization header sent).
 - `basic`: HTTP Basic authentication.
 - `custom`: Custom authentication manager (requires `auth.impl`).
+- `google`: Google authentication support.
 
 ###### Configuration Properties
 
@@ -414,6 +415,7 @@ catalog:
 | `auth.impl` | Conditionally | The fully qualified class path for a custom AuthManager. Required if `auth.type` is `custom`. |
 | `auth.basic` | If type is `basic` | Block containing `username` and `password` for HTTP Basic authentication. |
 | `auth.custom` | If type is `custom` | Block containing configuration for the custom AuthManager. |
+| `auth.google` | If type is `google` | Block containing an optional `credentials_path` to a service account file. Defaults to Application Default Credentials. |
 
 ###### Examples
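For reference, the same option can be exercised from Python; a minimal sketch, assuming a hypothetical REST endpoint and that the dotted property form (`auth.type`, `auth.google.credentials_path`) mirrors the nested YAML `auth` block:

```python
from pyiceberg.catalog import load_catalog

# A hedged sketch: the catalog URI and service-account path are placeholders.
# Omit "auth.google.credentials_path" to fall back to Application Default
# Credentials, per the table above.
catalog = load_catalog(
    "rest",
    **{
        "uri": "https://rest-catalog.example.com",
        "auth.type": "google",
        "auth.google.credentials_path": "/path/to/service-account.json",
    },
)
```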

poetry.lock

Lines changed: 0 additions & 6595 deletions
This file was deleted.

pyiceberg/catalog/rest/auth.py

Lines changed: 33 additions & 1 deletion
@@ -17,8 +17,9 @@
 
 import base64
 import importlib
+import logging
 from abc import ABC, abstractmethod
-from typing import Any, Dict, Optional, Type
+from typing import Any, Dict, List, Optional, Type
 
 from requests import HTTPError, PreparedRequest, Session
 from requests.auth import AuthBase
@@ -27,6 +28,7 @@
 from pyiceberg.exceptions import OAuthError
 
 COLON = ":"
+logger = logging.getLogger(__name__)
 
 
 class AuthManager(ABC):
@@ -119,6 +121,35 @@ def auth_header(self) -> str:
         return f"Bearer {self._token}"
 
 
+class GoogleAuthManager(AuthManager):
+    """An auth manager that is responsible for handling Google credentials."""
+
+    def __init__(self, credentials_path: Optional[str] = None, scopes: Optional[List[str]] = None):
+        """Initialize GoogleAuthManager.
+
+        Args:
+            credentials_path: Optional path to a Google credentials JSON file.
+            scopes: Optional list of OAuth2 scopes.
+        """
+        try:
+            import google.auth
+            import google.auth.transport.requests
+        except ImportError as e:
+            raise ImportError("Google Auth libraries not found. Please install 'google-auth'.") from e
+
+        if credentials_path:
+            self.credentials, _ = google.auth.load_credentials_from_file(credentials_path, scopes=scopes)
+        else:
+            logger.info("Using Google Application Default Credentials")
+            self.credentials, _ = google.auth.default(scopes=scopes)
+        self._auth_request = google.auth.transport.requests.Request()
+
+    def auth_header(self) -> str:
+        self.credentials.refresh(self._auth_request)
+        return f"Bearer {self.credentials.token}"
+
+
 class AuthManagerAdapter(AuthBase):
     """A `requests.auth.AuthBase` adapter that integrates an `AuthManager` into a `requests.Session` to automatically attach the appropriate Authorization header to every request.
 
@@ -197,3 +228,4 @@ def create(cls, class_or_name: str, config: Dict[str, Any]) -> AuthManager:
 AuthManagerFactory.register("noop", NoopAuthManager)
 AuthManagerFactory.register("basic", BasicAuthManager)
 AuthManagerFactory.register("legacyoauth2", LegacyOAuth2AuthManager)
+AuthManagerFactory.register("google", GoogleAuthManager)

pyiceberg/expressions/visitors.py

Lines changed: 22 additions & 7 deletions
@@ -861,6 +861,7 @@ class _ColumnNameTranslator(BooleanExpressionVisitor[BooleanExpression]):
     Args:
         file_schema (Schema): The schema of the file.
         case_sensitive (bool): Whether to consider case when binding a reference to a field in a schema, defaults to True.
+        projected_field_values (Dict[int, Any]): Values for projected fields not present in the data file.
 
     Raises:
         TypeError: In the case of an UnboundPredicate.
@@ -869,10 +870,12 @@ class _ColumnNameTranslator(BooleanExpressionVisitor[BooleanExpression]):
 
     file_schema: Schema
     case_sensitive: bool
+    projected_field_values: Dict[int, Any]
 
-    def __init__(self, file_schema: Schema, case_sensitive: bool) -> None:
+    def __init__(self, file_schema: Schema, case_sensitive: bool, projected_field_values: Dict[int, Any] = EMPTY_DICT) -> None:
         self.file_schema = file_schema
         self.case_sensitive = case_sensitive
+        self.projected_field_values = projected_field_values
 
     def visit_true(self) -> BooleanExpression:
         return AlwaysTrue()
@@ -894,12 +897,12 @@ def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> BooleanExpr
 
     def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> BooleanExpression:
         field = predicate.term.ref().field
-        file_column_name = self.file_schema.find_column_name(field.field_id)
+        field_id = field.field_id
+        file_column_name = self.file_schema.find_column_name(field_id)
 
         if file_column_name is None:
-            # In the case of schema evolution, the column might not be present
-            # we can use the default value as a constant and evaluate it against
-            # the predicate
+            # In the case of schema evolution or column projection, the field might not be present in the file schema.
+            # We can use the projected value or the field's default value as a constant and evaluate it against the predicate.
             pred: BooleanExpression
             if isinstance(predicate, BoundUnaryPredicate):
                 pred = predicate.as_unbound(field.name)
@@ -910,6 +913,16 @@ def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> BooleanExpressi
             else:
                 raise ValueError(f"Unsupported predicate: {predicate}")
 
+            # In the order described by the "Column Projection" section of the Iceberg spec:
+            # https://iceberg.apache.org/spec/#column-projection
+            # Evaluate the column projection value first, if one exists
+            if field_id in self.projected_field_values:
+                if expression_evaluator(Schema(field), pred, case_sensitive=self.case_sensitive)(
+                    Record(self.projected_field_values[field_id])
+                ):
+                    return AlwaysTrue()
+
+            # Then evaluate the initial_default value
             return (
                 AlwaysTrue()
                 if expression_evaluator(Schema(field), pred, case_sensitive=self.case_sensitive)(Record(field.initial_default))
@@ -926,8 +939,10 @@ def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> BooleanExpressi
             raise ValueError(f"Unsupported predicate: {predicate}")
 
 
-def translate_column_names(expr: BooleanExpression, file_schema: Schema, case_sensitive: bool) -> BooleanExpression:
-    return visit(expr, _ColumnNameTranslator(file_schema, case_sensitive))
+def translate_column_names(
+    expr: BooleanExpression, file_schema: Schema, case_sensitive: bool, projected_field_values: Dict[int, Any] = EMPTY_DICT
+) -> BooleanExpression:
+    return visit(expr, _ColumnNameTranslator(file_schema, case_sensitive, projected_field_values))
 
 
 class _ExpressionFieldIDs(BooleanExpressionVisitor[Set[int]]):
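To see the new `projected_field_values` path in action, here is a small sketch under assumed schemas and field ids (field 2 stands in for an identity-partition column missing from the file schema): a predicate on the missing field folds to `AlwaysTrue()` when the projected value satisfies it.

```python
from pyiceberg.expressions import EqualTo
from pyiceberg.expressions.visitors import bind, translate_column_names
from pyiceberg.schema import Schema
from pyiceberg.types import NestedField, StringType

# The table schema has "dept" (field id 2); the file schema does not, e.g.
# because the value only lives in the file's partition metadata.
table_schema = Schema(
    NestedField(1, "name", StringType(), required=False),
    NestedField(2, "dept", StringType(), required=False),
)
file_schema = Schema(NestedField(1, "name", StringType(), required=False))

bound = bind(table_schema, EqualTo("dept", "eng"), case_sensitive=True)
translated = translate_column_names(
    bound, file_schema, case_sensitive=True, projected_field_values={2: "eng"}
)
print(translated)  # AlwaysTrue(): every row in this file matches the predicate
```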

pyiceberg/io/pyarrow.py

Lines changed: 31 additions & 50 deletions
@@ -131,7 +131,6 @@
 )
 from pyiceberg.partitioning import PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec, partition_record_value
 from pyiceberg.schema import (
-    Accessor,
     PartnerAccessor,
     PreOrderSchemaVisitor,
     Schema,
@@ -1402,41 +1401,23 @@ def _field_id(self, field: pa.Field) -> int:
 
 def _get_column_projection_values(
     file: DataFile, projected_schema: Schema, partition_spec: Optional[PartitionSpec], file_project_field_ids: Set[int]
-) -> Tuple[bool, Dict[str, Any]]:
+) -> Dict[int, Any]:
     """Apply Column Projection rules to File Schema."""
     project_schema_diff = projected_schema.field_ids.difference(file_project_field_ids)
-    should_project_columns = len(project_schema_diff) > 0
-    projected_missing_fields: Dict[str, Any] = {}
-
-    if not should_project_columns:
-        return False, {}
-
-    partition_schema: StructType
-    accessors: Dict[int, Accessor]
-
-    if partition_spec is not None:
-        partition_schema = partition_spec.partition_type(projected_schema)
-        accessors = build_position_accessors(partition_schema)
-    else:
-        return False, {}
+    if len(project_schema_diff) == 0 or partition_spec is None:
+        return EMPTY_DICT
 
+    partition_schema = partition_spec.partition_type(projected_schema)
+    accessors = build_position_accessors(partition_schema)
+
+    projected_missing_fields = {}
     for field_id in project_schema_diff:
         for partition_field in partition_spec.fields_by_source_id(field_id):
             if isinstance(partition_field.transform, IdentityTransform):
-                accessor = accessors.get(partition_field.field_id)
-
-                if accessor is None:
-                    continue
-
-                # The partition field may not exist in the partition record of the data file.
-                # This can happen when new partition fields are introduced after the file was written.
-                try:
-                    if partition_value := accessor.get(file.partition):
-                        projected_missing_fields[partition_field.name] = partition_value
-                except IndexError:
-                    continue
-
-    return True, projected_missing_fields
+                if partition_value := accessors[partition_field.field_id].get(file.partition):
+                    projected_missing_fields[field_id] = partition_value
 
+    return projected_missing_fields
 
 
 def _task_to_record_batches(
@@ -1460,18 +1441,19 @@ def _task_to_record_batches(
     # the table format version.
     file_schema = pyarrow_to_schema(physical_schema, name_mapping, downcast_ns_timestamp_to_us=True)
 
+    # Apply column projection rules: https://iceberg.apache.org/spec/#column-projection
+    projected_missing_fields = _get_column_projection_values(
+        task.file, projected_schema, partition_spec, file_schema.field_ids
+    )
+
     pyarrow_filter = None
     if bound_row_filter is not AlwaysTrue():
-        translated_row_filter = translate_column_names(bound_row_filter, file_schema, case_sensitive=case_sensitive)
+        translated_row_filter = translate_column_names(
+            bound_row_filter, file_schema, case_sensitive=case_sensitive, projected_field_values=projected_missing_fields
+        )
         bound_file_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive)
         pyarrow_filter = expression_to_pyarrow(bound_file_filter)
 
-    # Apply column projection rules
-    # https://iceberg.apache.org/spec/#column-projection
-    should_project_columns, projected_missing_fields = _get_column_projection_values(
-        task.file, projected_schema, partition_spec, file_schema.field_ids
-    )
-
     file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False)
 
     fragment_scanner = ds.Scanner.from_fragment(
@@ -1510,23 +1492,14 @@ def _task_to_record_batches(
 
             current_batch = table.combine_chunks().to_batches()[0]
 
-            result_batch = _to_requested_schema(
+            yield _to_requested_schema(
                 projected_schema,
                 file_project_schema,
                 current_batch,
                 downcast_ns_timestamp_to_us=True,
+                projected_missing_fields=projected_missing_fields,
             )
 
-            # Inject projected column values if available
-            if should_project_columns:
-                for name, value in projected_missing_fields.items():
-                    index = result_batch.schema.get_field_index(name)
-                    if index != -1:
-                        arr = pa.repeat(value.value(), result_batch.num_rows)
-                        result_batch = result_batch.set_column(index, name, arr)
-
-            yield result_batch
-
 
 def _read_all_delete_files(io: FileIO, tasks: Iterable[FileScanTask]) -> Dict[str, List[ChunkedArray]]:
     deletes_per_file: Dict[str, List[ChunkedArray]] = {}
@@ -1694,7 +1667,7 @@ def _record_batches_from_scan_tasks_and_deletes(
                 deletes_per_file.get(task.file.file_path),
                 self._case_sensitive,
                 self._table_metadata.name_mapping(),
-                self._table_metadata.spec(),
+                self._table_metadata.specs().get(task.file.spec_id),
             )
             for batch in batches:
                 if self._limit is not None:
@@ -1712,12 +1685,15 @@ def _to_requested_schema(
     batch: pa.RecordBatch,
     downcast_ns_timestamp_to_us: bool = False,
     include_field_ids: bool = False,
+    projected_missing_fields: Dict[int, Any] = EMPTY_DICT,
 ) -> pa.RecordBatch:
     # We could reuse some of these visitors
     struct_array = visit_with_partner(
         requested_schema,
         batch,
-        ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids),
+        ArrowProjectionVisitor(
+            file_schema, downcast_ns_timestamp_to_us, include_field_ids, projected_missing_fields=projected_missing_fields
+        ),
         ArrowAccessor(file_schema),
     )
     return pa.RecordBatch.from_struct_array(struct_array)
@@ -1728,18 +1704,21 @@ class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, Optional[pa.Arra
     _include_field_ids: bool
     _downcast_ns_timestamp_to_us: bool
     _use_large_types: Optional[bool]
+    _projected_missing_fields: Dict[int, Any]
 
     def __init__(
        self,
        file_schema: Schema,
        downcast_ns_timestamp_to_us: bool = False,
        include_field_ids: bool = False,
        use_large_types: Optional[bool] = None,
+       projected_missing_fields: Dict[int, Any] = EMPTY_DICT,
     ) -> None:
        self._file_schema = file_schema
        self._include_field_ids = include_field_ids
        self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
        self._use_large_types = use_large_types
+       self._projected_missing_fields = projected_missing_fields
 
        if use_large_types is not None:
            deprecation_message(
@@ -1819,10 +1798,12 @@ def struct(
             elif field.optional or field.initial_default is not None:
                 # When an optional field is added, or when a required field with a non-null initial default is added
                 arrow_type = schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)
-                if field.initial_default is None:
+                if projected_value := self._projected_missing_fields.get(field.field_id):
+                    field_arrays.append(pa.repeat(pa.scalar(projected_value, type=arrow_type), len(struct_array)))
+                elif field.initial_default is None:
                     field_arrays.append(pa.nulls(len(struct_array), type=arrow_type))
                 else:
-                    field_arrays.append(pa.repeat(field.initial_default, len(struct_array)))
+                    field_arrays.append(pa.repeat(pa.scalar(field.initial_default, type=arrow_type), len(struct_array)))
                 fields.append(self._construct_field(field, arrow_type))
             else:
                 raise ResolveError(f"Field is required, and could not be found in the file: {field}")
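One detail worth noting in the `ArrowProjectionVisitor` change: wrapping the fill value in `pa.scalar(..., type=arrow_type)` pins the Arrow type of the constant column to the requested schema, rather than whatever `pa.repeat` would infer from the raw Python value. A standalone sketch of that behavior:

```python
import pyarrow as pa

# Fill four rows with a constant, typed explicitly so the resulting array
# carries the schema's type instead of one inferred from the Python value.
arrow_type = pa.int32()
filled = pa.repeat(pa.scalar(42, type=arrow_type), 4)
print(filled.type)  # int32
print(filled)       # [42, 42, 42, 42]
```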

pyiceberg/manifest.py

Lines changed: 8 additions & 0 deletions
@@ -900,6 +900,14 @@ def _convert_entry(entry: Any) -> ManifestEntry:
             if not discard_deleted or entry.status != ManifestEntryStatus.DELETED
         ]
 
+    def __eq__(self, other: Any) -> bool:
+        """Return the equality of two instances of the ManifestFile class."""
+        return self.manifest_path == other.manifest_path if isinstance(other, ManifestFile) else False
+
+    def __hash__(self) -> int:
+        """Return the hash of manifest_path."""
+        return hash(self.manifest_path)
+
 
 @cached(cache=LRUCache(maxsize=128), key=lambda io, manifest_list: hashkey(manifest_list))
 def _manifests(io: FileIO, manifest_list: str) -> Tuple[ManifestFile, ...]:
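The effect of the new `__eq__`/`__hash__` pair is that manifest identity follows `manifest_path`, so instances deduplicate in sets and can serve as cache keys; a self-contained sketch using a hypothetical stand-in class:

```python
class PathKeyed:
    """Hypothetical stand-in mirroring ManifestFile's path-based identity."""

    def __init__(self, manifest_path: str) -> None:
        self.manifest_path = manifest_path

    def __eq__(self, other: object) -> bool:
        return self.manifest_path == other.manifest_path if isinstance(other, PathKeyed) else False

    def __hash__(self) -> int:
        return hash(self.manifest_path)


# Two descriptors for the same path collapse to one set entry.
assert len({PathKeyed("s3://bucket/m1.avro"), PathKeyed("s3://bucket/m1.avro")}) == 1
```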
