 )
 from pyiceberg.partitioning import PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec, partition_record_value
 from pyiceberg.schema import (
-    Accessor,
     PartnerAccessor,
     PreOrderSchemaVisitor,
     Schema,
@@ -1402,41 +1401,23 @@ def _field_id(self, field: pa.Field) -> int:

 def _get_column_projection_values(
     file: DataFile, projected_schema: Schema, partition_spec: Optional[PartitionSpec], file_project_field_ids: Set[int]
-) -> Tuple[bool, Dict[str, Any]]:
+) -> Dict[int, Any]:
     """Apply Column Projection rules to File Schema."""
     project_schema_diff = projected_schema.field_ids.difference(file_project_field_ids)
-    should_project_columns = len(project_schema_diff) > 0
-    projected_missing_fields: Dict[str, Any] = {}
+    if len(project_schema_diff) == 0 or partition_spec is None:
+        return EMPTY_DICT

-    if not should_project_columns:
-        return False, {}
-
-    partition_schema: StructType
-    accessors: Dict[int, Accessor]
-
-    if partition_spec is not None:
-        partition_schema = partition_spec.partition_type(projected_schema)
-        accessors = build_position_accessors(partition_schema)
-    else:
-        return False, {}
+    partition_schema = partition_spec.partition_type(projected_schema)
+    accessors = build_position_accessors(partition_schema)

+    projected_missing_fields = {}
     for field_id in project_schema_diff:
         for partition_field in partition_spec.fields_by_source_id(field_id):
             if isinstance(partition_field.transform, IdentityTransform):
-                accessor = accessors.get(partition_field.field_id)
+                if partition_value := accessors[partition_field.field_id].get(file.partition):
+                    projected_missing_fields[field_id] = partition_value

-                if accessor is None:
-                    continue
-
-                # The partition field may not exist in the partition record of the data file.
-                # This can happen when new partition fields are introduced after the file was written.
-                try:
-                    if partition_value := accessor.get(file.partition):
-                        projected_missing_fields[partition_field.name] = partition_value
-                except IndexError:
-                    continue
-
-    return True, projected_missing_fields
+    return projected_missing_fields


 def _task_to_record_batches(
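Note on the refactor above: the helper now returns a single mapping from schema field id to the value recovered from the file's partition record (or `EMPTY_DICT`), replacing the old `(bool, Dict[str, Any])` tuple keyed by partition-field name. A minimal sketch of the new contract, assuming a hypothetical table with an identity partition on a `region` column (field id 2) and a data file written before that column existed; the values are made up, the call matches the new signature:

```python
# Sketch only. Field id 2 ("region") is in the projected schema but not in
# the file, and the spec carries an identity transform on it, so its value
# is resolved from the file's partition record.
projected_missing_fields = _get_column_projection_values(
    task.file,              # DataFile whose partition record holds region="eu"
    projected_schema,       # table schema containing field id 2
    partition_spec,         # spec with IdentityTransform on source id 2
    file_schema.field_ids,  # field ids physically present in the data file
)
assert projected_missing_fields == {2: "eu"}  # field id -> constant value
```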
@@ -1460,18 +1441,19 @@ def _task_to_record_batches(
     # the table format version.
     file_schema = pyarrow_to_schema(physical_schema, name_mapping, downcast_ns_timestamp_to_us=True)

+    # Apply column projection rules: https://iceberg.apache.org/spec/#column-projection
+    projected_missing_fields = _get_column_projection_values(
+        task.file, projected_schema, partition_spec, file_schema.field_ids
+    )
+
     pyarrow_filter = None
     if bound_row_filter is not AlwaysTrue():
-        translated_row_filter = translate_column_names(bound_row_filter, file_schema, case_sensitive=case_sensitive)
+        translated_row_filter = translate_column_names(
+            bound_row_filter, file_schema, case_sensitive=case_sensitive, projected_field_values=projected_missing_fields
+        )
         bound_file_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive)
         pyarrow_filter = expression_to_pyarrow(bound_file_filter)

-    # Apply column projection rules
-    # https://iceberg.apache.org/spec/#column-projection
-    should_project_columns, projected_missing_fields = _get_column_projection_values(
-        task.file, projected_schema, partition_spec, file_schema.field_ids
-    )
-
     file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False)

     fragment_scanner = ds.Scanner.from_fragment(
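The projection values are now computed *before* the row filter is translated and threaded into `translate_column_names` through the new `projected_field_values` keyword (the keyword itself is introduced elsewhere in this change). The point of the reordering: a predicate on a column that exists only as partition metadata can be resolved against the projected constant instead of referencing a column the file does not contain. A hedged illustration, with a made-up filter:

```python
# Hypothetical: the file lacks a physical "region" column, but the scan
# filter references it. With projected_field_values={2: "eu"} available,
# the translated filter can be decided from the constant:
row_filter = EqualTo("region", "eu")  # agrees with the projected constant
# -> effectively always true for this file's rows
row_filter = EqualTo("region", "us")  # contradicts the projected constant
# -> the file's rows can be filtered out entirely
```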
@@ -1510,23 +1492,14 @@ def _task_to_record_batches(

         current_batch = table.combine_chunks().to_batches()[0]

-        result_batch = _to_requested_schema(
+        yield _to_requested_schema(
             projected_schema,
             file_project_schema,
             current_batch,
             downcast_ns_timestamp_to_us=True,
+            projected_missing_fields=projected_missing_fields,
         )

-        # Inject projected column values if available
-        if should_project_columns:
-            for name, value in projected_missing_fields.items():
-                index = result_batch.schema.get_field_index(name)
-                if index != -1:
-                    arr = pa.repeat(value.value(), result_batch.num_rows)
-                    result_batch = result_batch.set_column(index, name, arr)
-
-        yield result_batch
-

 def _read_all_delete_files(io: FileIO, tasks: Iterable[FileScanTask]) -> Dict[str, List[ChunkedArray]]:
     deletes_per_file: Dict[str, List[ChunkedArray]] = {}
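With the projection pushed into `ArrowProjectionVisitor`, the post-hoc `set_column` loop (which matched columns by name) goes away and the batch is yielded directly; missing fields are filled in by field id during schema projection, which also survives column renames. What the visitor materializes internally is a typed constant column, roughly:

```python
import pyarrow as pa

# A constant column for a 3-row batch; typing the scalar pins the Arrow
# type instead of letting pa.repeat infer one from the raw Python value.
arr = pa.repeat(pa.scalar("eu", type=pa.string()), 3)
assert arr.to_pylist() == ["eu", "eu", "eu"]
```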
@@ -1694,7 +1667,7 @@ def _record_batches_from_scan_tasks_and_deletes(
             deletes_per_file.get(task.file.file_path),
             self._case_sensitive,
             self._table_metadata.name_mapping(),
-            self._table_metadata.spec(),
+            self._table_metadata.specs().get(task.file.spec_id),
         )
         for batch in batches:
             if self._limit is not None:
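`_task_to_record_batches` now receives the partition spec the file was actually written under, looked up via `task.file.spec_id`, rather than `spec()`, which returns only the table's current default. After partition evolution the two can differ, and projecting with the wrong spec would read the wrong (or no) partition values. Roughly (the spec ids below are illustrative):

```python
# Illustrative: a table evolved from unpartitioned (spec 0) to partitioned
# by region (spec 1). Files written earlier keep spec_id=0 in their metadata.
specs = self._table_metadata.specs()        # {0: spec_v0, 1: spec_v1}
file_spec = specs.get(task.file.spec_id)    # spec the file was written with
current = self._table_metadata.spec()       # always the latest spec (spec_v1)
```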
@@ -1712,12 +1685,15 @@ def _to_requested_schema(
     batch: pa.RecordBatch,
     downcast_ns_timestamp_to_us: bool = False,
     include_field_ids: bool = False,
+    projected_missing_fields: Dict[int, Any] = EMPTY_DICT,
 ) -> pa.RecordBatch:
     # We could reuse some of these visitors
     struct_array = visit_with_partner(
         requested_schema,
         batch,
-        ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids),
+        ArrowProjectionVisitor(
+            file_schema, downcast_ns_timestamp_to_us, include_field_ids, projected_missing_fields=projected_missing_fields
+        ),
         ArrowAccessor(file_schema),
     )
     return pa.RecordBatch.from_struct_array(struct_array)
@@ -1728,18 +1704,21 @@ class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, Optional[pa.Arra
     _include_field_ids: bool
     _downcast_ns_timestamp_to_us: bool
     _use_large_types: Optional[bool]
+    _projected_missing_fields: Dict[int, Any]

     def __init__(
         self,
         file_schema: Schema,
         downcast_ns_timestamp_to_us: bool = False,
         include_field_ids: bool = False,
         use_large_types: Optional[bool] = None,
+        projected_missing_fields: Dict[int, Any] = EMPTY_DICT,
     ) -> None:
         self._file_schema = file_schema
         self._include_field_ids = include_field_ids
         self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
         self._use_large_types = use_large_types
+        self._projected_missing_fields = projected_missing_fields

         if use_large_types is not None:
             deprecation_message(
@@ -1819,10 +1798,12 @@ def struct(
             elif field.optional or field.initial_default is not None:
                 # When an optional field is added, or when a required field with a non-null initial default is added
                 arrow_type = schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)
-                if field.initial_default is None:
+                if projected_value := self._projected_missing_fields.get(field.field_id):
+                    field_arrays.append(pa.repeat(pa.scalar(projected_value, type=arrow_type), len(struct_array)))
+                elif field.initial_default is None:
                     field_arrays.append(pa.nulls(len(struct_array), type=arrow_type))
                 else:
-                    field_arrays.append(pa.repeat(field.initial_default, len(struct_array)))
+                    field_arrays.append(pa.repeat(pa.scalar(field.initial_default, type=arrow_type), len(struct_array)))
                 fields.append(self._construct_field(field, arrow_type))
             else:
                 raise ResolveError(f"Field is required, and could not be found in the file: {field}")
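The `initial_default` branch also gains a `pa.scalar(..., type=arrow_type)` wrapper, so the repeated default is created with the field's declared Arrow type rather than whatever type PyArrow infers from the raw Python value. A quick check of the difference:

```python
import pyarrow as pa

assert pa.repeat(1, 2).type == pa.int64()  # bare Python int: inferred type
assert pa.repeat(pa.scalar(1, type=pa.int32()), 2).type == pa.int32()  # pinned
```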