diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
index 8ff299ce6a..62de9a4d2e 100644
--- a/pyiceberg/table/__init__.py
+++ b/pyiceberg/table/__init__.py
@@ -579,7 +579,9 @@ def overwrite(
             self.table_metadata.schema(), provided_schema=df.schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us
         )
 
-        self.delete(delete_filter=overwrite_filter, case_sensitive=case_sensitive, snapshot_properties=snapshot_properties)
+        if overwrite_filter != AlwaysFalse():
+            # Only delete when the filter is != AlwaysFalse
+            self.delete(delete_filter=overwrite_filter, case_sensitive=case_sensitive, snapshot_properties=snapshot_properties)
 
         with self._append_snapshot_producer(snapshot_properties) as append_files:
             # skip writing data files if the dataframe is empty
diff --git a/pyiceberg/table/upsert_util.py b/pyiceberg/table/upsert_util.py
index c12351d45c..6cbafcd093 100644
--- a/pyiceberg/table/upsert_util.py
+++ b/pyiceberg/table/upsert_util.py
@@ -67,6 +67,10 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols
 
     non_key_cols = list(all_columns - join_cols_set)
 
+    if len(target_table) == 0:
+        # When the target table is empty, there is nothing to update :)
+        return source_table.schema.empty_table()
+
     match_expr = functools.reduce(operator.and_, [pc.field(col).isin(target_table.column(col).to_pylist()) for col in join_cols])
 
     matching_source_rows = source_table.filter(match_expr)
diff --git a/tests/table/test_upsert.py b/tests/table/test_upsert.py
index c97015e650..d8e8b8c280 100644
--- a/tests/table/test_upsert.py
+++ b/tests/table/test_upsert.py
@@ -371,6 +371,42 @@ def test_upsert_with_identifier_fields(catalog: Catalog) -> None:
     assert upd.rows_inserted == 1
 
 
+def test_upsert_into_empty_table(catalog: Catalog) -> None:
+    identifier = "default.test_upsert_into_empty_table"
+    _drop_table(catalog, identifier)
+
+    schema = Schema(
+        NestedField(1, "city", StringType(), required=True),
+        NestedField(2, "inhabitants", IntegerType(), required=True),
+        # Mark City as the identifier field, also known as the primary-key
+        identifier_field_ids=[1],
+    )
+
+    tbl = catalog.create_table(identifier, schema=schema)
+
+    arrow_schema = pa.schema(
+        [
+            pa.field("city", pa.string(), nullable=False),
+            pa.field("inhabitants", pa.int32(), nullable=False),
+        ]
+    )
+
+    # Write some data
+    df = pa.Table.from_pylist(
+        [
+            {"city": "Amsterdam", "inhabitants": 921402},
+            {"city": "San Francisco", "inhabitants": 808988},
+            {"city": "Drachten", "inhabitants": 45019},
+            {"city": "Paris", "inhabitants": 2103000},
+        ],
+        schema=arrow_schema,
+    )
+    upd = tbl.upsert(df)
+
+    assert upd.rows_updated == 0
+    assert upd.rows_inserted == 4
+
+
 def test_create_match_filter_single_condition() -> None:
     """
     Test create_match_filter with a composite key where the source yields exactly one unique key.