@@ -2098,3 +2098,174 @@ def test_branch_py_write_spark_read(session_catalog: Catalog, spark: SparkSessio
    )
    assert main_df.count() == 3
    assert branch_df.count() == 2


@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_stage_only_delete(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    identifier = f"default.test_stage_only_delete_files_v{format_version}"
    iceberg_spec = PartitionSpec(
        *[PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="integer_partition")]
    )
    tbl = _create_table(
        session_catalog, identifier, {"format-version": str(format_version)}, [arrow_table_with_null], iceberg_spec
    )

    current_snapshot = tbl.metadata.current_snapshot_id
    assert current_snapshot is not None

    original_count = len(tbl.scan().to_arrow())
    assert original_count == 3

    files_to_delete = []
    for file_task in tbl.scan().plan_files():
        files_to_delete.append(file_task.file)
    assert len(files_to_delete) > 0

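    # With stage_only=True the delete snapshot is written and added to the table
    # metadata, but the main branch reference is not advanced, so readers still
    # see the original three rows (asserted below).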
    with tbl.transaction() as txn:
        with txn.update_snapshot(stage_only=True).delete() as delete:
            delete.delete_by_predicate(EqualTo("int", 9))

    # a new delete snapshot is added
    snapshots = tbl.snapshots()
    assert len(snapshots) == 2

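    # Cross-check via Spark: the snapshots metadata table should list both the
    # original append and the staged delete, even though main still points at
    # the first snapshot.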
    rows = spark.sql(
        f"""
        SELECT operation, summary
        FROM {identifier}.snapshots
        ORDER BY committed_at ASC
        """
    ).collect()
    operations = [row.operation for row in rows]
    assert operations == ["append", "delete"]

    # snapshot main ref has not changed
    assert current_snapshot == tbl.metadata.current_snapshot_id
    assert len(tbl.scan().to_arrow()) == original_count


@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_stage_only_fast_append(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    identifier = f"default.test_stage_only_fast_append_files_v{format_version}"
    tbl = _create_table(session_catalog, identifier, {"format-version": str(format_version)}, [arrow_table_with_null])

    current_snapshot = tbl.metadata.current_snapshot_id
    assert current_snapshot is not None

    original_count = len(tbl.scan().to_arrow())
    assert original_count == 3

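    # Stage an append without publishing it to main: _dataframe_to_data_files
    # writes the Arrow table out as new data files, and fast_append adds them to
    # a staged snapshot without rewriting existing manifests.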
    with tbl.transaction() as txn:
        with txn.update_snapshot(stage_only=True).fast_append() as fast_append:
            for data_file in _dataframe_to_data_files(
                table_metadata=txn.table_metadata, df=arrow_table_with_null, io=txn._table.io
            ):
                fast_append.append_data_file(data_file=data_file)

    # Main ref has not changed and data is not yet appended
    assert current_snapshot == tbl.metadata.current_snapshot_id
    assert len(tbl.scan().to_arrow()) == original_count

    # There should be a new staged snapshot
    snapshots = tbl.snapshots()
    assert len(snapshots) == 2

    rows = spark.sql(
        f"""
        SELECT operation, summary
        FROM {identifier}.snapshots
        ORDER BY committed_at ASC
        """
    ).collect()
    operations = [row.operation for row in rows]
    assert operations == ["append", "append"]


@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_stage_only_merge_append(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    identifier = f"default.test_stage_only_merge_append_files_v{format_version}"
    tbl = _create_table(session_catalog, identifier, {"format-version": str(format_version)}, [arrow_table_with_null])

    current_snapshot = tbl.metadata.current_snapshot_id
    assert current_snapshot is not None

    original_count = len(tbl.scan().to_arrow())
    assert original_count == 3

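    # Same staging flow as fast_append above, but via merge_append, which may
    # also merge small existing manifests while building the staged snapshot.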
    with tbl.transaction() as txn:
        with txn.update_snapshot(stage_only=True).merge_append() as merge_append:
            for data_file in _dataframe_to_data_files(
                table_metadata=txn.table_metadata, df=arrow_table_with_null, io=txn._table.io
            ):
                merge_append.append_data_file(data_file=data_file)

    # Main ref has not changed and data is not yet appended
    assert current_snapshot == tbl.metadata.current_snapshot_id
    assert len(tbl.scan().to_arrow()) == original_count

    # There should be a new staged snapshot
    snapshots = tbl.snapshots()
    assert len(snapshots) == 2

    rows = spark.sql(
        f"""
        SELECT operation, summary
        FROM {identifier}.snapshots
        ORDER BY committed_at ASC
        """
    ).collect()
    operations = [row.operation for row in rows]
    assert operations == ["append", "append"]


@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_stage_only_overwrite_files(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    identifier = f"default.test_stage_only_overwrite_files_v{format_version}"
    tbl = _create_table(session_catalog, identifier, {"format-version": str(format_version)}, [arrow_table_with_null])

    current_snapshot = tbl.metadata.current_snapshot_id
    assert current_snapshot is not None

    original_count = len(tbl.scan().to_arrow())
    assert original_count == 3

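    # Plan the current scan to collect the table's existing data files; the
    # staged overwrite below drops one of them.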
    files_to_delete = []
    for file_task in tbl.scan().plan_files():
        files_to_delete.append(file_task.file)
    assert len(files_to_delete) > 0

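    # Stage an overwrite that appends fresh data files and removes one original
    # file; main is untouched, so the scan below still returns the original rows.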
    with tbl.transaction() as txn:
        with txn.update_snapshot(stage_only=True).overwrite() as overwrite:
            for data_file in _dataframe_to_data_files(
                table_metadata=txn.table_metadata, df=arrow_table_with_null, io=txn._table.io
            ):
                overwrite.append_data_file(data_file=data_file)
            overwrite.delete_data_file(files_to_delete[0])

    assert current_snapshot == tbl.metadata.current_snapshot_id
    assert len(tbl.scan().to_arrow()) == original_count

    snapshots = tbl.snapshots()
    assert len(snapshots) == 2

    rows = spark.sql(
        f"""
        SELECT operation, summary
        FROM {identifier}.snapshots
        ORDER BY committed_at ASC
        """
    ).collect()
    operations = [row.operation for row in rows]
    assert operations == ["append", "overwrite"]