test: restore original udf in the remote function test, use supported type in the test data (#996)

chelsea-lin · shobsi · web-flow · commit a70d683c909d · 2024-10-07T13:57:44.000-07:00
* test: fix test_df_apply_axis_1_complex by converting numpy values * undo all changes * improve numpy value handling in gcf code * re-enable the multiindex axis=1 test * nit: reword comment * Revert "improve numpy value handling in gcf code" This reverts commit f549683. --------- Co-authored-by: Shobhit Singh <shobs@google.com>
diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py
@@ -1670,7 +1670,11 @@ def analyze(row):
                     (3, 4): ["pq", "rs", "tu"],
                     (5.0, "six", 7): [8, 9, 10],
                     'raise Exception("hacked!")': [11, 12, 13],
-                }
+                },
+                # The default pandas index has a non-numpy type, whereas
+                # bigframes always uses a numpy-based type, so use an index
+                # that is compatible with bigframes. See b/369689696 for details.
+                index=pandas.Index([0, 1, 2], dtype=pandas.Int64Dtype()),
             ),
             id="all-kinds-of-column-names",
         ),
@@ -1681,17 +1685,22 @@ def analyze(row):
                     "y": [1.5, 3.75, 5],
                     "z": ["pq", "rs", "tu"],
                 },
-                index=pandas.MultiIndex.from_tuples(
-                    [
-                        ("a", 100),
-                        ("a", 200),
-                        ("b", 300),
-                    ]
+                index=pandas.MultiIndex.from_frame(
+                    pandas.DataFrame(
+                        {
+                            "idx0": pandas.Series(
+                                ["a", "a", "b"], dtype=pandas.StringDtype()
+                            ),
+                            "idx1": pandas.Series(
+                                [100, 200, 300], dtype=pandas.Int64Dtype()
+                            ),
+                        }
+                    )
                 ),
             ),
             id="multiindex",
             marks=pytest.mark.skip(
-                reason="TODO(b/368639580) revert this skip after fix"
+                reason="TODO: revert this skip after this pandas bug is fixed: https://github.com/pandas-dev/pandas/issues/59908"
             ),
         ),
         pytest.param(
@@ -1701,6 +1710,10 @@ def analyze(row):
                     [20, 3.75, "rs"],
                     [30, 8.0, "tu"],
                 ],
+                # The default pandas index has a non-numpy type, whereas
+                # bigframes always uses a numpy-based type, so use an index
+                # that is compatible with bigframes. See b/369689696 for details.
+                index=pandas.Index([0, 1, 2], dtype=pandas.Int64Dtype()),
                 columns=pandas.MultiIndex.from_arrays(
                     [
                         ["first", "last_two", "last_two"],
@@ -1729,10 +1742,8 @@ def test_df_apply_axis_1_complex(session, pd_df):
 
         def serialize_row(row):
             custom = {
-                "name": row.name.item() if hasattr(row.name, "item") else row.name,
-                "index": [
-                    idx.item() if hasattr(idx, "item") else idx for idx in row.index
-                ],
+                "name": row.name,
+                "index": [idx for idx in row.index],
                 "values": [
                     val.item() if hasattr(val, "item") else val for val in row.values
                 ],
@@ -1756,12 +1767,7 @@ def serialize_row(row):
         bf_result = bf_df.apply(serialize_row_remote, axis=1).to_pandas()
         pd_result = pd_df.apply(serialize_row, axis=1)
 
-        # bf_result.dtype is 'string[pyarrow]' while pd_result.dtype is 'object'
-        # , ignore this mismatch by using check_dtype=False.
-        #
-        # bf_result.index[0].dtype is 'string[pyarrow]' while
-        # pd_result.index[0].dtype is 'object', ignore this mismatch by using
-        # check_index_type=False.
+        # Ignore the known dtype differences between pandas and bigframes.
         pandas.testing.assert_series_equal(
             pd_result, bf_result, check_dtype=False, check_index_type=False
         )