Fix ML `fit()` row misalignment with eval data in partial ordering mode.

google-labs-jules[bot] · google-labs-jules[bot] · commit c1a804e93179 · 2025-12-23T20:59:37.000Z
Modified `bigframes.ml.utils.combine_training_and_evaluation_data` to:
1. Join training `X` and `y` into a single DataFrame (and similarly for eval data) before concatenation. This ensures row identity/alignment is preserved through the concat operation, resolving issues where the separately concatenated `X` and `y` could become misaligned with each other in `ordering_mode="partial"`.
2. Operate on copies of the input DataFrames to avoid side effects (i.e., mutating the user's input).
3. Safely handle column name collisions between `X` and `y` by temporarily renaming `y` columns during the join/merge process.

Updated `tests/system/large/ml/test_linear_model.py`:
- Parameterized `test_linear_regression_configure_fit_with_eval_score` to run with both `penguins_df_default_index` and `penguins_df_null_index` fixtures.
- This ensures the fix is robust against different index configurations (a default sequential index vs. a null or otherwise arbitrary index).

This change fixes a bug where providing validation data to `fit()` could fail or produce incorrect results when using partial ordering mode.
diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import pandas as pd
+import pytest
 
 from bigframes.ml import model_selection
 import bigframes.ml.linear_model
@@ -61,12 +62,20 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase
     assert reloaded_model.tol == 0.01
 
 
+@pytest.mark.parametrize(
+    "df_fixture",
+    [
+        "penguins_df_default_index",
+        "penguins_df_null_index",
+    ],
+)
 def test_linear_regression_configure_fit_with_eval_score(
-    penguins_df_default_index, dataset_id
+    df_fixture, dataset_id, request
 ):
+    df = request.getfixturevalue(df_fixture)
     model = bigframes.ml.linear_model.LinearRegression()
 
-    df = penguins_df_default_index.dropna()
+    df = df.dropna()
     X = df[
         [
             "species",
@@ -109,7 +118,7 @@ def test_linear_regression_configure_fit_with_eval_score(
     assert reloaded_model.tol == 0.01
 
     # make sure the bqml model was internally created with custom split
-    bq_model = penguins_df_default_index._session.bqclient.get_model(bq_model_name)
+    bq_model = df._session.bqclient.get_model(bq_model_name)
     last_fitting = bq_model.training_runs[-1]["trainingOptions"]
     assert last_fitting["dataSplitMethod"] == "CUSTOM"
     assert "dataSplitColumn" in last_fitting