From 91cecfd4fe8fbaca009b14fe82454c42832f83bd Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Tue, 16 Sep 2025 23:23:48 +0000 Subject: [PATCH 1/2] fix: Transformers with non-standard column names through errors --- bigframes/ml/compose.py | 4 +-- bigframes/ml/impute.py | 2 ++ bigframes/ml/preprocessing.py | 8 ++++++ tests/system/small/ml/test_preprocessing.py | 32 +++++++++++++++++++++ 4 files changed, 44 insertions(+), 2 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 46d40d5fc8..92c98695cd 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -29,6 +29,7 @@ from bigframes.core import log_adapter import bigframes.core.compile.googlesql as sql_utils +import bigframes.core.utils as core_utils from bigframes.ml import base, core, globals, impute, preprocessing, utils import bigframes.pandas as bpd @@ -103,13 +104,12 @@ def __init__(self, sql: str, target_column: str = "transformed_{0}"): # TODO: More robust unescaping self._target_column = target_column.replace("`", "") - PLAIN_COLNAME_RX = re.compile("^[a-z][a-z0-9_]*$", re.IGNORECASE) - def _compile_to_sql( self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None ) -> List[str]: if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) result = [] for column in columns: current_sql = self._sql.format(sql_utils.identifier(column)) diff --git a/bigframes/ml/impute.py b/bigframes/ml/impute.py index f19c8e2cd3..818151a4f9 100644 --- a/bigframes/ml/impute.py +++ b/bigframes/ml/impute.py @@ -23,6 +23,7 @@ import bigframes_vendored.sklearn.impute._base from bigframes.core import log_adapter +import bigframes.core.utils as core_utils from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd @@ -62,6 +63,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) return [ self._base_sql_generator.ml_imputer( column, self.strategy, f"imputer_{column}" diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 2e8dc64a53..94c61674f6 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -27,6 +27,7 @@ import bigframes_vendored.sklearn.preprocessing._polynomial from bigframes.core import log_adapter +import bigframes.core.utils as core_utils from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd @@ -59,6 +60,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) return [ self._base_sql_generator.ml_standard_scaler( column, f"standard_scaled_{column}" @@ -136,6 +138,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) return [ self._base_sql_generator.ml_max_abs_scaler( column, f"max_abs_scaled_{column}" @@ -214,6 +217,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) return [ self._base_sql_generator.ml_min_max_scaler( column, f"min_max_scaled_{column}" @@ -304,6 +308,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) array_split_points = {} if self.strategy == "uniform": for column in columns: @@ -433,6 +438,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) drop = self.drop if self.drop is not None else "none" # minus one here since BQML's implementation always includes index 0, and top_k is on top of that. top_k = ( @@ -547,6 +553,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) # minus one here since BQML's inplimentation always includes index 0, and top_k is on top of that. top_k = ( @@ -644,6 +651,7 @@ def _compile_to_sql( Returns: a list of tuples sql_expr.""" if columns is None: columns = X.columns + columns, _ = core_utils.get_standardized_ids(columns) output_name = "poly_feat" return [ self._base_sql_generator.ml_polynomial_expand( diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 65a851efc3..cca9cda2f5 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -19,6 +19,7 @@ import bigframes.features from bigframes.ml import preprocessing +import bigframes.pandas as bpd from bigframes.testing import utils ONE_HOT_ENCODED_DTYPE = ( @@ -114,6 +115,37 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui pd.testing.assert_frame_equal(result, expected, rtol=0.1) +def test_standard_scaler_normalizeds_non_standard_column_names( + new_penguins_df: bpd.DataFrame, +): + new_penguins_df = new_penguins_df.rename( + columns={ + "culmen_length_mm": "culmen?metric", + "culmen_depth_mm": "culmen/metric", + } + ) + scaler = preprocessing.StandardScaler() + result = scaler.fit_transform( + new_penguins_df[["culmen?metric", "culmen/metric", "flipper_length_mm"]] + ).to_pandas() + + # If standard-scaled correctly, mean should be 0.0 + for column in result.columns: + assert math.isclose(result[column].mean(), 0.0, abs_tol=1e-3) + + expected = pd.DataFrame( + { + "standard_scaled_culmen_metric": [1.313249, -0.20198, -1.111118], + "standard_scaled_culmen_metric_1": [1.17072, -1.272416, 0.101848], + "standard_scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=0.1) + + def test_standard_scaler_save_load(new_penguins_df, dataset_id): transformer = preprocessing.StandardScaler() transformer.fit( From 532db607b003ed77e6f8651bf67514c31985e124 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Wed, 17 Sep 2025 17:44:14 +0000 Subject: [PATCH 2/2] fix --- tests/system/small/ml/test_preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index cca9cda2f5..3280b16f42 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -63,7 +63,7 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): pd.testing.assert_frame_equal(result, expected, rtol=0.1) -def test_standard_scaler_normalizeds_fit_transform(new_penguins_df): +def test_standard_scaler_normalizes_fit_transform(new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. scaler = preprocessing.StandardScaler() result = scaler.fit_transform( @@ -115,7 +115,7 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui pd.testing.assert_frame_equal(result, expected, rtol=0.1) -def test_standard_scaler_normalizeds_non_standard_column_names( +def test_standard_scaler_normalizes_non_standard_column_names( new_penguins_df: bpd.DataFrame, ): new_penguins_df = new_penguins_df.rename(