From 47bfeb99da5a69749725f2e5a89b56b8f5a7d222 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 23 Nov 2023 02:09:28 +0000 Subject: [PATCH 1/8] feat: support `Series.dot` on a DataFrame input --- bigframes/series.py | 11 +++++++++++ tests/system/small/test_series.py | 25 +++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/bigframes/series.py b/bigframes/series.py index c929775a00..4d1ca0312d 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -680,6 +680,17 @@ def rdivmod(self, other) -> Tuple[Series, Series]: # type: ignore return (self.rfloordiv(other), self.rmod(other)) def __matmul__(self, other): + if isinstance(other, bigframes.dataframe.DataFrame): + return Series( + [ + pandas.NA if other[col].isna().any() else (self * other[col]).sum() + for col in other.columns + ], + index=other.columns, + name=self.name, + ) + + # At this point other must be a Series return (self * other).sum() dot = __matmul__ diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index d9fc23fad0..bfff33e5bd 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -22,6 +22,7 @@ import pyarrow as pa # type: ignore import pytest +import bigframes.dataframe as dataframe import bigframes.pandas import bigframes.series as series from tests.system.utils import assert_pandas_df_equal, assert_series_equal @@ -2266,6 +2267,30 @@ def test_dot(scalars_dfs): assert bf_result == pd_result +def test_dot_df(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_too"] @ scalars_df[["int64_col", "int64_too"]] + pd_result = ( + scalars_pandas_df["int64_too"] @ scalars_pandas_df[["int64_col", "int64_too"]] + ) + + pd.testing.assert_series_equal( + bf_result.to_pandas(), pd_result, check_index_type=False, check_dtype=False + ) + + +def test_dot_df_inline(scalars_dfs): + left = [10, 11, 12, 13] # series data + right = [[0, 1], [-2, 3], [4, -5], [6, 7]] # dataframe data + + bf_result = series.Series(left) @ dataframe.DataFrame(right) + pd_result = pd.Series(left) @ pd.DataFrame(right) + + pd.testing.assert_series_equal( + bf_result.to_pandas(), pd_result, check_index_type=False, check_dtype=False + ) + + @pytest.mark.parametrize( ("left", "right", "inclusive"), [ From 594c587f4e664b5abcd7c7703109e84552bbd5ed Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 27 Nov 2023 22:54:47 +0000 Subject: [PATCH 2/8] add column multi index test for series.dot(df), refactoring --- tests/system/small/test_multiindex.py | 18 ++++++++++++++++++ tests/system/small/test_series.py | 22 +++++++++------------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index e7e93849c6..b5a024db2b 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -1045,3 +1045,21 @@ def test_column_multi_index_dot_not_supported(): NotImplementedError, match="Multi-level column input is not supported" ): bf1 @ bf2 + + +def test_series_dot_df_column_multi_index(): + left = [10, 11, 12, 13] # series data + right = [[0, 1, 2], [-2, 3, -4], [4, -5, 6], [6, 7, -8]] # dataframe data + + multi_level_columns = pandas.MultiIndex.from_arrays( + [["col0", "col0", "col1"], ["col00", "col01", "col11"]] + ) + + bf_result = bpd.Series(left) @ bpd.DataFrame(right, columns=multi_level_columns) + pd_result = pandas.Series(left) @ pandas.DataFrame( + right, columns=multi_level_columns + ) + + pandas.testing.assert_series_equal( + bf_result.to_pandas(), pd_result, check_index_type=False, check_dtype=False + ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index bfff33e5bd..c5664a7981 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -22,7 +22,6 @@ import pyarrow as pa # type: ignore import pytest -import bigframes.dataframe as dataframe import bigframes.pandas import bigframes.series as series from tests.system.utils import assert_pandas_df_equal, assert_series_equal @@ -2267,24 +2266,21 @@ def test_dot(scalars_dfs): assert bf_result == pd_result -def test_dot_df(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df["int64_too"] @ scalars_df[["int64_col", "int64_too"]] - pd_result = ( - scalars_pandas_df["int64_too"] @ scalars_pandas_df[["int64_col", "int64_too"]] - ) +def test_dot_df(matrix_3by4_df, matrix_3by4_pandas_df): + bf_result = matrix_3by4_df["w"] @ matrix_3by4_df + pd_result = matrix_3by4_pandas_df["w"] @ matrix_3by4_pandas_df pd.testing.assert_series_equal( bf_result.to_pandas(), pd_result, check_index_type=False, check_dtype=False ) -def test_dot_df_inline(scalars_dfs): - left = [10, 11, 12, 13] # series data - right = [[0, 1], [-2, 3], [4, -5], [6, 7]] # dataframe data - - bf_result = series.Series(left) @ dataframe.DataFrame(right) - pd_result = pd.Series(left) @ pd.DataFrame(right) +def test_dot_df_with_na(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_too"] @ scalars_df[["int64_col", "int64_too"]] + pd_result = ( + scalars_pandas_df["int64_too"] @ scalars_pandas_df[["int64_col", "int64_too"]] + ) pd.testing.assert_series_equal( bf_result.to_pandas(), pd_result, check_index_type=False, check_dtype=False From f794ff698e935759be5a04c8b2bb32cdf0837026 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 27 Nov 2023 23:15:20 +0000 Subject: [PATCH 3/8] update Series.dot docstring with DataFrame support --- .../bigframes_vendored/pandas/core/series.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 1b751ed83b..4da167ef02 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -620,8 +620,7 @@ def dot(self, other) -> Series | np.ndarray: Compute the dot product between the Series and the columns of other. This method computes the dot product between the Series and another - one, or the Series and each columns of a DataFrame, or the Series and - each columns of an array. + one, or the Series and each columns of a DataFrame. It can also be called using `self @ other` in Python >= 3.5. @@ -646,15 +645,25 @@ def dot(self, other) -> Series | np.ndarray: >>> s @ other 8 + The other operand can be a DataFrame: + + >>> other = bpd.DataFrame({"a" : [-1, 2, -3, 4], + ... "b" : [-10, 20, -30, 40], + ... "c" : [-1, 2, -3, bpd.NA]}) + >>> s @ other + a 8.0 + b 80.0 + c + dtype: Float64 + Args: - other (Series): + other (Series, or DataFrame): The other object to compute the dot product with its columns. Returns: - scalar, Series or numpy.ndarray: Return the dot product of the Series - and other if other is a Series, the Series of the dot product of - Series and each rows of other if other is a DataFrame or a - numpy.ndarray between the Series and each columns of the numpy array. + scalar, Series: Return the dot product of the Series + and other if other is a Series, or the Series of the dot product + of Series and each column of other if other is a DataFrame. """ From e41e83d2590c0932d415b2ca6cfaf571df5a88ea Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 29 Nov 2023 07:11:07 +0000 Subject: [PATCH 4/8] optimize Series.dot for dataframe with single level columns --- bigframes/series.py | 24 ++++++++++++++++++++---- tests/system/small/test_multiindex.py | 13 +++++++++---- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 4d1ca0312d..451049be7d 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -680,8 +680,25 @@ def rdivmod(self, other) -> Tuple[Series, Series]: # type: ignore return (self.rfloordiv(other), self.rmod(other)) def __matmul__(self, other): - if isinstance(other, bigframes.dataframe.DataFrame): - return Series( + if isinstance(other, Series): + return (self * other).sum() + + # At this point other must be a DataFrame + if len(other.columns.names) == 1: + # Single level columns in other + na_df = other.isna().any() + mul_df = Series( + [(self * other[col]).sum() for col in other.columns], + index=other.columns, + name=self.name, + ) + result = mul_df.mask(na_df) + else: + # Multi level columns in other + # TODO(b/313747368): Remove this once DataFrame.any() honors + # multi-level index, as the logic in the if clause should generalize + # for multi-level columns in other + result = Series( [ pandas.NA if other[col].isna().any() else (self * other[col]).sum() for col in other.columns @@ -690,8 +707,7 @@ def __matmul__(self, other): name=self.name, ) - # At this point other must be a Series - return (self * other).sum() + return result dot = __matmul__ diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index b5a024db2b..3410c39e74 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -1055,10 +1055,15 @@ def test_series_dot_df_column_multi_index(): [["col0", "col0", "col1"], ["col00", "col01", "col11"]] ) - bf_result = bpd.Series(left) @ bpd.DataFrame(right, columns=multi_level_columns) - pd_result = pandas.Series(left) @ pandas.DataFrame( - right, columns=multi_level_columns - ) + bf_left_s = bpd.Series(left) + bf_right_df = bpd.DataFrame(right) + bf_right_df.columns = multi_level_columns + bf_result = bf_left_s @ bf_right_df + + pd_left_s = pandas.Series(left) + pd_right_df = pandas.DataFrame(right) + pd_right_df.columns = multi_level_columns + pd_result = pd_left_s @ pd_right_df pandas.testing.assert_series_equal( bf_result.to_pandas(), pd_result, check_index_type=False, check_dtype=False From 403b66f5ad2394a62fd3e02c536d79516e92ea29 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Sun, 27 Jul 2025 08:56:30 +0000 Subject: [PATCH 5/8] post merge edits --- bigframes/series.py | 4 +++- tests/system/small/test_series.py | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 4adffc27e2..2729dd82c6 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1193,7 +1193,8 @@ def rdivmod(self, other) -> Tuple[Series, Series]: # type: ignore return (self.rfloordiv(other), self.rmod(other)) def dot(self, other): - return (self * other).sum() + if isinstance(other, Series): + return (self * other).sum() # At this point other must be a DataFrame if len(other.columns.names) == 1: @@ -1203,6 +1204,7 @@ def dot(self, other): [(self * other[col]).sum() for col in other.columns], index=other.columns, name=self.name, + session=self._session, ) result = mul_df.mask(na_df) else: diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 6595f9d268..4f12ae9ae1 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3187,7 +3187,11 @@ def test_dot_df_with_na(scalars_dfs): ) pd.testing.assert_series_equal( - bf_result.to_pandas(), pd_result, check_index_type=False, check_dtype=False + bf_result.to_pandas(), + pd_result, + check_index_type=False, + check_dtype=False, + check_exact=False, ) From c82e55386367dd8982c4c4425cb3c399772b743c Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 28 Jul 2025 10:24:18 +0000 Subject: [PATCH 6/8] use DataFrame.dot --- bigframes/series.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 2729dd82c6..211cb598f9 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1198,20 +1198,17 @@ def dot(self, other): # At this point other must be a DataFrame if len(other.columns.names) == 1: - # Single level columns in other - na_df = other.isna().any() - mul_df = Series( - [(self * other[col]).sum() for col in other.columns], - index=other.columns, - name=self.name, - session=self._session, - ) - result = mul_df.mask(na_df) + # Process single level columns in other + # Let's leverage the DataFrame.dot + na_mask = other.isna().any() + self_as_row = self.to_frame().T + frame_dot_result_as_row = self_as_row.dot(other) + frame_dot_result_as_col = frame_dot_result_as_row.T + series_dot_result = frame_dot_result_as_col[self.name] + result = series_dot_result.mask(na_mask) else: - # Multi level columns in other - # TODO(b/313747368): Remove this once DataFrame.any() honors - # multi-level index, as the logic in the if clause should generalize - # for multi-level columns in other + # TODO: Remove this special code path after DataFrame.dot supports + # multi-level columns. result = Series( [ pandas.NA if other[col].isna().any() else (self * other[col]).sum() From f49385a2ff109e21f31b045b95f4effa00d7542a Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 28 Jul 2025 22:44:51 +0000 Subject: [PATCH 7/8] handle unnamed series input --- bigframes/series.py | 15 ++++++++++++--- tests/system/small/test_series.py | 23 +++++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 211cb598f9..03bbcbab75 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -53,6 +53,7 @@ import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.expression as ex +import bigframes.core.guid as guid import bigframes.core.indexers import bigframes.core.indexes as indexes import bigframes.core.ordering as order @@ -1200,12 +1201,20 @@ def dot(self, other): if len(other.columns.names) == 1: # Process single level columns in other # Let's leverage the DataFrame.dot - na_mask = other.isna().any() - self_as_row = self.to_frame().T + self_named = self + if self_named.name is None: + self_named = self.copy() + self_named.name = guid.generate_guid() + + self_as_row = self_named.to_frame().T frame_dot_result_as_row = self_as_row.dot(other) frame_dot_result_as_col = frame_dot_result_as_row.T - series_dot_result = frame_dot_result_as_col[self.name] + series_dot_result = frame_dot_result_as_col[self_named.name] + + # take care of the NA values + na_mask = other.isna().any() result = series_dot_result.mask(na_mask) + result.name = self.name else: # TODO: Remove this special code path after DataFrame.dot supports # multi-level columns. diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 4f12ae9ae1..44b4bbfdc4 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3195,6 +3195,29 @@ def test_dot_df_with_na(scalars_dfs): ) +def test_dot_df_unnamed(session): + ps = pd.Series([0, 1, 2, 3]) + assert ps.name is None # this is the scenario we are testing specifically + + pdf = pd.DataFrame( + {"a": [-1, 2, -3, 4], "b": [-10, 20, -30, 40], "c": [-1, 2, -3, pd.NA]} + ) + + s = session.read_pandas(ps) + df = session.read_pandas(pdf) + + pd_result = ps @ pdf + bf_result = s @ df + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + check_index_type=False, + check_dtype=False, + check_exact=False, + ) + + @pytest.mark.parametrize( ("left", "right", "inclusive"), [ From a139e663c15b63c4160cd488df63a81a236d31f6 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 28 Jul 2025 23:34:27 +0000 Subject: [PATCH 8/8] adjust dot doctest --- third_party/bigframes_vendored/pandas/core/series.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 075e18cbc2..e9015a01ae 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1480,10 +1480,10 @@ def dot(self, other) -> Series | np.ndarray: ... "b" : [-10, 20, -30, 40], ... "c" : [-1, 2, -3, bpd.NA]}) >>> s @ other - a 8.0 - b 80.0 + a 8 + b 80 c - dtype: Float64 + dtype: Int64 Args: other (Series, or DataFrame):