diff --git a/bigframes/series.py b/bigframes/series.py index 3a1af0bb1d..03bbcbab75 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -53,6 +53,7 @@ import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.expression as ex +import bigframes.core.guid as guid import bigframes.core.indexers import bigframes.core.indexes as indexes import bigframes.core.ordering as order @@ -1193,7 +1194,40 @@ def rdivmod(self, other) -> Tuple[Series, Series]: # type: ignore return (self.rfloordiv(other), self.rmod(other)) def dot(self, other): - return (self * other).sum() + if isinstance(other, Series): + return (self * other).sum() + + # At this point other must be a DataFrame + if len(other.columns.names) == 1: + # Process single level columns in other + # Let's leverage the DataFrame.dot + self_named = self + if self_named.name is None: + self_named = self.copy() + self_named.name = guid.generate_guid() + + self_as_row = self_named.to_frame().T + frame_dot_result_as_row = self_as_row.dot(other) + frame_dot_result_as_col = frame_dot_result_as_row.T + series_dot_result = frame_dot_result_as_col[self_named.name] + + # take care of the NA values + na_mask = other.isna().any() + result = series_dot_result.mask(na_mask) + result.name = self.name + else: + # TODO: Remove this special code path after DataFrame.dot supports + # multi-level columns. 
+ result = Series( + [ + pandas.NA if other[col].isna().any() else (self * other[col]).sum() + for col in other.columns + ], + index=other.columns, + name=self.name, + ) + + return result def __matmul__(self, other): return self.dot(other) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 13b5b1886f..80f38267f2 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -1411,3 +1411,26 @@ def test_multi_index_contains(scalars_df_index, scalars_pandas_df_index, key): pd_result = key in scalars_pandas_df_index.set_index(col_name).index assert bf_result == pd_result + + +def test_series_dot_df_column_multi_index(): + left = [10, 11, 12, 13] # series data + right = [[0, 1, 2], [-2, 3, -4], [4, -5, 6], [6, 7, -8]] # dataframe data + + multi_level_columns = pandas.MultiIndex.from_arrays( + [["col0", "col0", "col1"], ["col00", "col01", "col11"]] + ) + + bf_left_s = bpd.Series(left) + bf_right_df = bpd.DataFrame(right) + bf_right_df.columns = multi_level_columns + bf_result = bf_left_s @ bf_right_df + + pd_left_s = pandas.Series(left) + pd_right_df = pandas.DataFrame(right) + pd_right_df.columns = multi_level_columns + pd_result = pd_left_s @ pd_right_df + + pandas.testing.assert_series_equal( + bf_result.to_pandas(), pd_result, check_index_type=False, check_dtype=False + ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 3f64234293..44b4bbfdc4 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3170,6 +3170,54 @@ def test_dot(scalars_dfs): assert bf_result == pd_result +def test_dot_df(matrix_3by4_df, matrix_3by4_pandas_df): + bf_result = matrix_3by4_df["w"] @ matrix_3by4_df + pd_result = matrix_3by4_pandas_df["w"] @ matrix_3by4_pandas_df + + pd.testing.assert_series_equal( + bf_result.to_pandas(), pd_result, check_index_type=False, check_dtype=False + ) + + +def test_dot_df_with_na(scalars_dfs): + scalars_df, 
scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_too"] @ scalars_df[["int64_col", "int64_too"]] + pd_result = ( + scalars_pandas_df["int64_too"] @ scalars_pandas_df[["int64_col", "int64_too"]] + ) + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + check_index_type=False, + check_dtype=False, + check_exact=False, + ) + + +def test_dot_df_unnamed(session): + ps = pd.Series([0, 1, 2, 3]) + assert ps.name is None # this is the scenario we are testing specifically + + pdf = pd.DataFrame( + {"a": [-1, 2, -3, 4], "b": [-10, 20, -30, 40], "c": [-1, 2, -3, pd.NA]} + ) + + s = session.read_pandas(ps) + df = session.read_pandas(pdf) + + pd_result = ps @ pdf + bf_result = s @ df + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + check_index_type=False, + check_dtype=False, + check_exact=False, + ) + + @pytest.mark.parametrize( ("left", "right", "inclusive"), [ diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 0160a7eb50..e9015a01ae 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -1449,8 +1449,7 @@ def dot(self, other) -> Series | np.ndarray: Compute the dot product between the Series and the columns of other. This method computes the dot product between the Series and another - one, or the Series and each columns of a DataFrame, or the Series and - each columns of an array. + one, or the Series and each column of a DataFrame. It can also be called using `self @ other` in Python >= 3.5. @@ -1475,8 +1474,19 @@ def dot(self, other) -> Series | np.ndarray: >>> s @ other np.int64(8) + The other operand can be a DataFrame: + + >>> other = bpd.DataFrame({"a" : [-1, 2, -3, 4], + ... "b" : [-10, 20, -30, 40], + ... 
"c" : [-1, 2, -3, bpd.NA]}) + >>> s @ other + a 8 + b 80 + c <NA> + dtype: Int64 + Args: - other (Series): + other (Series or DataFrame): The other object to compute the dot product with its columns. Returns: