Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 35 additions & 1 deletion bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
import bigframes.core.block_transforms as block_ops
import bigframes.core.blocks as blocks
import bigframes.core.expression as ex
import bigframes.core.guid as guid
import bigframes.core.indexers
import bigframes.core.indexes as indexes
import bigframes.core.ordering as order
Expand Down Expand Up @@ -1193,7 +1194,40 @@ def rdivmod(self, other) -> Tuple[Series, Series]: # type: ignore
return (self.rfloordiv(other), self.rmod(other))

def dot(self, other):
    """Compute the dot product between this Series and ``other``.

    Args:
        other (Series or DataFrame):
            The right-hand operand. When it is a DataFrame, the dot
            product is computed against each of its columns. Any operand
            that does not expose a ``columns`` attribute (a Series, a
            scalar, an array-like) is multiplied element-wise with this
            Series and the product is summed.

    Returns:
        The summed element-wise product when ``other`` is not a
        DataFrame. Otherwise a Series labelled by ``other.columns`` and
        named after this Series, holding NA for every column of ``other``
        that contains an NA value.
    """
    # Only DataFrame operands are handled column by column. Everything
    # else keeps the plain "multiply, then sum" behaviour instead of
    # failing on the ``other.columns`` lookups below.
    if not hasattr(other, "columns") or isinstance(other, Series):
        return (self * other).sum()

    if other.columns.nlevels == 1:
        # Single-level columns: delegate to DataFrame.dot by turning this
        # Series into a one-row frame. The Series name becomes the label
        # used to select the result afterwards, so an unnamed Series gets
        # a temporary generated name on a copy (self is left untouched).
        self_named = self
        if self_named.name is None:
            self_named = self.copy()
            self_named.name = guid.generate_guid()

        self_as_row = self_named.to_frame().T
        frame_dot_result_as_col = self_as_row.dot(other).T
        series_dot_result = frame_dot_result_as_col[self_named.name]

        # A column of `other` holding any NA value produces NA in the result.
        na_mask = other.isna().any()
        result = series_dot_result.mask(na_mask)
        # Restore the caller-visible name (drops the temporary one, if any).
        result.name = self.name
        return result

    # TODO: Remove this special code path after DataFrame.dot supports
    # multi-level columns.
    return Series(
        [
            pandas.NA if other[col].isna().any() else (self * other[col]).sum()
            for col in other.columns
        ],
        index=other.columns,
        name=self.name,
    )

def __matmul__(self, other):
    """Implement ``self @ other`` by delegating to ``self.dot(other)``."""
    product = self.dot(other)
    return product
Expand Down
23 changes: 23 additions & 0 deletions tests/system/small/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1411,3 +1411,26 @@ def test_multi_index_contains(scalars_df_index, scalars_pandas_df_index, key):
pd_result = key in scalars_pandas_df_index.set_index(col_name).index

assert bf_result == pd_result


def test_series_dot_df_column_multi_index():
    """Check ``Series @ DataFrame`` against pandas for two-level columns."""
    series_values = [10, 11, 12, 13]
    frame_rows = [[0, 1, 2], [-2, 3, -4], [4, -5, 6], [6, 7, -8]]

    outer_labels = ["col0", "col0", "col1"]
    inner_labels = ["col00", "col01", "col11"]
    column_index = pandas.MultiIndex.from_arrays([outer_labels, inner_labels])

    # Reference result computed with plain pandas.
    expected_frame = pandas.DataFrame(frame_rows)
    expected_frame.columns = column_index
    expected = pandas.Series(series_values) @ expected_frame

    # Same computation through the library under test.
    actual_frame = bpd.DataFrame(frame_rows)
    actual_frame.columns = column_index
    actual = (bpd.Series(series_values) @ actual_frame).to_pandas()

    pandas.testing.assert_series_equal(
        actual, expected, check_index_type=False, check_dtype=False
    )
48 changes: 48 additions & 0 deletions tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3170,6 +3170,54 @@ def test_dot(scalars_dfs):
assert bf_result == pd_result


def test_dot_df(matrix_3by4_df, matrix_3by4_pandas_df):
    """Check that column ``w`` matrix-multiplied by its own frame matches pandas."""
    expected = matrix_3by4_pandas_df["w"] @ matrix_3by4_pandas_df
    actual = (matrix_3by4_df["w"] @ matrix_3by4_df).to_pandas()

    # Index type and dtype are deliberately not compared.
    comparison_options = {"check_index_type": False, "check_dtype": False}
    pd.testing.assert_series_equal(actual, expected, **comparison_options)


def test_dot_df_with_na(scalars_dfs):
    """Check ``Series @ DataFrame`` against pandas on the scalars fixtures.

    NOTE(review): the test name suggests the selected columns contain NA
    values; that depends on the fixture data, which is not visible here.
    """
    bf_frame, pd_frame = scalars_dfs
    right_columns = ["int64_col", "int64_too"]

    expected = pd_frame["int64_too"] @ pd_frame[right_columns]
    actual = (bf_frame["int64_too"] @ bf_frame[right_columns]).to_pandas()

    pd.testing.assert_series_equal(
        actual,
        expected,
        check_index_type=False,
        check_dtype=False,
        check_exact=False,
    )


def test_dot_df_unnamed(session):
    """Check ``Series @ DataFrame`` against pandas when the Series has no name."""
    pandas_series = pd.Series([0, 1, 2, 3])
    # The unnamed left operand is the scenario under test.
    assert pandas_series.name is None

    frame_data = {
        "a": [-1, 2, -3, 4],
        "b": [-10, 20, -30, 40],
        "c": [-1, 2, -3, pd.NA],
    }
    pandas_frame = pd.DataFrame(frame_data)

    expected = pandas_series @ pandas_frame
    remote_product = session.read_pandas(pandas_series) @ session.read_pandas(
        pandas_frame
    )

    pd.testing.assert_series_equal(
        remote_product.to_pandas(),
        expected,
        check_index_type=False,
        check_dtype=False,
        check_exact=False,
    )


@pytest.mark.parametrize(
("left", "right", "inclusive"),
[
Expand Down
16 changes: 13 additions & 3 deletions third_party/bigframes_vendored/pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1449,8 +1449,7 @@ def dot(self, other) -> Series | np.ndarray:
Compute the dot product between the Series and the columns of other.

This method computes the dot product between the Series and another
one, or the Series and each columns of a DataFrame, or the Series and
each columns of an array.
one, or the Series and each column of a DataFrame.

It can also be called using `self @ other` in Python >= 3.5.

Expand All @@ -1475,8 +1474,19 @@ def dot(self, other) -> Series | np.ndarray:
>>> s @ other
np.int64(8)

The other operand can be a DataFrame:

>>> other = bpd.DataFrame({"a" : [-1, 2, -3, 4],
... "b" : [-10, 20, -30, 40],
... "c" : [-1, 2, -3, bpd.NA]})
>>> s @ other
a 8
b 80
c <NA>
dtype: Int64

Args:
other (Series):
other (Series or DataFrame):
The other object to compute the dot product with its columns.

Returns:
Expand Down