Skip to content

Commit c13ea37

Browse files
authored
Merge branch 'main' into b329865893-groupby-iter
2 parents b4214cf + 9dc9695 commit c13ea37

File tree

13 files changed

+340
-124
lines changed

13 files changed

+340
-124
lines changed

bigframes/core/array_value.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,14 @@ def relational_join(
480480
type: typing.Literal["inner", "outer", "left", "right", "cross"] = "inner",
481481
propogate_order: Optional[bool] = None,
482482
) -> typing.Tuple[ArrayValue, typing.Tuple[dict[str, str], dict[str, str]]]:
483+
for lcol, rcol in conditions:
484+
ltype = self.get_column_type(lcol)
485+
rtype = other.get_column_type(rcol)
486+
if not bigframes.dtypes.can_compare(ltype, rtype):
487+
raise TypeError(
488+
f"Cannot join with non-comparable join key types: {ltype}, {rtype}"
489+
)
490+
483491
l_mapping = { # Identity mapping, only rename right side
484492
lcol.name: lcol.name for lcol in self.node.ids
485493
}

bigframes/core/blocks.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1786,15 +1786,19 @@ def pivot(
17861786
else:
17871787
return result_block.with_column_labels(columns_values)
17881788

1789-
def stack(self, how="left", levels: int = 1):
1789+
def stack(
1790+
self, how="left", levels: int = 1, *, override_labels: Optional[pd.Index] = None
1791+
):
17901792
"""Unpivot last column axis level into row axis"""
17911793
if levels == 0:
17921794
return self
17931795

17941796
# These are the values that will be turned into rows
17951797

17961798
col_labels, row_labels = utils.split_index(self.column_labels, levels=levels)
1797-
row_labels = row_labels.drop_duplicates()
1799+
row_labels = (
1800+
row_labels.drop_duplicates() if override_labels is None else override_labels
1801+
)
17981802

17991803
if col_labels is None:
18001804
result_index: pd.Index = pd.Index([None])

bigframes/core/compile/api.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,18 @@
1515

1616
from typing import TYPE_CHECKING
1717

18-
from bigframes.core import rewrite
19-
from bigframes.core.compile.ibis_compiler import ibis_compiler
20-
2118
if TYPE_CHECKING:
2219
import bigframes.core.nodes
2320

2421

2522
def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode):
2623
"""Use only for testing paths to ensure ibis inferred schema does not diverge from bigframes inferred schema."""
24+
from bigframes.core.compile.ibis_compiler import ibis_compiler
25+
import bigframes.core.rewrite
2726
import bigframes.core.schema
2827

2928
node = ibis_compiler._replace_unsupported_ops(node)
30-
node = rewrite.bake_order(node)
29+
node = bigframes.core.rewrite.bake_order(node)
3130
ir = ibis_compiler.compile_node(node)
3231
items = tuple(
3332
bigframes.core.schema.SchemaItem(name, ir.get_column_type(ibis_id))

bigframes/core/groupby/dataframe_group_by.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,20 @@ def head(self, n: int = 5) -> df.DataFrame:
153153
)
154154
)
155155

156+
def describe(self, include: None | Literal["all"] = None):
    """Generate descriptive statistics for each group's selected columns.

    Args:
        include:
            ``None`` (default) summarizes only columns of the default
            (numeric/temporal) dtypes; ``"all"`` summarizes every selected
            column. Semantics are delegated to ``describe._describe``.

    Returns:
        df.DataFrame: One row (or index entry, depending on ``as_index``)
        per group, with one column per (source column, statistic) pair.
    """
    # Local import to avoid a circular dependency at module load time.
    from bigframes.pandas.core.methods import describe

    return df.DataFrame(
        describe._describe(
            self._block,
            self._selected_cols,
            include,
            as_index=self._as_index,
            by_col_ids=self._by_col_ids,
            dropna=self._dropna,
        )
    )
169+
156170
def __iter__(self) -> Iterable[Tuple[blocks.Label, df.DataFrame]]:
157171
for group_keys, filtered_block in group_by.block_groupby_iter(
158172
self._block,

bigframes/core/groupby/series_group_by.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,20 @@ def head(self, n: int = 5) -> series.Series:
8181
)
8282
)
8383

84+
def describe(self, include: None | Literal["all"] = None):
    """Generate descriptive statistics for the grouped series.

    Args:
        include:
            ``None`` (default) summarizes only default (numeric/temporal)
            dtypes; ``"all"`` summarizes the column regardless of dtype.
            Semantics are delegated to ``describe._describe``.

    Returns:
        df.DataFrame: Statistics for the single value column; the outer
        column-label level (the source column's label) is dropped so the
        result is keyed by statistic name only.
    """
    # Local import to avoid a circular dependency at module load time.
    from bigframes.pandas.core.methods import describe

    return df.DataFrame(
        describe._describe(
            self._block,
            columns=[self._value_column],
            include=include,
            as_index=True,
            by_col_ids=self._by_col_ids,
            dropna=self._dropna,
        )
    ).droplevel(level=0, axis=1)
97+
8498
def __iter__(self) -> Iterable[Tuple[blocks.Label, series.Series]]:
8599
for group_keys, filtered_block in group_by.block_groupby_iter(
86100
self._block,

bigframes/core/rewrite/implicit_align.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,8 @@
1818
from typing import cast, Optional, Sequence, Set, Tuple
1919

2020
import bigframes.core.expression
21-
import bigframes.core.guid
2221
import bigframes.core.identifiers
23-
import bigframes.core.join_def
2422
import bigframes.core.nodes
25-
import bigframes.core.window_spec
26-
import bigframes.operations.aggregations
2723

2824
# Combination of selects and additive nodes can be merged as an explicit keyless "row join"
2925
ALIGNABLE_NODES = (

bigframes/dtypes.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,11 @@ def is_comparable(type_: ExpressionType) -> bool:
358358
return (type_ is not None) and is_orderable(type_)
359359

360360

361+
def can_compare(type1: ExpressionType, type2: ExpressionType) -> bool:
    """Return True if values of the two types can be compared to each other.

    The types are comparable when a common supertype exists and that common
    type is itself comparable.

    Args:
        type1: First operand type (may be None for an untyped expression).
        type2: Second operand type.

    Returns:
        bool: True if comparison is well-defined, False otherwise. Unlike
        ``coerce_to_common``, this predicate never raises: callers use it as
        a boolean guard (e.g. to emit their own, friendlier TypeError).
    """
    try:
        coerced_type = coerce_to_common(type1, type2)
    except TypeError:
        # No common supertype exists -> the types are not comparable.
        return False
    return is_comparable(coerced_type)
364+
365+
361366
def get_struct_fields(type_: ExpressionType) -> dict[str, Dtype]:
362367
assert isinstance(type_, pd.ArrowDtype)
363368
assert isinstance(type_.pyarrow_dtype, pa.StructType)

bigframes/operations/aggregations.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -251,12 +251,7 @@ def name(self):
251251
def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
    """Result dtype of the aggregation: same as the input dtype.

    Args:
        *input_types: The aggregated column's dtype (only the first is used).

    Returns:
        The input dtype unchanged.

    Raises:
        TypeError: If the input dtype is not orderable.
    """
    if not dtypes.is_orderable(input_types[0]):
        raise TypeError(f"Type {input_types[0]} is not orderable")
    # NOTE(review): bool/int inputs were previously promoted to FLOAT_DTYPE
    # (see the removed branch in this commit); the result now preserves the
    # input dtype exactly.
    return input_types[0]
260255

261256

262257
@dataclasses.dataclass(frozen=True)

bigframes/operations/type.py

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -174,15 +174,7 @@ class CoerceCommon(BinaryTypeSignature):
174174
def output_type(
    self, left_type: ExpressionType, right_type: ExpressionType
) -> ExpressionType:
    """Return the common supertype of the two operand types.

    Coercion (including one-way can_coerce cases, previously handled inline
    here) is fully delegated to ``bigframes.dtypes.coerce_to_common``.

    Raises:
        TypeError: Propagated from ``coerce_to_common`` when no common
            type exists.
    """
    return bigframes.dtypes.coerce_to_common(left_type, right_type)
186178

187179

188180
@dataclasses.dataclass
@@ -192,8 +184,7 @@ class Comparison(BinaryTypeSignature):
192184
def output_type(
    self, left_type: ExpressionType, right_type: ExpressionType
) -> ExpressionType:
    """Comparison operators always produce BOOL.

    Comparability of the operands (existence of a comparable common type)
    is checked first via ``bigframes.dtypes.can_compare``.

    Raises:
        TypeError: If the operand types cannot be compared.
    """
    if not bigframes.dtypes.can_compare(left_type, right_type):
        raise TypeError(f"Types {left_type} and {right_type} are not comparable")
    return bigframes.dtypes.BOOL_DTYPE
199190

bigframes/pandas/core/methods/describe.py

Lines changed: 90 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,15 @@
1616

1717
import typing
1818

19+
import pandas as pd
20+
1921
from bigframes import dataframe, dtypes, series
20-
from bigframes.core.reshape import api as rs
22+
from bigframes.core import agg_expressions, blocks
23+
from bigframes.operations import aggregations
24+
25+
_DEFAULT_DTYPES = (
26+
dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
27+
)
2128

2229

2330
def describe(
@@ -30,100 +37,88 @@ def describe(
3037
elif not isinstance(input, dataframe.DataFrame):
3138
raise TypeError(f"Unsupported type: {type(input)}")
3239

40+
block = input._block
41+
42+
describe_block = _describe(block, columns=block.value_columns, include=include)
43+
# we override default stack behavior, because we want very specific ordering
44+
stack_cols = pd.Index(
45+
[
46+
"count",
47+
"nunique",
48+
"top",
49+
"freq",
50+
"mean",
51+
"std",
52+
"min",
53+
"25%",
54+
"50%",
55+
"75%",
56+
"max",
57+
]
58+
).intersection(describe_block.column_labels.get_level_values(-1))
59+
describe_block = describe_block.stack(override_labels=stack_cols)
60+
61+
return dataframe.DataFrame(describe_block).droplevel(level=0)
62+
63+
64+
def _describe(
    block: blocks.Block,
    columns: typing.Sequence[str],
    include: None | typing.Literal["all"] = None,
    *,
    as_index: bool = True,
    by_col_ids: typing.Sequence[str] = (),
    dropna: bool = False,
) -> blocks.Block:
    """Compute describe()-style summary statistics for ``columns`` of ``block``.

    Args:
        block: Source block to summarize.
        columns: Value column ids to describe.
        include: ``None`` summarizes only default (numeric/temporal) dtype
            columns, unless no such column is present, in which case it
            falls back to ``"all"`` behavior. ``"all"`` summarizes every
            column with whatever statistics its dtype supports.
        as_index: If False, grouping keys are reset into value columns.
        by_col_ids: Grouping column ids; empty means a whole-frame describe.
        dropna: Whether to drop null grouping keys during aggregation.

    Returns:
        blocks.Block: One column per (source column, statistic) pair; the
        statistic name is appended as an extra innermost column-label level.
    """
    # NOTE: default for by_col_ids is an immutable tuple rather than the
    # mutable-default-argument antipattern ([]); it is only ever read.
    stats: list[agg_expressions.Aggregation] = []
    column_labels: list[typing.Hashable] = []

    # include=None behaves like include="all" if no default-dtype columns present
    if include is None:
        if not any(
            block.expr.get_column_type(col) in _DEFAULT_DTYPES for col in columns
        ):
            include = "all"

    for col_id in columns:
        label = block.col_id_to_label[col_id]
        dtype = block.expr.get_column_type(col_id)
        if include != "all" and dtype not in _DEFAULT_DTYPES:
            continue
        agg_ops = _get_aggs_for_dtype(dtype)
        stats.extend(op.as_expr(col_id) for op in agg_ops)
        # Normalize flat labels to 1-tuples so the stat name can be appended
        # uniformly as an extra label level.
        label_tuple = (label,) if block.column_labels.nlevels == 1 else label
        column_labels.extend((*label_tuple, op.name) for op in agg_ops)  # type: ignore

    agg_block, _ = block.aggregate(
        by_column_ids=by_col_ids,
        aggregations=stats,
        dropna=dropna,
        # The new innermost level (statistic names) is unnamed.
        column_labels=pd.Index(column_labels, name=(*block.column_labels.names, None)),
    )
    return agg_block if as_index else agg_block.reset_index(drop=False)
100+
101+
102+
def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]:
    """Pick the describe() summary statistics supported by a column dtype."""
    if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE:
        # Full numeric profile: count, central tendency, spread, quartiles.
        quartiles = [aggregations.ApproxQuartilesOp(q) for q in (1, 2, 3)]
        return [
            aggregations.count_op,
            aggregations.mean_op,
            aggregations.std_op,
            aggregations.min_op,
            *quartiles,
            aggregations.max_op,
        ]
    if dtype in dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES:
        # Temporal-numeric columns only get a row count.
        return [aggregations.count_op]
    categorical_like = (
        dtypes.STRING_DTYPE,
        dtypes.BOOL_DTYPE,
        dtypes.BYTES_DTYPE,
        dtypes.TIME_DTYPE,
    )
    if dtype in categorical_like:
        return [aggregations.count_op, aggregations.nunique_op]
    # Any other dtype contributes no statistics.
    return []

0 commit comments

Comments
 (0)