|
18 | 18 |
|
19 | 19 | import datetime |
20 | 20 | import inspect |
| 21 | +import itertools |
21 | 22 | import re |
22 | 23 | import sys |
23 | 24 | import textwrap |
|
70 | 71 | import bigframes.exceptions |
71 | 72 | import bigframes.formatting_helpers as formatter |
72 | 73 | import bigframes.operations as ops |
| 74 | +import bigframes.operations.aggregations |
73 | 75 | import bigframes.operations.aggregations as agg_ops |
74 | 76 | import bigframes.operations.plotting as plotting |
75 | 77 | import bigframes.operations.structs |
@@ -2207,14 +2209,17 @@ def agg( |
2207 | 2209 | self, func: str | typing.Sequence[str] |
2208 | 2210 | ) -> DataFrame | bigframes.series.Series: |
2209 | 2211 | if utils.is_list_like(func): |
2210 | | - if any( |
2211 | | - dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE |
2212 | | - for dtype in self.dtypes |
2213 | | - ): |
2214 | | - raise NotImplementedError( |
2215 | | - f"Multiple aggregations only supported on numeric columns. {constants.FEEDBACK_LINK}" |
2216 | | - ) |
2217 | 2212 | aggregations = [agg_ops.lookup_agg_func(f) for f in func] |
| 2213 | + |
| 2214 | + for dtype, agg in itertools.product(self.dtypes, aggregations): |
| 2215 | + if not bigframes.operations.aggregations.is_agg_op_supported( |
| 2216 | + dtype, agg |
| 2217 | + ): |
| 2218 | + raise NotImplementedError( |
| 2219 | + f"Type {dtype} does not support aggregation {agg}. " |
| 2220 | + f"Share your usecase with the BigQuery DataFrames team at the {constants.FEEDBACK_LINK}" |
| 2221 | + ) |
| 2222 | + |
2218 | 2223 | return DataFrame( |
2219 | 2224 | self._block.summarize( |
2220 | 2225 | self._block.value_columns, |
@@ -2280,16 +2285,55 @@ def melt( |
2280 | 2285 | self._block.melt(id_col_ids, val_col_ids, var_name, value_name) |
2281 | 2286 | ) |
2282 | 2287 |
|
# Aggregations computed by ``describe`` for numeric columns.
_NUMERICAL_DESCRIBE_AGGS = (
    "count",
    "mean",
    "std",
    "min",
    "25%",
    "50%",
    "75%",
    "max",
)
# Backward-compatible alias: the attribute was originally introduced under
# this misspelled name, so keep it resolvable for any existing reference.
_NUMERICAL_DISCRIBE_AGGS = _NUMERICAL_DESCRIBE_AGGS
# Aggregations computed by ``describe`` for string columns.
_NON_NUMERICAL_DESCRIBE_AGGS = ("count", "nunique")

def describe(self, include: None | Literal["all"] = None) -> DataFrame:
    # Summarize the columns of this DataFrame.
    #
    # include=None: aggregate the numeric columns with
    #   _NUMERICAL_DESCRIBE_AGGS; if there are no numeric columns, aggregate
    #   the string columns with _NON_NUMERICAL_DESCRIBE_AGGS instead.
    # include="all": aggregate numeric and string columns separately and
    #   combine the two results column-wise.
    # Any other value raises ValueError.
    if include is None:
        numeric_df = self._drop_non_numeric(permissive=False)
        if len(numeric_df.columns) == 0:
            # Describe eligible non-numerical columns
            result = self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS)
        else:
            # Otherwise, only describe numerical columns
            result = numeric_df.agg(self._NUMERICAL_DESCRIBE_AGGS)
        return typing.cast(DataFrame, result)

    if include == "all":
        numeric_result = typing.cast(
            DataFrame,
            self._drop_non_numeric(permissive=False).agg(
                self._NUMERICAL_DESCRIBE_AGGS
            ),
        )
        string_result = typing.cast(
            DataFrame,
            self._drop_non_string().agg(self._NON_NUMERICAL_DESCRIBE_AGGS),
        )

        # When only one kind of column is present, its summary is the answer.
        if len(numeric_result.columns) == 0:
            return string_result
        if len(string_result.columns) == 0:
            return numeric_result

        # Imported locally, as in the original implementation.
        import bigframes.core.reshape as rs

        # Use reindex after join to preserve the original column order.
        return rs.concat(
            [numeric_result, string_result], axis=1
        )._reindex_columns(self.columns)

    raise ValueError(f"Unsupported include type: {include}")
2293 | 2337 |
|
2294 | 2338 | def skew(self, *, numeric_only: bool = False): |
2295 | 2339 | if not numeric_only: |
@@ -2487,18 +2531,26 @@ def unstack(self, level: LevelsType = -1): |
2487 | 2531 | return DataFrame(pivot_block) |
2488 | 2532 |
|
def _drop_non_numeric(self, permissive=True) -> DataFrame:
    # Return a DataFrame without the value columns whose dtype is outside the
    # selected numeric dtype set: the permissive set when ``permissive`` is
    # truthy, the restrictive set otherwise.
    if permissive:
        allowed_dtypes = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE)
    else:
        allowed_dtypes = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE)

    block = self._block
    ids_to_drop = []
    # value_columns and dtypes are walked in lockstep: one dtype per column id.
    for column_id, column_dtype in zip(block.value_columns, block.dtypes):
        if column_dtype not in allowed_dtypes:
            ids_to_drop.append(column_id)
    return DataFrame(block.drop_columns(ids_to_drop))
2501 | 2545 |
|
def _drop_non_string(self) -> DataFrame:
    # Return a DataFrame holding only the value columns whose dtype equals
    # bigframes.dtypes.STRING_DTYPE (implemented as a selection of the string
    # columns rather than an explicit drop of the others).
    block = self._block
    string_ids = []
    for column_id, column_dtype in zip(block.value_columns, block.dtypes):
        if column_dtype == bigframes.dtypes.STRING_DTYPE:
            string_ids.append(column_id)
    return DataFrame(block.select_columns(string_ids))
| 2553 | + |
2502 | 2554 | def _drop_non_bool(self) -> DataFrame: |
2503 | 2555 | non_bool_cols = [ |
2504 | 2556 | col_id |
|
0 commit comments