|
18 | 18 |
|
19 | 19 | import datetime |
20 | 20 | import re |
| 21 | +import sys |
21 | 22 | import textwrap |
22 | 23 | import typing |
23 | 24 | from typing import ( |
|
36 | 37 | import google.cloud.bigquery as bigquery |
37 | 38 | import numpy |
38 | 39 | import pandas |
| 40 | +import tabulate |
39 | 41 |
|
40 | 42 | import bigframes |
41 | 43 | import bigframes._config.display_options as display_options |
@@ -350,6 +352,88 @@ def query_job(self) -> Optional[bigquery.QueryJob]: |
350 | 352 | self._set_internal_query_job(self._compute_dry_run()) |
351 | 353 | return self._query_job |
352 | 354 |
|
| 355 | + def memory_usage(self, index: bool = True): |
| 356 | + n_rows, _ = self.shape |
| 357 | + # like pandas, treat all variable-size objects as just 8-byte pointers, ignoring actual object sizes |
| 358 | + column_sizes = self.dtypes.map( |
| 359 | + lambda dtype: bigframes.dtypes.DTYPE_BYTE_SIZES.get(dtype, 8) * n_rows |
| 360 | + ) |
| 361 | + if index: |
| 362 | + index_size = pandas.Series([self.index._memory_usage()], index=["Index"]) |
| 363 | + column_sizes = pandas.concat([index_size, column_sizes]) |
| 364 | + return column_sizes |
| 365 | + |
| 366 | + def info( |
| 367 | + self, |
| 368 | + verbose: Optional[bool] = None, |
| 369 | + buf=None, |
| 370 | + max_cols: Optional[int] = None, |
| 371 | + memory_usage: Optional[bool] = None, |
| 372 | + show_counts: Optional[bool] = None, |
| 373 | + ): |
| 374 | + obuf = buf or sys.stdout |
| 375 | + |
| 376 | + n_rows, n_columns = self.shape |
| 377 | + |
| 378 | + max_cols = ( |
| 379 | + max_cols |
| 380 | + if max_cols is not None |
| 381 | + else bigframes.options.display.max_info_columns |
| 382 | + ) |
| 383 | + |
| 384 | + show_all_columns = verbose if verbose is not None else (n_columns < max_cols) |
| 385 | + |
| 386 | + obuf.write(f"{type(self)}\n") |
| 387 | + |
| 388 | + index_type = "MultiIndex" if self.index.nlevels > 1 else "Index" |
| 389 | + |
| 390 | + # These accesses are somewhat expensive; consider skipping them in the future. |
| 391 | + first_label = self.index[0] |
| 392 | + last_label = self.index[-1] |
| 393 | + obuf.write(f"{index_type}: {n_rows} entries, {first_label} to {last_label}\n") |
| 394 | + |
| 395 | + dtype_strings = self.dtypes.astype("string") |
| 396 | + if show_all_columns: |
| 397 | + obuf.write(f"Data columns (total {n_columns} columns):\n") |
| 398 | + column_info = self.columns.to_frame(name="Column") |
| 399 | + |
| 400 | + max_rows = bigframes.options.display.max_info_rows |
| 401 | + too_many_rows = n_rows > max_rows if max_rows is not None else False |
| 402 | + |
| 403 | + if show_counts if show_counts is not None else (not too_many_rows): |
| 404 | + non_null_counts = self.count().to_pandas() |
| 405 | + column_info["Non-Null Count"] = non_null_counts.map( |
| 406 | + lambda x: f"{int(x)} non-null" |
| 407 | + ) |
| 408 | + |
| 409 | + column_info["Dtype"] = dtype_strings |
| 410 | + |
| 411 | + column_info = column_info.reset_index(drop=True) |
| 412 | + column_info.index.name = "#" |
| 413 | + |
| 414 | + column_info_formatted = tabulate.tabulate(column_info, headers="keys") # type: ignore |
| 415 | + obuf.write(column_info_formatted) |
| 416 | + obuf.write("\n") |
| 417 | + |
| 418 | + else: # Just number of columns and first, last |
| 419 | + obuf.write( |
| 420 | + f"Columns: {n_columns} entries, {self.columns[0]} to {self.columns[-1]}\n" |
| 421 | + ) |
| 422 | + dtype_counts = dtype_strings.value_counts().sort_index(ascending=True).items() |
| 423 | + dtype_counts_formatted = ", ".join( |
| 424 | + f"{dtype}({count})" for dtype, count in dtype_counts |
| 425 | + ) |
| 426 | + obuf.write(f"dtypes: {dtype_counts_formatted}\n") |
| 427 | + |
| 428 | + show_memory = ( |
| 429 | + memory_usage |
| 430 | + if memory_usage is not None |
| 431 | + else bigframes.options.display.memory_usage |
| 432 | + ) |
| 433 | + if show_memory: |
| 434 | + # TODO: Convert to different units (kb, mb, etc.) |
| 435 | + obuf.write(f"memory usage: {self.memory_usage().sum()} bytes\n") |
| 436 | + |
353 | 437 | def _set_internal_query_job(self, query_job: bigquery.QueryJob): |
354 | 438 | self._query_job = query_job |
355 | 439 |
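
A minimal usage sketch of the new methods, assuming a configured bigframes session and a readable BigQuery table (the public table name below is only illustrative):

import io

import bigframes.pandas as bpd

# Illustrative public table; substitute any table the session can read.
df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

# Per-column size estimate in bytes; variable-width columns are counted
# as 8-byte pointers, matching the estimate in memory_usage() above.
print(df.memory_usage())

# Summary written to stdout by default.
df.info()

# Or captured in a buffer via the `buf` parameter, with memory usage forced on.
buffer = io.StringIO()
df.info(buf=buffer, memory_usage=True)
print(buffer.getvalue())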
|
|