
Commit eedd929

get actual row_counts
1 parent 124f021 commit eedd929

7 files changed: +148 −77 lines changed

bigframes/display/anywidget.py
Lines changed: 19 additions & 10 deletions

@@ -17,8 +17,7 @@
 from importlib import resources
 import functools
 import math
-import typing
-from typing import Any, cast, Dict, Iterator, List, Optional, Type
+from typing import Any, Dict, Iterator, List, Optional, Type
 import uuid
 
 import pandas as pd
@@ -76,17 +75,19 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
         # Respect display options for initial page size
         initial_page_size = bigframes.options.display.max_rows
 
-        batches = dataframe.to_pandas_batches(
-            page_size=initial_page_size,
-        )
-        self._batches: bigframes.core.blocks.PandasBatches = cast(
-            bigframes.core.blocks.PandasBatches, batches
+        execute_result = dataframe._block.session._executor.execute(
+            dataframe._block.expr,
+            ordered=True,
+            use_explicit_destination=True,
         )
 
         # The query issued by `to_pandas_batches()` already contains metadata
         # about how many results there were. Use that to avoid doing an extra
         # COUNT(*) query that `len(...)` would do.
-        self.row_count = self._batches.total_rows or 0
+        self.row_count = execute_result.total_rows or 0
+
+        # Create pandas batches from the ExecuteResult
+        self._batches = execute_result.to_pandas_batches(page_size=initial_page_size)
 
         # Set page_size after _batches is available since traitlets observers
         # may depend on _batches being initialized when the change trigger happens
@@ -189,8 +190,16 @@ def _cached_data(self) -> pd.DataFrame:
 
     def _reset_batches_for_new_page_size(self):
         """Reset the batch iterator when page size changes."""
-        batches = self._dataframe.to_pandas_batches(page_size=self.page_size)
-        self._batches = typing.cast(bigframes.core.blocks.PandasBatches, batches)
+        # Execute with explicit destination for consistency with __init__
+        execute_result = self._dataframe._block.session._executor.execute(
+            self._dataframe._block.expr,
+            ordered=True,
+            use_explicit_destination=True,
+        )
+
+        # Create pandas batches from the ExecuteResult
+        self._batches = execute_result.to_pandas_batches(page_size=self.page_size)
+
         self._cached_batches = []
         self._batch_iter = None
         self._all_data_loaded = False
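
The diff above replaces the public `to_pandas_batches()` call (which needed a `typing.cast` to `PandasBatches` just to reach `total_rows`) with a direct call to the session's executor. As a minimal sketch of the new flow, using the internal `_block`, `_executor.execute(...)`, and `ExecuteResult.to_pandas_batches(...)` APIs exactly as they appear in this diff:

def load_widget_data(dataframe, page_size):
    # Run the query once, writing to an explicit destination table so that
    # the BigQuery job metadata reports the total row count.
    execute_result = dataframe._block.session._executor.execute(
        dataframe._block.expr,
        ordered=True,
        use_explicit_destination=True,
    )

    # total_rows comes from the job metadata, avoiding the extra COUNT(*)
    # query that len(dataframe) would issue.
    row_count = execute_result.total_rows or 0

    # Page through the already-materialized result.
    batches = execute_result.to_pandas_batches(page_size=page_size)
    return row_count, batches

Note that `load_widget_data` is an illustrative name, not a function in this commit; TableWidget performs these steps inline in `__init__` and `_reset_batches_for_new_page_size`.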

notebooks/dataframes/anywidget_mode.ipynb
Lines changed: 67 additions & 25 deletions

@@ -73,18 +73,6 @@
    "id": "f289d250",
    "metadata": {},
    "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "Query job 1ea2b594-2bd7-46de-a3c8-6aeee5884ba2 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:1ea2b594-2bd7-46de-a3c8-6aeee5884ba2&page=queryresults\">Open Job</a>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -142,7 +130,19 @@
     {
      "data": {
       "text/html": [
-       "Query job 67e679e9-94da-47f7-8be1-8b4a496fbfbd is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:67e679e9-94da-47f7-8be1-8b4a496fbfbd&page=queryresults\">Open Job</a>"
+       "Query job 3245c62b-5969-4b78-b1f2-4330592d3c65 is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:3245c62b-5969-4b78-b1f2-4330592d3c65&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 1a5cec48-7128-4986-86a6-369a8f366974 is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:1a5cec48-7128-4986-86a6-369a8f366974&page=queryresults\">Open Job</a>"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -154,7 +154,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e74c3920b93644a0b2afdaa3841cad31",
+       "model_id": "d59362abcff6445ea879b5f43e0ca9b3",
        "version_major": 2,
        "version_minor": 1
       },
@@ -195,6 +195,30 @@
    "id": "6920d49b",
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Query job 356f561b-5017-413f-950b-2bc4c7798a24 is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:356f561b-5017-413f-950b-2bc4c7798a24&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 72162728-56a3-47ce-bdb1-61b038cc2146 is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:72162728-56a3-47ce-bdb1-61b038cc2146&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -205,12 +229,13 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b4f7a3f86ef54e07b24ef10061088391",
+       "model_id": "8fac39e9b92e42d283883137f155526f",
        "version_major": 2,
        "version_minor": 1
       },
      "text/plain": [
       "TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
+      "TableWidget(page_size=10, row_count=5552452, table_html='<table border=\"1\" class=\"dataframe table table-stripe…"
      ]
     },
     "execution_count": 6,
@@ -285,10 +310,34 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/usr/local/google/home/swast/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/array_value.py:231: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n",
+      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:230: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n",
       " warnings.warn(msg, bfe.AmbiguousWindowWarning)\n"
      ]
     },
+    {
+     "data": {
+      "text/html": [
+       "Query job 77f0582b-b68c-46a7-bf25-463837a4ef3f is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:77f0582b-b68c-46a7-bf25-463837a4ef3f&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job ec2bcbc2-0f5a-45e9-affc-485183cb245e is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:ec2bcbc2-0f5a-45e9-affc-485183cb245e&page=queryresults\">Open Job</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -299,12 +348,13 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "44a829aca2f24cfdba4b61afd1a259fe",
+       "model_id": "fe6358fd83d6431198944e601ea00372",
        "version_major": 2,
        "version_minor": 1
       },
      "text/plain": [
       "TableWidget(page_size=10, row_count=5, table_html='<table border=\"1\" class=\"dataframe table table-striped tabl…"
+      "TableWidget(page_size=10, row_count=5, table_html='<table border=\"1\" class=\"dataframe table table-striped tabl…"
      ]
     },
     "execution_count": 8,
@@ -319,14 +369,6 @@
     "print(f\"Small dataset pages: {math.ceil(small_widget.row_count / small_widget.page_size)}\")\n",
     "small_widget"
    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "c4e5836b-c872-4a9c-b9ec-14f6f338176d",
-    "metadata": {},
-    "outputs": [],
-    "source": []
   }
  ],
  "metadata": {

tests/benchmark/read_gbq_colab/aggregate_output.py
Lines changed: 19 additions & 5 deletions

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pathlib
-import typing
 
 import benchmark.utils as utils
 
@@ -27,8 +26,13 @@ def aggregate_output(*, project_id, dataset_id, table_id):
     df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")
 
     # Simulate getting the first page, since we'll always do that first in the UI.
-    batches = df.to_pandas_batches(page_size=PAGE_SIZE)
-    assert typing.cast(typing.Any, batches).total_rows >= 0
+    execute_result = df._block.session._executor.execute(
+        df._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    assert execute_result.total_rows is not None and execute_result.total_rows >= 0
+    batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
     next(iter(batches))
 
     # To simulate very small rows that can only fit a boolean,
@@ -44,8 +48,18 @@ def aggregate_output(*, project_id, dataset_id, table_id):
         .groupby("rounded")
         .sum(numeric_only=True)
     )
-
-    batches_aggregated = df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)
+    execute_result_aggregated = df_aggregated._block.session._executor.execute(
+        df_aggregated._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    assert (
+        execute_result_aggregated.total_rows is not None
+        and execute_result_aggregated.total_rows >= 0
+    )
+    batches_aggregated = execute_result_aggregated.to_pandas_batches(
+        page_size=PAGE_SIZE
+    )
     next(iter(batches_aggregated))

tests/benchmark/read_gbq_colab/filter_output.py
Lines changed: 17 additions & 7 deletions

@@ -12,11 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pathlib
-import typing
 
 import benchmark.utils as utils
 
-import bigframes.core.blocks
 import bigframes.pandas as bpd
 
 PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
@@ -33,17 +31,29 @@ def filter_output(
     df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")
 
     # Simulate getting the first page, since we'll always do that first in the UI.
-    batches = df.to_pandas_batches(page_size=PAGE_SIZE)
+    # Force BigQuery execution to get total_rows metadata
+    execute_result = df._block.session._executor.execute(
+        df._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
     next(iter(batches))
 
     # Simulate the user filtering by a column and visualizing those results
     df_filtered = df[df["col_bool_0"]]
-    batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE)
-    batches_filtered = typing.cast(
-        bigframes.core.blocks.PandasBatches, batches_filtered
+    # Force BigQuery execution for filtered DataFrame to get total_rows metadata
+    execute_result_filtered = df_filtered._block.session._executor.execute(
+        df_filtered._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
     )
-    rows = batches_filtered.total_rows
+
+    rows = execute_result_filtered.total_rows or 0
     assert rows >= 0
+
+    batches_filtered = execute_result_filtered.to_pandas_batches(page_size=PAGE_SIZE)
+
     # It's possible we don't have any pages at all, since we filtered out all
     # matching rows.
     first_page = next(iter(batches_filtered))

tests/benchmark/read_gbq_colab/first_page.py
Lines changed: 7 additions & 3 deletions

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pathlib
-import typing
 
 import benchmark.utils as utils
 
@@ -29,8 +28,13 @@ def first_page(*, project_id, dataset_id, table_id):
     )
 
     # Get number of rows (to calculate number of pages) and the first page.
-    batches = df.to_pandas_batches(page_size=PAGE_SIZE)
-    assert typing.cast(typing.Any, batches).total_rows >= 0
+    execute_result = df._block.session._executor.execute(
+        df._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    assert execute_result.total_rows is not None and execute_result.total_rows >= 0
+    batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
     first_page = next(iter(batches))
     assert first_page is not None

tests/benchmark/read_gbq_colab/sort_output.py
Lines changed: 17 additions & 5 deletions

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pathlib
-import typing
 
 import benchmark.utils as utils
 
@@ -29,8 +28,13 @@ def sort_output(*, project_id, dataset_id, table_id):
     )
 
     # Simulate getting the first page, since we'll always do that first in the UI.
-    batches = df.to_pandas_batches(page_size=PAGE_SIZE)
-    assert typing.cast(typing.Any, batches).total_rows >= 0
+    execute_result = df._block.session._executor.execute(
+        df._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    assert execute_result.total_rows is not None and execute_result.total_rows >= 0
+    batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
     next(iter(batches))
 
     # Simulate the user sorting by a column and visualizing those results
@@ -39,8 +43,16 @@ def sort_output(*, project_id, dataset_id, table_id):
     sort_column = "col_bool_0"
 
     df_sorted = df.sort_values(sort_column)
-    batches_sorted = df_sorted.to_pandas_batches(page_size=PAGE_SIZE)
-    assert typing.cast(typing.Any, batches_sorted).total_rows >= 0
+    execute_result_sorted = df_sorted._block.session._executor.execute(
+        df_sorted._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    assert (
+        execute_result_sorted.total_rows is not None
+        and execute_result_sorted.total_rows >= 0
+    )
+    batches_sorted = execute_result_sorted.to_pandas_batches(page_size=PAGE_SIZE)
     next(iter(batches_sorted))
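
All four benchmark scripts above now repeat the same execute/assert/paginate sequence. A hypothetical shared helper could factor it out; `execute_with_row_count` is an invented name for illustration, not a function in this commit, and it relies on the same internal `_block` and `_executor` APIs the scripts already use:

def execute_with_row_count(df, page_size):
    """Execute `df` once and return (total_rows, pandas batch iterator)."""
    result = df._block.session._executor.execute(
        df._block.expr,
        ordered=True,
        use_explicit_destination=True,
    )
    # With an explicit destination table, total_rows is populated from the
    # BigQuery job metadata rather than requiring a separate COUNT(*) query.
    assert result.total_rows is not None and result.total_rows >= 0
    return result.total_rows, result.to_pandas_batches(page_size=page_size)

For example, the second block of sort_output.py would reduce to `rows, batches_sorted = execute_with_row_count(df_sorted, PAGE_SIZE)` followed by `next(iter(batches_sorted))`.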
