Skip to content

Commit 5c125c9

Browse files
perf: defer query in read_gbq with wildcard tables (#1661)
* perf: defer query in `read_gbq` with wildcard tables
* remove obsolete comments
* use sql node instead of ibis table node to keep `select *` from omitting pseudocolumns

  Fixes this code sample:

      import bigframes.pandas as bpd
      df = bpd.read_gbq("bigquery-public-data.google_analytics_sample.ga_sessions_*")
      df[df["_TABLE_SUFFIX"] == "20161204"].peek()

* test with cache and to_gbq
* rename columns before caching
* remove unnecessary comment
* add missing import
* do not materialize _TABLE_SUFFIX
* fix unit tests
* correct number of columns in cache with offsets
* fix formatting
* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
* revert datetime change, max_results change
* add pseudocolumns to node
* fix unit tests
* actually fix unit tests
* try to rename as part of compile
* use correct node for table schema
* revert pseudocolumn addition
* revert pseudocolumn fix
* add test for warning

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
1 parent 8ec6079 commit 5c125c9

File tree

3 files changed

+18
-8
lines changed

3 files changed

+18
-8
lines changed

bigframes/session/_io/bigquery/read_gbq_table.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,16 @@ def validate_table(
101101
# Anonymous dataset, does not support snapshot ever
102102
if table.dataset_id.startswith("_"):
103103
pass
104+
104105
# Only true tables support time travel
106+
elif table.table_id.endswith("*"):
107+
msg = bfe.format_message(
108+
"Wildcard tables do not support FOR SYSTEM_TIME AS OF queries. "
109+
"Attempting query without time travel. Be aware that "
110+
"modifications to the underlying data may result in errors or "
111+
"unexpected behavior."
112+
)
113+
warnings.warn(msg, category=bfe.TimeTravelDisabledWarning)
105114
elif table.table_type != "TABLE":
106115
if table.table_type == "MATERIALIZED_VIEW":
107116
msg = bfe.format_message(
@@ -137,7 +146,7 @@ def validate_table(
137146
sql_predicate=filter_str,
138147
time_travel_timestamp=None,
139148
)
140-
# Any erorrs here should just be raised to user
149+
# Any errors here should just be raised to user
141150
bqclient.query_and_wait(
142151
snapshot_sql, job_config=bigquery.QueryJobConfig(dry_run=True)
143152
)

bigframes/session/loader.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -518,11 +518,7 @@ def read_gbq_table(
518518
# clustered tables, so fallback to a query. We do this here so that
519519
# the index is consistent with tables that have primary keys, even
520520
# when max_results is set.
521-
# TODO(b/338419730): We don't need to fallback to a query for wildcard
522-
# tables if we allow some non-determinism when time travel isn't supported.
523-
if max_results is not None or bf_io_bigquery.is_table_with_wildcard_suffix(
524-
table_id
525-
):
521+
if max_results is not None:
526522
# TODO(b/338111344): If we are running a query anyway, we might as
527523
# well generate ROW_NUMBER() at the same time.
528524
all_columns: Iterable[str] = (
@@ -540,14 +536,15 @@ def read_gbq_table(
540536
time_travel_timestamp=None,
541537
)
542538

543-
return self.read_gbq_query( # type: ignore # for dry_run overload
539+
df = self.read_gbq_query( # type: ignore # for dry_run overload
544540
query,
545541
index_col=index_cols,
546542
columns=columns,
547543
api_name=api_name,
548544
use_cache=use_cache,
549545
dry_run=dry_run,
550546
)
547+
return df
551548

552549
if dry_run:
553550
return dry_runs.get_table_stats(table)

tests/system/small/test_session.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -449,11 +449,15 @@ def test_read_gbq_twice_with_same_timestamp(session, penguins_table_id):
449449
@pytest.mark.parametrize(
450450
"source_table",
451451
[
452+
# Wildcard tables
453+
"bigquery-public-data.noaa_gsod.gsod194*",
454+
# Linked datasets
452455
"bigframes-dev.thelook_ecommerce.orders",
456+
# Materialized views
453457
"bigframes-dev.bigframes_tests_sys.base_table_mat_view",
454458
],
455459
)
456-
def test_read_gbq_on_linked_dataset_warns(session, source_table):
460+
def test_read_gbq_warns_time_travel_disabled(session, source_table):
457461
with warnings.catch_warnings(record=True) as warned:
458462
session.read_gbq(source_table, use_cache=False)
459463
assert len(warned) == 1

0 commit comments

Comments
 (0)