Skip to content

Commit dd2c27e

Browse files
committed
code change, add more tests
1 parent 45b62d4 commit dd2c27e

File tree

6 files changed

+307
-111
lines changed

6 files changed

+307
-111
lines changed

bigframes/dataframe.py

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
Union,
4040
)
4141
import warnings
42+
import weakref
4243

4344
import bigframes_vendored.constants as constants
4445
import bigframes_vendored.pandas.core.frame as vendored_pandas_frame
@@ -87,6 +88,7 @@
8788
if typing.TYPE_CHECKING:
8889
from _typeshed import SupportsRichComparison
8990

91+
from bigframes.display.anywidget import TableWidget
9092
import bigframes.session
9193

9294
SingleItemValue = Union[bigframes.series.Series, int, float, str, Callable]
@@ -111,6 +113,9 @@ class DataFrame(vendored_pandas_frame.DataFrame):
111113
# Must be above 5000 for pandas to delegate to bigframes for binops
112114
__pandas_priority__ = 15000
113115

116+
# Type annotation for anywidget instance
117+
_anywidget_instance: Optional[weakref.ReferenceType["TableWidget"]] = None
118+
114119
def __init__(
115120
self,
116121
data=None,
@@ -776,21 +781,7 @@ def _repr_html_(self) -> str:
776781
if opts.repr_mode == "deferred":
777782
return formatter.repr_query_job(self._compute_dry_run())
778783

779-
if opts.repr_mode == "anywidget":
780-
try:
781-
from bigframes import display
782-
783-
# Store the widget for _repr_mimebundle_ to use
784-
self._anywidget_instance = display.TableWidget(self)
785-
# Return a fallback HTML string
786-
return "Interactive table widget (anywidget mode)"
787-
except (AttributeError, ValueError):
788-
# Fallback if anywidget is not available
789-
warnings.warn(
790-
"Anywidget mode is not available, falling back to deferred mode."
791-
)
792-
return formatter.repr_query_job(self._compute_dry_run())
793-
784+
# Process blob columns first, regardless of display mode
794785
self._cached()
795786
df = self.copy()
796787
if bigframes.options.display.blob_display:
@@ -802,7 +793,40 @@ def _repr_html_(self) -> str:
802793
for col in blob_cols:
803794
# TODO(garrettwu): Not necessary to get access urls for all the rows. Update when we have a way to get URLs from local data.
804795
df[col] = df[col].blob._get_runtime(mode="R", with_metadata=True)
796+
else:
797+
blob_cols = []
805798

799+
if opts.repr_mode == "anywidget":
800+
try:
801+
from IPython.display import display as ipython_display
802+
803+
from bigframes import display
804+
805+
# Check if widget instance already exists and reuse it
806+
widget = None
807+
if (
808+
hasattr(self, "_anywidget_instance")
809+
and self._anywidget_instance is not None
810+
):
811+
widget = self._anywidget_instance()
812+
813+
# If widget doesn't exist or was garbage collected, create a new one
814+
if widget is None:
815+
# Pass the processed dataframe (with blob URLs) to the widget
816+
widget = display.TableWidget(df)
817+
self._anywidget_instance = weakref.ref(widget)
818+
819+
ipython_display(widget)
820+
return "" # Return empty string since we used display()
821+
822+
except (AttributeError, ValueError, ImportError):
823+
# Fallback if anywidget is not available
824+
warnings.warn(
825+
"Anywidget mode is not available. Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. Falling back to deferred mode."
826+
)
827+
return formatter.repr_query_job(self._compute_dry_run())
828+
829+
# Continue with regular HTML rendering for non-anywidget modes
806830
# TODO(swast): pass max_columns and get the true column count back. Maybe
807831
# get 1 more column than we have requested so that pandas can add the
808832
# ... for us?
@@ -811,7 +835,6 @@ def _repr_html_(self) -> str:
811835
)
812836

813837
self._set_internal_query_job(query_job)
814-
815838
column_count = len(pandas_df.columns)
816839

817840
with display_options.pandas_repr(opts):

bigframes/display/anywidget.py

Lines changed: 41 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -17,23 +17,21 @@
1717
from importlib import resources
1818
import functools
1919
import math
20-
from typing import Any, Dict, Iterator, Type, TYPE_CHECKING
20+
from typing import Any, Dict, Iterator, List, Optional, Type
2121
import uuid
2222

2323
import pandas as pd
2424

2525
import bigframes
2626

27-
ANYWIDGET_INSTALLED = True
28-
if TYPE_CHECKING:
27+
# Simplified import structure as suggested in review
28+
try:
2929
import anywidget
3030
import traitlets
31-
else:
32-
try:
33-
import anywidget
34-
import traitlets
35-
except Exception:
36-
ANYWIDGET_INSTALLED = False
31+
32+
ANYWIDGET_INSTALLED = True
33+
except Exception:
34+
ANYWIDGET_INSTALLED = False
3735

3836
WIDGET_BASE: Type[Any]
3937
if ANYWIDGET_INSTALLED:
@@ -48,14 +46,15 @@ class TableWidget(WIDGET_BASE):
4846
"""
4947

5048
def __init__(self, dataframe: bigframes.dataframe.DataFrame):
51-
"""
52-
Initialize the TableWidget.
49+
"""Initialize the TableWidget.
5350
5451
Args:
5552
dataframe: The Bigframes Dataframe to display in the widget.
5653
"""
5754
if not ANYWIDGET_INSTALLED:
58-
raise ImportError("Anywidget is not installed, cannot create TableWidget.")
55+
raise ImportError(
56+
"Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use TableWidget."
57+
)
5958

6059
super().__init__()
6160
self._dataframe = dataframe
@@ -65,13 +64,20 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
6564

6665
# Initialize data fetching attributes.
6766
self._batches = dataframe.to_pandas_batches(page_size=self.page_size)
68-
self._cached_data = pd.DataFrame(columns=self._dataframe.columns)
67+
68+
# Use list of DataFrames to avoid memory copies from concatenation
69+
self._cached_batches: List[pd.DataFrame] = []
70+
71+
# Unique identifier for HTML table element
6972
self._table_id = str(uuid.uuid4())
7073
self._all_data_loaded = False
71-
self._batch_iterator: Iterator[pd.DataFrame] | None = None
74+
# Renamed from _batch_iterator to _batch_iter to avoid naming conflict
75+
self._batch_iter: Optional[Iterator[pd.DataFrame]] = None
7276

7377
# len(dataframe) is expensive, since it will trigger a
7478
# SELECT COUNT(*) query. It is a must-have, however.
79+
# TODO(b/428238610): Start iterating over the result of `to_pandas_batches()`
80+
# before we get here so that the count might already be cached.
7581
self.row_count = len(dataframe)
7682

7783
# get the initial page
@@ -89,14 +95,13 @@ def _esm(self):
8995

9096
@traitlets.validate("page")
9197
def _validate_page(self, proposal: Dict[str, Any]):
92-
"""
93-
Validate and clamp the page number to a valid range.
98+
"""Validate and clamp the page number to a valid range.
9499
95100
Args:
96-
proposal:
97-
A dictionary from the traitlets library containing the proposed
98-
change. The new value is in proposal["value"].
101+
proposal: A dictionary from the traitlets library containing the
102+
proposed change. The new value is in proposal["value"].
99103
"""
104+
100105
value = proposal["value"]
101106
if self.row_count == 0 or self.page_size == 0:
102107
return 0
@@ -120,34 +125,39 @@ def _get_next_batch(self) -> bool:
120125
try:
121126
iterator = self._get_batch_iterator()
122127
batch = next(iterator)
123-
self._cached_data = pd.concat([self._cached_data, batch], ignore_index=True)
128+
self._cached_batches.append(batch)
124129
return True
125130
except StopIteration:
126131
self._all_data_loaded = True
127-
# update row count if we loaded all data
128-
if self.row_count == 0:
129-
self.row_count = len(self._cached_data)
130132
return False
131-
except Exception as e:
132-
raise RuntimeError(f"Error during batch processing: {str(e)}") from e
133133

134134
def _get_batch_iterator(self) -> Iterator[pd.DataFrame]:
135135
"""Lazily initializes and returns the batch iterator."""
136-
if self._batch_iterator is None:
137-
self._batch_iterator = iter(self._batches)
138-
return self._batch_iterator
136+
if self._batch_iter is None:
137+
self._batch_iter = iter(self._batches)
138+
return self._batch_iter
139+
140+
def _get_cached_data(self) -> pd.DataFrame:
141+
"""Combine all cached batches into a single DataFrame."""
142+
if not self._cached_batches:
143+
return pd.DataFrame(columns=self._dataframe.columns)
144+
return pd.concat(self._cached_batches, ignore_index=True)
139145

140146
def _set_table_html(self):
141147
"""Sets the current html data based on the current page and page size."""
142148
start = self.page * self.page_size
143149
end = start + self.page_size
144150

145151
# fetch more data if the requested page is outside our cache
146-
while len(self._cached_data) < end and not self._all_data_loaded:
147-
self._get_next_batch()
152+
cached_data = self._get_cached_data()
153+
while len(cached_data) < end and not self._all_data_loaded:
154+
if self._get_next_batch():
155+
cached_data = self._get_cached_data()
156+
else:
157+
break
148158

149159
# Get the data for the current page
150-
page_data = self._cached_data.iloc[start:end]
160+
page_data = cached_data.iloc[start:end]
151161

152162
# Generate HTML table
153163
self.table_html = page_data.to_html(

0 commit comments

Comments
 (0)