Commit 9f30c18

1 parent 7a8e622 commit 9f30c18

3 files changed (+45, −22 lines)

bigframes/pandas/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -39,6 +39,7 @@
 from bigframes.pandas.core.api import to_timedelta
 from bigframes.pandas.io.api import (
     from_glob_path,
+    read_arrow,
     read_csv,
     read_gbq,
     read_gbq_function,
@@ -50,7 +51,6 @@
     read_pandas,
     read_parquet,
     read_pickle,
-    read_arrow,
 )
 import bigframes.series
 import bigframes.session
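
This hunk only moves the read_arrow re-export into alphabetical order within the import block. For context, a minimal usage sketch of the exported function, mirroring the system tests below (the example data is hypothetical):

import pyarrow as pa
import bigframes.pandas as bpd

# Build a small in-memory Arrow table (hypothetical example data).
arrow_table = pa.Table.from_arrays(
    [pa.array([1, 2, 3], type=pa.int64())], names=["ints"]
)

# read_arrow is re-exported from bigframes.pandas.io.api, per the diff above.
bf_df = bpd.read_arrow(arrow_table)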

bigframes/session/__init__.py

Lines changed: 6 additions & 4 deletions
@@ -1031,7 +1031,10 @@ def _read_pandas(
         if is_inline:
             if final_engine == "bigquery_inline":
                 # Ensure inline data isn't too large if specified directly
-                if pandas_dataframe.memory_usage(deep=True).sum() > bigframes.constants.MAX_INLINE_BYTES:
+                if (
+                    pandas_dataframe.memory_usage(deep=True).sum()
+                    > bigframes.constants.MAX_INLINE_BYTES
+                ):
                     raise ValueError(
                         f"DataFrame size ({pandas_dataframe.memory_usage(deep=True).sum()} bytes) "
                         f"exceeds the maximum allowed for inline data "
@@ -1058,11 +1061,10 @@ def _read_pandas_inline(
         local_block = blocks.Block.from_local(pandas_dataframe, self)
         return dataframe.DataFrame(local_block)

-    def _read_arrow_inline(
-        self, arrow_table: pyarrow.Table
-    ) -> dataframe.DataFrame:
+    def _read_arrow_inline(self, arrow_table: pyarrow.Table) -> dataframe.DataFrame:
         """Creates a BigFrames DataFrame from an in-memory pyarrow Table by inlining data."""
         import bigframes.dataframe as dataframe
+
         # Assuming Block.from_local can handle pandas DataFrame.
         # If Block.from_local is enhanced to take pyarrow.Table directly,
         # this conversion can be removed.
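
The reformatted guard follows a common pre-flight pattern: estimate the DataFrame's in-memory footprint with memory_usage(deep=True) and refuse to inline anything above the limit. A standalone sketch of the same check (the limit value here is hypothetical; BigFrames defines its own bigframes.constants.MAX_INLINE_BYTES):

import pandas as pd

MAX_INLINE_BYTES = 5_000_000  # hypothetical value for illustration only

def check_inline_size(df: pd.DataFrame) -> None:
    # memory_usage(deep=True) counts the payload of object-dtype columns,
    # giving a closer estimate of serialized size than the shallow default.
    size = int(df.memory_usage(deep=True).sum())
    if size > MAX_INLINE_BYTES:
        raise ValueError(
            f"DataFrame size ({size} bytes) exceeds the maximum allowed for inline data"
        )

check_inline_size(pd.DataFrame({"x": range(10)}))  # a small frame passes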

tests/system/small/test_read_arrow.py

Lines changed: 38 additions & 17 deletions
@@ -34,9 +34,7 @@ def test_read_arrow_basic(session):
         pa.array([0.1, 0.2, 0.3], type=pa.float64()),
         pa.array(["foo", "bar", "baz"], type=pa.string()),
     ]
-    arrow_table = pa.Table.from_arrays(
-        data, names=["ints", "floats", "strings"]
-    )
+    arrow_table = pa.Table.from_arrays(data, names=["ints", "floats", "strings"])

     bf_df = bpd.read_arrow(arrow_table)

@@ -142,13 +140,17 @@ def test_read_arrow_all_types(session):
     bf_pd_df = bf_df.to_pandas()

     for col in ["int_col", "float_col"]:
-       bf_pd_df[col] = bf_pd_df[col].astype(pd_expected[col].dtype)
+        bf_pd_df[col] = bf_pd_df[col].astype(pd_expected[col].dtype)

     bf_pd_df["str_col"] = bf_pd_df["str_col"].astype(pandas.ArrowDtype(pa.string()))
     bf_pd_df["ts_col"] = pandas.to_datetime(bf_pd_df["ts_col"], utc=True)
-    bf_pd_df["date_col"] = bf_pd_df["date_col"].apply(lambda x: x.date() if hasattr(x, 'date') and x is not pandas.NaT else x)
+    bf_pd_df["date_col"] = bf_pd_df["date_col"].apply(
+        lambda x: x.date() if hasattr(x, "date") and x is not pandas.NaT else x
+    )
     bf_pd_df["bool_col"] = bf_pd_df["bool_col"].astype(pandas.ArrowDtype(pa.bool_()))
-    pd_expected["bool_col"] = pd_expected["bool_col"].astype(pandas.ArrowDtype(pa.bool_()))
+    pd_expected["bool_col"] = pd_expected["bool_col"].astype(
+        pandas.ArrowDtype(pa.bool_())
+    )

     pandas.testing.assert_frame_equal(
         bf_pd_df, pd_expected, check_dtype=False, rtol=1e-5
@@ -193,10 +195,18 @@ def test_read_arrow_list_types(session):
     bf_pd_df = bf_df.to_pandas()

     # Explicitly cast to ArrowDtype for comparison as pandas might default to object
-    pd_expected["list_int_col"] = pd_expected["list_int_col"].astype(pandas.ArrowDtype(pa.list_(pa.int64())))
-    pd_expected["list_str_col"] = pd_expected["list_str_col"].astype(pandas.ArrowDtype(pa.list_(pa.string())))
-    bf_pd_df["list_int_col"] = bf_pd_df["list_int_col"].astype(pandas.ArrowDtype(pa.list_(pa.int64())))
-    bf_pd_df["list_str_col"] = bf_pd_df["list_str_col"].astype(pandas.ArrowDtype(pa.list_(pa.string())))
+    pd_expected["list_int_col"] = pd_expected["list_int_col"].astype(
+        pandas.ArrowDtype(pa.list_(pa.int64()))
+    )
+    pd_expected["list_str_col"] = pd_expected["list_str_col"].astype(
+        pandas.ArrowDtype(pa.list_(pa.string()))
+    )
+    bf_pd_df["list_int_col"] = bf_pd_df["list_int_col"].astype(
+        pandas.ArrowDtype(pa.list_(pa.int64()))
+    )
+    bf_pd_df["list_str_col"] = bf_pd_df["list_str_col"].astype(
+        pandas.ArrowDtype(pa.list_(pa.string()))
+    )

     pandas.testing.assert_frame_equal(bf_pd_df, pd_expected, check_dtype=True)
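
The casts in this hunk exist because pandas otherwise materializes list columns as plain object arrays, and the comparison only lines up once both frames are pinned to the same pyarrow-backed dtype. A minimal standalone illustration of the pattern (not part of the test suite):

import pandas
import pyarrow as pa

s = pandas.Series([[1, 2], None, [3]])  # pandas stores this as object dtype
# Pin the column to a pyarrow-backed list<int64> dtype before comparing frames.
s_arrow = s.astype(pandas.ArrowDtype(pa.list_(pa.int64())))
assert str(s_arrow.dtype) == "list<item: int64>[pyarrow]"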

@@ -214,7 +224,9 @@ def test_read_arrow_engine_streaming(session):
     assert str(bf_df.dtypes["event"]) == "string[pyarrow]"
     pd_expected = arrow_table.to_pandas()
     bf_pd_df = bf_df.to_pandas()
-    pandas.testing.assert_frame_equal(bf_pd_df.astype(pd_expected.dtypes), pd_expected, check_dtype=False)
+    pandas.testing.assert_frame_equal(
+        bf_pd_df.astype(pd_expected.dtypes), pd_expected, check_dtype=False
+    )


 def test_read_arrow_engine_write(session):
@@ -230,7 +242,9 @@ def test_read_arrow_engine_write(session):
     assert str(bf_df.dtypes["status"]) == "string[pyarrow]"
     pd_expected = arrow_table.to_pandas()
     bf_pd_df = bf_df.to_pandas()
-    pandas.testing.assert_frame_equal(bf_pd_df.astype(pd_expected.dtypes), pd_expected, check_dtype=False)
+    pandas.testing.assert_frame_equal(
+        bf_pd_df.astype(pd_expected.dtypes), pd_expected, check_dtype=False
+    )


 def test_read_arrow_no_columns_empty_rows(session):
@@ -241,7 +255,14 @@ def test_read_arrow_no_columns_empty_rows(session):


 def test_read_arrow_special_column_names(session):
-    col_names = ["col with space", "col/slash", "col.dot", "col:colon", "col(paren)", "col[bracket]"]
+    col_names = [
+        "col with space",
+        "col/slash",
+        "col.dot",
+        "col:colon",
+        "col(paren)",
+        "col[bracket]",
+    ]
     # BigQuery normalizes column names by replacing special characters with underscores.
     # Exception: dots are not allowed and usually cause errors or are handled by specific client libraries.
     # BigFrames aims to map to valid BigQuery column names.
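
The comments above describe the normalization these tests expect. A hypothetical helper capturing that mapping (not BigFrames' actual implementation, which may treat dots differently, as the comments note):

import re

def to_bq_column_name(name: str) -> str:
    # Replace every character outside [A-Za-z0-9_] with an underscore,
    # matching the expected_bq_names in the next hunk.
    return re.sub(r"[^0-9A-Za-z_]", "_", name)

assert to_bq_column_name("col with space") == "col_with_space"
assert to_bq_column_name("col[bracket]") == "col_bracket_"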
@@ -267,10 +288,10 @@ def test_read_arrow_special_column_names(session):
     expected_bq_names = [
         "col_with_space",
         "col_slash",
-        "col_dot", # BQ might error on dots or replace them. Let's assume replacement for now.
+        "col_dot",  # BQ might error on dots or replace them. Let's assume replacement for now.
         "col_colon",
         "col_paren_",
-        "col_bracket_"
+        "col_bracket_",
     ]
     # Update: Based on typical BigQuery behavior, dots are not allowed.
     # However, BigFrames might handle this by replacing dots with underscores before sending to BQ,
@@ -292,8 +313,8 @@ def test_read_arrow_special_column_names(session):
     # And arrow_table.to_pandas() will use the original names.
     # We then rename bf_pd_df columns to match pd_expected for data comparison.

-    pd_expected = arrow_table.to_pandas() # Has original names
-    bf_pd_df = bf_df.to_pandas() # Has BQ/BF names
+    pd_expected = arrow_table.to_pandas()  # Has original names
+    bf_pd_df = bf_df.to_pandas()  # Has BQ/BF names

     assert len(bf_pd_df.columns) == len(pd_expected.columns)
