Commit 11daddb

apply the workaround to the I/O layer

1 parent 233e857 commit 11daddb

File tree: 3 files changed (+60, -6150 lines)
bigframes/core/compile/polars/compiler.py
Lines changed: 35 additions & 7 deletions

@@ -45,13 +45,13 @@
 polars_installed = True
 if TYPE_CHECKING:
     import polars as pl
+    import pyarrow as pa
 else:
     try:
         import bigframes._importing

-        # Use import_polars() instead of importing directly so that we check
-        # the version numbers.
         pl = bigframes._importing.import_polars()
+        import pyarrow as pa
     except Exception:
         polars_installed = False

@@ -409,11 +409,13 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:

     @compile_op.register(json_ops.ToJSONString)
     def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:
-        return input.str.json_decode(pl.String())
+        # Convert JSON to string representation
+        return input.cast(pl.String())

     @compile_op.register(json_ops.ParseJSON)
     def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:
-        return input.str.json_decode(pl.String())
+        # Parse string as JSON - this should decode, not encode
+        return input.str.json_decode()

     @compile_op.register(json_ops.JSONExtract)
     def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:

@@ -599,9 +601,35 @@ def compile_readlocal(self, node: nodes.ReadLocalNode):
             scan_item.source_id: scan_item.id.sql
             for scan_item in node.scan_list.items
         }
-        lazy_frame = cast(
-            pl.DataFrame, pl.from_arrow(node.local_data_source.data)
-        ).lazy()
+
+        # Workaround for PyArrow bug https://github.com/apache/arrow/issues/45262
+        # Convert JSON columns to strings before Polars processing
+        arrow_data = node.local_data_source.data
+        schema = arrow_data.schema
+
+        # Check if any columns are JSON type
+        json_field_indices = [
+            i
+            for i, field in enumerate(schema)
+            if pa.types.is_extension_type(field.type)
+            and field.type.extension_name == "google:sqlType:json"
+        ]
+
+        if json_field_indices:
+            # Convert JSON columns to string columns
+            new_arrays = []
+            new_fields = []
+            for i, field in enumerate(schema):
+                if i in json_field_indices:
+                    # Cast JSON to string
+                    new_arrays.append(arrow_data.column(i).cast(pa.string()))
+                    new_fields.append(pa.field(field.name, pa.string()))
+                else:
+                    new_arrays.append(arrow_data.column(i))
+                    new_fields.append(field)
+            arrow_data = pa.table(new_arrays, schema=pa.schema(new_fields))
+
+        lazy_frame = cast(pl.DataFrame, pl.from_arrow(arrow_data)).lazy()
         lazy_frame = lazy_frame.select(cols_to_read.keys()).rename(cols_to_read)
         if node.offsets_col:
             lazy_frame = lazy_frame.with_columns(

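The compile_readlocal change above is the core of the workaround: strip the "google:sqlType:json" extension type off any Arrow columns before Polars ever sees the table. Below is a minimal standalone sketch of that idea; json_columns_to_strings is an illustrative helper name, not part of the bigframes API, and it assumes (as the commit's cast implies) that the JSON extension type is backed by string storage.

import polars as pl
import pyarrow as pa


def json_columns_to_strings(table: pa.Table) -> pa.Table:
    """Return a copy of `table` with JSON extension columns cast to plain strings."""
    arrays = []
    fields = []
    for i, field in enumerate(table.schema):
        if (
            pa.types.is_extension_type(field.type)
            and field.type.extension_name == "google:sqlType:json"
        ):
            # The extension storage holds serialized JSON text, so casting to
            # pa.string() keeps the values while dropping the extension type.
            arrays.append(table.column(i).cast(pa.string()))
            fields.append(pa.field(field.name, pa.string()))
        else:
            arrays.append(table.column(i))
            fields.append(field)
    return pa.table(arrays, schema=pa.schema(fields))


# Tables with no JSON columns pass through unchanged.
plain = pa.table({"id": [1, 2], "payload": ['{"a": 1}', '{"b": 2}']})
lazy = pl.from_arrow(json_columns_to_strings(plain)).lazy()

Casting at the Arrow boundary keeps the rest of the Polars plan untouched, since the downstream expressions only ever see ordinary string columns instead of the extension type that triggers the upstream bug.
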
bigframes/dataframe.py
Lines changed: 16 additions & 1 deletion

@@ -1965,7 +1965,22 @@ def _to_pandas_batches(
         *,
         allow_large_results: Optional[bool] = None,
     ) -> blocks.PandasBatches:
-        return self._block.to_pandas_batches(
+        # Workaround for PyArrow bug https://github.com/apache/arrow/issues/45262
+        # JSON columns are not supported in to_pandas_batches
+        json_cols = [
+            str(col_name)  # Cast to string
+            for col_name, dtype in self.dtypes.items()
+            if bigframes.dtypes.contains_db_dtypes_json_dtype(dtype)
+        ]
+
+        df = self
+        if json_cols:
+            # Convert JSON columns to strings before materialization
+            df = df.copy()
+            for col in json_cols:
+                df[col] = df[col].astype("string")
+
+        return df._block.to_pandas_batches(
             page_size=page_size,
             max_results=max_results,
             allow_large_results=allow_large_results,

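The dataframe.py change applies the same idea one layer up: detect JSON-typed columns from the dtypes and cast them to strings on a copy before materializing batches. A rough pandas analogue is sketched below; looks_like_json_dtype and strings_for_json_columns are illustrative stand-ins, not the actual bigframes.dtypes helpers.

import pandas as pd


def looks_like_json_dtype(dtype) -> bool:
    # Illustrative stand-in for bigframes.dtypes.contains_db_dtypes_json_dtype:
    # here we simply match on the dtype's name.
    return "json" in str(dtype).lower()


def strings_for_json_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return `df` with JSON-typed columns cast to strings (on a copy)."""
    json_cols = [
        str(col_name)
        for col_name, dtype in df.dtypes.items()
        if looks_like_json_dtype(dtype)
    ]
    if not json_cols:
        return df
    out = df.copy()
    for col in json_cols:
        out[col] = out[col].astype("string")
    return out


frame = pd.DataFrame({"id": [1, 2], "payload": ['{"a": 1}', '{"b": 2}']})
batch_ready = strings_for_json_columns(frame)  # object dtype here, so unchanged

Because the conversion happens before _block.to_pandas_batches is called, the batches that come back contain ordinary string columns rather than the JSON extension type that to_pandas_batches cannot handle.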