diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index a99366ad4c..958113dda3 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -30,6 +30,7 @@ import pyarrow as pa import pyarrow.parquet # type: ignore +from bigframes.core import pyarrow_utils import bigframes.core.schema as schemata import bigframes.dtypes @@ -113,7 +114,9 @@ def to_arrow( schema = self.data.schema if duration_type == "int": schema = _schema_durations_to_ints(schema) - batches = map(functools.partial(_cast_pa_batch, schema=schema), batches) + batches = map( + functools.partial(pyarrow_utils.cast_batch, schema=schema), batches + ) if offsets_col is not None: return schema.append(pa.field(offsets_col, pa.int64())), _append_offsets( @@ -468,14 +471,6 @@ def _schema_durations_to_ints(schema: pa.Schema) -> pa.Schema: ) -# TODO: Use RecordBatch.cast once min pyarrow>=16.0 -def _cast_pa_batch(batch: pa.RecordBatch, schema: pa.Schema) -> pa.RecordBatch: - return pa.record_batch( - [arr.cast(type) for arr, type in zip(batch.columns, schema.types)], - schema=schema, - ) - - def _pairwise(iterable): do_yield = False a = None diff --git a/bigframes/core/pyarrow_utils.py b/bigframes/core/pyarrow_utils.py index 4196e68304..b9dc2ea2b3 100644 --- a/bigframes/core/pyarrow_utils.py +++ b/bigframes/core/pyarrow_utils.py @@ -74,6 +74,16 @@ def chunk_by_row_count( yield buffer.take_as_batches(len(buffer)) +def cast_batch(batch: pa.RecordBatch, schema: pa.Schema) -> pa.RecordBatch: + if batch.schema == schema: + return batch + # TODO: Use RecordBatch.cast once min pyarrow>=16.0 + return pa.record_batch( + [arr.cast(type) for arr, type in zip(batch.columns, schema.types)], + schema=schema, + ) + + def truncate_pyarrow_iterable( batches: Iterable[pa.RecordBatch], max_results: int ) -> Iterator[pa.RecordBatch]: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index b0a31595e5..20f2f5ee12 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -247,6 +247,7 @@ class SimpleDtypeInfo: "decimal128(38, 9)[pyarrow]", "decimal256(76, 38)[pyarrow]", "binary[pyarrow]", + "duration[us][pyarrow]", ] DTYPE_STRINGS = typing.get_args(DtypeString) @@ -421,6 +422,8 @@ def is_bool_coercable(type_: ExpressionType) -> bool: # special case - both "Int64" and "int64[pyarrow]" are accepted BIGFRAMES_STRING_TO_BIGFRAMES["int64[pyarrow]"] = INT_DTYPE +BIGFRAMES_STRING_TO_BIGFRAMES["duration[us][pyarrow]"] = TIMEDELTA_DTYPE + # For the purposes of dataframe.memory_usage DTYPE_BYTE_SIZES = { type_info.dtype: type_info.logical_bytes for type_info in SIMPLE_TYPES diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index c913f39791..cc8f086f9f 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -50,6 +50,7 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]: result_rows = 0 for batch in self._arrow_batches: + batch = pyarrow_utils.cast_batch(batch, self.schema.to_pyarrow()) result_rows += batch.num_rows maximum_result_rows = bigframes.options.compute.maximum_result_rows diff --git a/bigframes/testing/utils.py b/bigframes/testing/utils.py index ecf9ae00f8..c3a8008465 100644 --- a/bigframes/testing/utils.py +++ b/bigframes/testing/utils.py @@ -185,6 +185,16 @@ def convert_pandas_dtypes(df: pd.DataFrame, bytes_col: bool): "timestamp_col" ] + if not isinstance(df["duration_col"].dtype, pd.ArrowDtype): + df["duration_col"] = df["duration_col"].astype(pd.Int64Dtype()) + arrow_table = pa.Table.from_pandas( + pd.DataFrame(df, columns=["duration_col"]), + 
schema=pa.schema([("duration_col", pa.duration("us"))]), + ) + df["duration_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)[ + "duration_col" + ] + # Convert geography types columns. if "geography_col" in df.columns: df["geography_col"] = df["geography_col"].astype( diff --git a/tests/data/scalars.jsonl b/tests/data/scalars.jsonl index 2e5a1499b9..6e591cfa72 100644 --- a/tests/data/scalars.jsonl +++ b/tests/data/scalars.jsonl @@ -1,9 +1,9 @@ -{"bool_col": true, "bytes_col": "SGVsbG8sIFdvcmxkIQ==", "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "123456789", "int64_too": "0", "numeric_col": "1.23456789", "float64_col": "1.25", "rowindex": 0, "rowindex_2": 0, "string_col": "Hello, World!", "time_col": "11:41:43.076160", "timestamp_col": "2021-07-21T17:43:43.945289Z"} -{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "1991-02-03", "datetime_col": "1991-01-02 03:45:06", "geography_col": "POINT(-71.104 42.315)", "int64_col": "-987654321", "int64_too": "1", "numeric_col": "1.23456789", "float64_col": "2.51", "rowindex": 1, "rowindex_2": 1, "string_col": "こんにちは", "time_col": "11:14:34.701606", "timestamp_col": "2021-07-21T17:43:43.945289Z"} -{"bool_col": true, "bytes_col": "wqFIb2xhIE11bmRvIQ==", "date_col": "2023-03-01", "datetime_col": "2023-03-01 10:55:13", "geography_col": "POINT(-0.124474760143016 51.5007826749545)", "int64_col": "314159", "int64_too": "0", "numeric_col": "101.1010101", "float64_col": "2.5e10", "rowindex": 2, "rowindex_2": 2, "string_col": " ¡Hola Mundo! ", "time_col": "23:59:59.999999", "timestamp_col": "2023-03-01T10:55:13.250125Z"} -{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "int64_too": "1", "numeric_col": null, "float64_col": null, "rowindex": 3, "rowindex_2": 3, "string_col": null, "time_col": null, "timestamp_col": null} -{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "2021-07-21", "datetime_col": null, "geography_col": null, "int64_col": "-234892", "int64_too": "-2345", "numeric_col": null, "float64_col": null, "rowindex": 4, "rowindex_2": 4, "string_col": "Hello, World!", "time_col": null, "timestamp_col": null} -{"bool_col": false, "bytes_col": "R8O8dGVuIFRhZw==", "date_col": "1980-03-14", "datetime_col": "1980-03-14 15:16:17", "geography_col": null, "int64_col": "55555", "int64_too": "0", "numeric_col": "5.555555", "float64_col": "555.555", "rowindex": 5, "rowindex_2": 5, "string_col": "Güten Tag!", "time_col": "15:16:17.181921", "timestamp_col": "1980-03-14T15:16:17.181921Z"} -{"bool_col": true, "bytes_col": "SGVsbG8JQmlnRnJhbWVzIQc=", "date_col": "2023-05-23", "datetime_col": "2023-05-23 11:37:01", "geography_col": "LINESTRING(-0.127959 51.507728, -0.127026 51.507473)", "int64_col": "101202303", "int64_too": "2", "numeric_col": "-10.090807", "float64_col": "-123.456", "rowindex": 6, "rowindex_2": 6, "string_col": "capitalize, This ", "time_col": "01:02:03.456789", "timestamp_col": "2023-05-23T11:42:55.000001Z"} -{"bool_col": true, "bytes_col": null, "date_col": "2038-01-20", "datetime_col": "2038-01-19 03:14:08", "geography_col": null, "int64_col": "-214748367", "int64_too": "2", "numeric_col": "11111111.1", "float64_col": "42.42", "rowindex": 7, "rowindex_2": 7, "string_col": " سلام", "time_col": "12:00:00.000001", "timestamp_col": "2038-01-19T03:14:17.999999Z"} -{"bool_col": false, "bytes_col": null, "date_col": null, "datetime_col": null, 
"geography_col": null, "int64_col": "2", "int64_too": "1", "numeric_col": null, "float64_col": "6.87", "rowindex": 8, "rowindex_2": 8, "string_col": "T", "time_col": null, "timestamp_col": null} \ No newline at end of file +{"bool_col": true, "bytes_col": "SGVsbG8sIFdvcmxkIQ==", "date_col": "2021-07-21", "datetime_col": "2021-07-21 11:39:45", "geography_col": "POINT(-122.0838511 37.3860517)", "int64_col": "123456789", "int64_too": "0", "numeric_col": "1.23456789", "float64_col": "1.25", "rowindex": 0, "rowindex_2": 0, "string_col": "Hello, World!", "time_col": "11:41:43.076160", "timestamp_col": "2021-07-21T17:43:43.945289Z", "duration_col": 4} +{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "1991-02-03", "datetime_col": "1991-01-02 03:45:06", "geography_col": "POINT(-71.104 42.315)", "int64_col": "-987654321", "int64_too": "1", "numeric_col": "1.23456789", "float64_col": "2.51", "rowindex": 1, "rowindex_2": 1, "string_col": "こんにちは", "time_col": "11:14:34.701606", "timestamp_col": "2021-07-21T17:43:43.945289Z", "duration_col": -1000000} +{"bool_col": true, "bytes_col": "wqFIb2xhIE11bmRvIQ==", "date_col": "2023-03-01", "datetime_col": "2023-03-01 10:55:13", "geography_col": "POINT(-0.124474760143016 51.5007826749545)", "int64_col": "314159", "int64_too": "0", "numeric_col": "101.1010101", "float64_col": "2.5e10", "rowindex": 2, "rowindex_2": 2, "string_col": " ¡Hola Mundo! ", "time_col": "23:59:59.999999", "timestamp_col": "2023-03-01T10:55:13.250125Z", "duration_col": 0} +{"bool_col": null, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": null, "int64_too": "1", "numeric_col": null, "float64_col": null, "rowindex": 3, "rowindex_2": 3, "string_col": null, "time_col": null, "timestamp_col": null, "duration_col": null} +{"bool_col": false, "bytes_col": "44GT44KT44Gr44Gh44Gv", "date_col": "2021-07-21", "datetime_col": null, "geography_col": null, "int64_col": "-234892", "int64_too": "-2345", "numeric_col": null, "float64_col": null, "rowindex": 4, "rowindex_2": 4, "string_col": "Hello, World!", "time_col": null, "timestamp_col": null, "duration_col": 31540000000000} +{"bool_col": false, "bytes_col": "R8O8dGVuIFRhZw==", "date_col": "1980-03-14", "datetime_col": "1980-03-14 15:16:17", "geography_col": null, "int64_col": "55555", "int64_too": "0", "numeric_col": "5.555555", "float64_col": "555.555", "rowindex": 5, "rowindex_2": 5, "string_col": "Güten Tag!", "time_col": "15:16:17.181921", "timestamp_col": "1980-03-14T15:16:17.181921Z", "duration_col": 4} +{"bool_col": true, "bytes_col": "SGVsbG8JQmlnRnJhbWVzIQc=", "date_col": "2023-05-23", "datetime_col": "2023-05-23 11:37:01", "geography_col": "LINESTRING(-0.127959 51.507728, -0.127026 51.507473)", "int64_col": "101202303", "int64_too": "2", "numeric_col": "-10.090807", "float64_col": "-123.456", "rowindex": 6, "rowindex_2": 6, "string_col": "capitalize, This ", "time_col": "01:02:03.456789", "timestamp_col": "2023-05-23T11:42:55.000001Z", "duration_col": null} +{"bool_col": true, "bytes_col": null, "date_col": "2038-01-20", "datetime_col": "2038-01-19 03:14:08", "geography_col": null, "int64_col": "-214748367", "int64_too": "2", "numeric_col": "11111111.1", "float64_col": "42.42", "rowindex": 7, "rowindex_2": 7, "string_col": " سلام", "time_col": "12:00:00.000001", "timestamp_col": "2038-01-19T03:14:17.999999Z", "duration_col": 4} +{"bool_col": false, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": "2", "int64_too": "1", 
"numeric_col": null, "float64_col": "6.87", "rowindex": 8, "rowindex_2": 8, "string_col": "T", "time_col": null, "timestamp_col": null, "duration_col": 432000000000} diff --git a/tests/data/scalars_schema.json b/tests/data/scalars_schema.json index 1f5d8cdb65..8be4e95228 100644 --- a/tests/data/scalars_schema.json +++ b/tests/data/scalars_schema.json @@ -71,5 +71,11 @@ "mode": "NULLABLE", "name": "timestamp_col", "type": "TIMESTAMP" + }, + { + "mode": "NULLABLE", + "name": "duration_col", + "type": "INTEGER", + "description": "#microseconds" } ] diff --git a/tests/system/small/pandas/core/methods/test_describe.py b/tests/system/small/pandas/core/methods/test_describe.py index dfc7c3fb23..5971e47997 100644 --- a/tests/system/small/pandas/core/methods/test_describe.py +++ b/tests/system/small/pandas/core/methods/test_describe.py @@ -21,7 +21,13 @@ def test_df_describe_non_temporal(scalars_dfs): pytest.importorskip("pandas", minversion="2.0.0") scalars_df, scalars_pandas_df = scalars_dfs # excluding temporal columns here because BigFrames cannot perform percentiles operations on them - unsupported_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"] + unsupported_columns = [ + "datetime_col", + "timestamp_col", + "time_col", + "date_col", + "duration_col", + ] bf_result = scalars_df.drop(columns=unsupported_columns).describe().to_pandas() modified_pd_df = scalars_pandas_df.drop(columns=unsupported_columns) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e8d156538f..5045e2268f 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -553,7 +553,7 @@ def test_df_info(scalars_dfs): expected = ( "\n" "Index: 9 entries, 0 to 8\n" - "Data columns (total 13 columns):\n" + "Data columns (total 14 columns):\n" " # Column Non-Null Count Dtype\n" "--- ------------- ---------------- ------------------------------\n" " 0 bool_col 8 non-null boolean\n" @@ -569,8 +569,9 @@ def test_df_info(scalars_dfs): " 10 string_col 8 non-null string\n" " 11 time_col 6 non-null time64[us][pyarrow]\n" " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" - "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" - "memory usage: 1269 bytes\n" + " 13 duration_col 7 non-null duration[us][pyarrow]\n" + "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), duration[us][pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" + "memory usage: 1341 bytes\n" ) scalars_df, _ = scalars_dfs @@ -1694,6 +1695,7 @@ def test_get_dtypes(scalars_df_default_index): "string_col": pd.StringDtype(storage="pyarrow"), "time_col": pd.ArrowDtype(pa.time64("us")), "timestamp_col": pd.ArrowDtype(pa.timestamp("us", tz="UTC")), + "duration_col": pd.ArrowDtype(pa.duration("us")), } pd.testing.assert_series_equal( dtypes, @@ -4771,6 +4773,9 @@ def test_df_to_json_local_str(scalars_df_index, scalars_pandas_df_index): def test_df_to_json_local_file(scalars_df_index, scalars_pandas_df_index): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") + # duration not fully supported at pandas level + scalars_df_index = scalars_df_index.drop(columns="duration_col") + 
scalars_pandas_df_index = scalars_pandas_df_index.drop(columns="duration_col") with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: scalars_df_index.to_json(bf_result_file, orient="table") # default_handler for arrow types that have no default conversion @@ -4882,6 +4887,7 @@ def test_df_to_orc(scalars_df_index, scalars_pandas_df_index): "time_col", "timestamp_col", "geography_col", + "duration_col", ] bf_result_file = tempfile.TemporaryFile() diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index afe3b53d6d..ef6e25a95c 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -55,7 +55,7 @@ def test_sql_executes(scalars_df_default_index, bigquery_client): """ # Do some operations to make for more complex SQL. df = ( - scalars_df_default_index.drop(columns=["geography_col"]) + scalars_df_default_index.drop(columns=["geography_col", "duration_col"]) .groupby("string_col") .max() ) @@ -87,7 +87,7 @@ def test_sql_executes_and_includes_named_index( """ # Do some operations to make for more complex SQL. df = ( - scalars_df_default_index.drop(columns=["geography_col"]) + scalars_df_default_index.drop(columns=["geography_col", "duration_col"]) .groupby("string_col") .max() ) @@ -120,7 +120,7 @@ def test_sql_executes_and_includes_named_multiindex( """ # Do some operations to make for more complex SQL. df = ( - scalars_df_default_index.drop(columns=["geography_col"]) + scalars_df_default_index.drop(columns=["geography_col", "duration_col"]) .groupby(["string_col", "bool_col"]) .max() ) @@ -999,14 +999,16 @@ def test_to_sql_query_unnamed_index_included( scalars_df_default_index: bpd.DataFrame, scalars_pandas_df_default_index: pd.DataFrame, ): - bf_df = scalars_df_default_index.reset_index(drop=True) + bf_df = scalars_df_default_index.reset_index(drop=True).drop(columns="duration_col") sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=True) assert len(idx_labels) == 1 assert len(idx_ids) == 1 assert idx_labels[0] is None assert idx_ids[0].startswith("bigframes") - pd_df = scalars_pandas_df_default_index.reset_index(drop=True) + pd_df = scalars_pandas_df_default_index.reset_index(drop=True).drop( + columns="duration_col" + ) roundtrip = session.read_gbq(sql, index_col=idx_ids) roundtrip.index.names = [None] utils.assert_pandas_df_equal(roundtrip.to_pandas(), pd_df, check_index_type=False) @@ -1017,14 +1019,18 @@ def test_to_sql_query_named_index_included( scalars_df_default_index: bpd.DataFrame, scalars_pandas_df_default_index: pd.DataFrame, ): - bf_df = scalars_df_default_index.set_index("rowindex_2", drop=True) + bf_df = scalars_df_default_index.set_index("rowindex_2", drop=True).drop( + columns="duration_col" + ) sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=True) assert len(idx_labels) == 1 assert len(idx_ids) == 1 assert idx_labels[0] == "rowindex_2" assert idx_ids[0] == "rowindex_2" - pd_df = scalars_pandas_df_default_index.set_index("rowindex_2", drop=True) + pd_df = scalars_pandas_df_default_index.set_index("rowindex_2", drop=True).drop( + columns="duration_col" + ) roundtrip = session.read_gbq(sql, index_col=idx_ids) utils.assert_pandas_df_equal(roundtrip.to_pandas(), pd_df) @@ -1034,12 +1040,14 @@ def test_to_sql_query_unnamed_index_excluded( scalars_df_default_index: bpd.DataFrame, scalars_pandas_df_default_index: pd.DataFrame, ): - bf_df = scalars_df_default_index.reset_index(drop=True) + bf_df = 
scalars_df_default_index.reset_index(drop=True).drop(columns="duration_col") sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=False) assert len(idx_labels) == 0 assert len(idx_ids) == 0 - pd_df = scalars_pandas_df_default_index.reset_index(drop=True) + pd_df = scalars_pandas_df_default_index.reset_index(drop=True).drop( + columns="duration_col" + ) roundtrip = session.read_gbq(sql) utils.assert_pandas_df_equal( roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True @@ -1051,14 +1059,18 @@ def test_to_sql_query_named_index_excluded( scalars_df_default_index: bpd.DataFrame, scalars_pandas_df_default_index: pd.DataFrame, ): - bf_df = scalars_df_default_index.set_index("rowindex_2", drop=True) + bf_df = scalars_df_default_index.set_index("rowindex_2", drop=True).drop( + columns="duration_col" + ) sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=False) assert len(idx_labels) == 0 assert len(idx_ids) == 0 - pd_df = scalars_pandas_df_default_index.set_index( - "rowindex_2", drop=True - ).reset_index(drop=True) + pd_df = ( + scalars_pandas_df_default_index.set_index("rowindex_2", drop=True) + .reset_index(drop=True) + .drop(columns="duration_col") + ) roundtrip = session.read_gbq(sql) utils.assert_pandas_df_equal( roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 809d08c6c1..4bb1c6589a 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -54,7 +54,13 @@ def df_and_local_csv(scalars_df_index): # The auto detects of BigQuery load job have restrictions to detect the bytes, # datetime, numeric and geometry types, so they're skipped here. - drop_columns = ["bytes_col", "datetime_col", "numeric_col", "geography_col"] + drop_columns = [ + "bytes_col", + "datetime_col", + "numeric_col", + "geography_col", + "duration_col", + ] scalars_df_index = scalars_df_index.drop(columns=drop_columns) with tempfile.TemporaryDirectory() as dir: @@ -68,7 +74,13 @@ def df_and_local_csv(scalars_df_index): def df_and_gcs_csv(scalars_df_index, gcs_folder): # The auto detects of BigQuery load job have restrictions to detect the bytes, # datetime, numeric and geometry types, so they're skipped here. - drop_columns = ["bytes_col", "datetime_col", "numeric_col", "geography_col"] + drop_columns = [ + "bytes_col", + "datetime_col", + "numeric_col", + "geography_col", + "duration_col", + ] scalars_df_index = scalars_df_index.drop(columns=drop_columns) path = gcs_folder + "test_read_csv_w_gcs_csv*.csv" @@ -1808,6 +1820,7 @@ def test_read_parquet_gcs( df_out = df_out.assign( datetime_col=df_out["datetime_col"].astype("timestamp[us][pyarrow]"), timestamp_col=df_out["timestamp_col"].astype("timestamp[us, tz=UTC][pyarrow]"), + duration_col=df_out["duration_col"].astype("duration[us][pyarrow]"), ) # Make sure we actually have at least some values before comparing. @@ -1856,7 +1869,8 @@ def test_read_parquet_gcs_compressed( # DATETIME gets loaded as TIMESTAMP in parquet. See: # https://cloud.google.com/bigquery/docs/exporting-data#parquet_export_details df_out = df_out.assign( - datetime_col=df_out["datetime_col"].astype("timestamp[us][pyarrow]") + datetime_col=df_out["datetime_col"].astype("timestamp[us][pyarrow]"), + duration_col=df_out["duration_col"].astype("duration[us][pyarrow]"), ) # Make sure we actually have at least some values before comparing. 
@@ -1914,9 +1928,23 @@ def test_read_json_gcs_bq_engine(session, scalars_dfs, gcs_folder): # The auto detects of BigQuery load job have restrictions to detect the bytes, # datetime, numeric and geometry types, so they're skipped here. - df = df.drop(columns=["bytes_col", "datetime_col", "numeric_col", "geography_col"]) + df = df.drop( + columns=[ + "bytes_col", + "datetime_col", + "numeric_col", + "geography_col", + "duration_col", + ] + ) scalars_df = scalars_df.drop( - columns=["bytes_col", "datetime_col", "numeric_col", "geography_col"] + columns=[ + "bytes_col", + "datetime_col", + "numeric_col", + "geography_col", + "duration_col", + ] ) assert df.shape[0] == scalars_df.shape[0] pd.testing.assert_series_equal( @@ -1949,8 +1977,10 @@ def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder): # The auto detects of BigQuery load job have restrictions to detect the bytes, # numeric and geometry types, so they're skipped here. - df = df.drop(columns=["bytes_col", "numeric_col", "geography_col"]) - scalars_df = scalars_df.drop(columns=["bytes_col", "numeric_col", "geography_col"]) + df = df.drop(columns=["bytes_col", "numeric_col", "geography_col", "duration_col"]) + scalars_df = scalars_df.drop( + columns=["bytes_col", "numeric_col", "geography_col", "duration_col"] + ) # pandas read_json does not respect the dtype overrides for these columns df = df.drop(columns=["date_col", "datetime_col", "time_col"]) diff --git a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py index 7307fd9b4e..6f8a2050e5 100644 --- a/tests/unit/core/compile/sqlglot/test_compile_readlocal.py +++ b/tests/unit/core/compile/sqlglot/test_compile_readlocal.py @@ -24,6 +24,8 @@ def test_compile_readlocal( scalar_types_pandas_df: pd.DataFrame, compiler_session: bigframes.Session, snapshot ): + # Durations not yet supported + scalar_types_pandas_df = scalar_types_pandas_df.drop(["duration_col"], axis=1) bf_df = bpd.DataFrame(scalar_types_pandas_df, session=compiler_session) snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py index f7f0cc80bb..467cf7ce3d 100644 --- a/tests/unit/test_dataframe_polars.py +++ b/tests/unit/test_dataframe_polars.py @@ -508,7 +508,7 @@ def test_df_info(scalars_dfs): expected = ( "\n" "Index: 9 entries, 0 to 8\n" - "Data columns (total 13 columns):\n" + "Data columns (total 14 columns):\n" " # Column Non-Null Count Dtype\n" "--- ------------- ---------------- ------------------------------\n" " 0 bool_col 8 non-null boolean\n" @@ -524,8 +524,9 @@ def test_df_info(scalars_dfs): " 10 string_col 8 non-null string\n" " 11 time_col 6 non-null time64[us][pyarrow]\n" " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" - "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" - "memory usage: 1269 bytes\n" + " 13 duration_col 7 non-null duration[us][pyarrow]\n" + "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), duration[us][pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" + "memory usage: 1341 bytes\n" ) scalars_df, _ = scalars_dfs @@ -4086,6 +4087,9 @@ def test_df_to_json_local_str(scalars_df_index, 
scalars_pandas_df_index): def test_df_to_json_local_file(scalars_df_index, scalars_pandas_df_index): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") + # duration not fully supported at pandas level + scalars_df_index = scalars_df_index.drop(columns="duration_col") + scalars_pandas_df_index = scalars_pandas_df_index.drop(columns="duration_col") with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: scalars_df_index.to_json(bf_result_file, orient="table") # default_handler for arrow types that have no default conversion @@ -4197,6 +4201,7 @@ def test_df_to_orc(scalars_df_index, scalars_pandas_df_index): "time_col", "timestamp_col", "geography_col", + "duration_col", ] bf_result_file = tempfile.TemporaryFile()
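
Editor's note: for reference, the fixture conversion added in `bigframes/testing/utils.py` above boils down to the following pandas/pyarrow round trip. This is a sketch runnable without BigQuery; the values are the microsecond payloads from the new `scalars.jsonl` rows.

```python
import pandas as pd
import pyarrow as pa

# Microsecond payloads from the new duration_col fixture rows above:
# 4 µs, -1 s, ~1 year, and 5 days (432_000_000_000 µs).
micros = pd.Series(
    [4, -1_000_000, 31_540_000_000_000, 432_000_000_000, None],
    dtype="Int64",
    name="duration_col",
)

# Same route testing/utils.py takes: through an Arrow table with an explicit
# duration[us] schema, then back to pandas as an ArrowDtype column.
table = pa.Table.from_pandas(
    micros.to_frame(),
    schema=pa.schema([("duration_col", pa.duration("us"))]),
)
durations = table.to_pandas(types_mapper=pd.ArrowDtype)["duration_col"]

# This is the dtype string that the bigframes/dtypes.py hunk now maps to
# TIMEDELTA_DTYPE.
assert str(durations.dtype) == "duration[us][pyarrow]"
print(durations.iloc[3])  # 5 days, as a pyarrow-backed timedelta
```

On the BigQuery side the column stays INTEGER; the `"description": "#microseconds"` annotation added in `scalars_schema.json` records the convention that those integers are to be read as timedeltas.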