diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 13db6823c1..ab2ac226a7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1106,15 +1106,6 @@ def _read_csv_w_bigquery_engine( if dtype is not None and not utils.is_dict_like(dtype): raise ValueError("dtype should be a dict-like object.") - if names is not None: - if len(names) != len(set(names)): - raise ValueError("Duplicated names are not allowed.") - if not ( - bigframes.core.utils.is_list_like(names, allow_sets=False) - or isinstance(names, abc.KeysView) - ): - raise ValueError("Names should be an ordered collection.") - if index_col is True: raise ValueError("The value of index_col couldn't be 'True'") @@ -1143,16 +1134,31 @@ def _read_csv_w_bigquery_engine( job_config = bigquery.LoadJobConfig() job_config.source_format = bigquery.SourceFormat.CSV - job_config.autodetect = True job_config.field_delimiter = sep job_config.encoding = encoding job_config.labels = {"bigframes-api": "read_csv"} - # b/409070192: When header > 0, pandas and BigFrames returns different column naming. + if names is not None: + if len(names) != len(set(names)): + raise ValueError("Duplicated names are not allowed.") + if not ( + bigframes.core.utils.is_list_like(names, allow_sets=False) + or isinstance(names, abc.KeysView) + ): + raise ValueError("Names should be an ordered collection.") + else: + # names = None + names = self._try_to_get_names_from_pandas_read_csv( + filepath_or_buffer, + header=header, + index_col=index_col, + usecols=usecols, + ) # We want to match pandas behavior. If header is 0, no rows should be skipped, so we # do not need to set `skip_leading_rows`. If header is None, then there is no header. # Setting skip_leading_rows to 0 does that. If header=N and N>0, we want to skip N rows. 
+ job_config.autodetect = True if header is None: job_config.skip_leading_rows = 0 elif header > 0: @@ -1173,6 +1179,52 @@ def _read_csv_w_bigquery_engine( df[column] = df[column].astype(dtype) return df + def _try_to_get_names_from_pandas_read_csv( + self, + filepath_or_buffer, + *, + header, + index_col, + usecols, + ) -> Optional[Sequence[str]]: + """Attempts to infer column names from a CSV file using pandas. + + This method uses `pandas.read_csv` to preview the column names. It is + necessary because BigQuery's CSV loading behavior can differ from pandas: + - b/409070192: When `header > 0`, BigQuery generates names based on data type + (e.g., bool_field_0, string_field_1, etc.), while pandas uses the literal + content of the specified header row for column names. + - b/324483018: BigQuery engine can throw an error when a column header in the CSV + file is not a valid BigQuery identifier (e.g., contains spaces, special characters) + """ + # Cannot infer names if index_col and usecols are specified. + if not (index_col is None or index_col == ()) or usecols is not None: + return None + + try: + # Read only the header by setting nrows=0 for efficiency. + pandas_df = pandas.read_csv( + filepath_or_buffer, + header=header, + # Only get the header. + nrows=0, + ) + columns = pandas_df.columns.tolist() + return columns if columns else None + except Exception: + msg = bfe.format_message( + "Could not infer column names with pandas; " + "The BigQuery backend will attempt to infer them instead. " + ) + warnings.warn(msg, UserWarning) + finally: + if not isinstance(filepath_or_buffer, str): + # If the buffer is a BytesIO, we need to reset it to the beginning + # so that it can be read again later.
+ filepath_or_buffer.seek(0) + + return None + def read_pickle( self, filepath_or_buffer: FilePath | ReadPickleBuffer, diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 809d08c6c1..3a0ffd9b81 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -1568,14 +1568,12 @@ def test_read_csv_for_gcs_file_w_header(session, df_and_gcs_csv, header): # Compares results for pandas and bigframes engines scalars_df, path = df_and_gcs_csv bf_df = session.read_csv(path, engine="bigquery", index_col=False, header=header) - pd_df = session.read_csv( - path, index_col=False, header=header, dtype=scalars_df.dtypes.to_dict() - ) - # b/408461403: workaround the issue where the slice does not work for DataFrame. - expected_df = session.read_pandas(scalars_df.to_pandas()[header:]) + dtypes = {col: dtype for col, dtype in zip(bf_df.columns, bf_df.dtypes.to_list())} + pd_df = session.read_csv(path, index_col=False, header=header, dtype=dtypes) - assert pd_df.shape[0] == expected_df.shape[0] + expected_df = scalars_df[header:] + assert pd_df.shape[0] == scalars_df[header:].shape[0] assert bf_df.shape[0] == pd_df.shape[0] # We use a default index because of index_col=False, so the previous index @@ -1583,17 +1581,11 @@ def test_read_csv_for_gcs_file_w_header(session, df_and_gcs_csv, header): assert len(pd_df.columns) == len(expected_df.columns) + 1 assert len(bf_df.columns) == len(pd_df.columns) - # When `header > 0`, pandas and BigFrames may handle column naming differently. - # Pandas uses the literal content of the specified header row for column names, - # regardless of what it is. BigQuery, however, might generate default names based - # on data type (e.g.,bool_field_0,string_field_1, etc.). - if header == 0: - # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs - # (b/280889935) or guarantee row ordering. 
- bf_df = bf_df.set_index("rowindex").sort_index() - pd_df = pd_df.set_index("rowindex") - pd.testing.assert_frame_equal(bf_df.to_pandas(), scalars_df.to_pandas()) - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) + # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs + # (b/280889935) or guarantee row ordering. + bf_df = bf_df.set_index(pd_df.columns[0]).sort_index() + pd_df = pd_df.set_index(pd_df.columns[0]) + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas()) def test_read_csv_w_usecols(session, df_and_local_csv):