
Commit ffc6892

Fix: Trailing empty cell in date column forces no serial_date_to_datetime (#649)
Fixes the issue where a date column whose first data cell is empty received no type metadata from the Sheets API (that metadata is read from the first row only), so serial date values in later rows were loaded as plain numbers instead of going through `serial_date_to_datetime`. `process_range` now falls back to the resource's explicit column hints when the metadata type is missing.
1 parent 1f1cda2 commit ffc6892
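In outline, the fix resolves a column's type roughly as sketched below. This is a simplified illustration, not the source's exact code path; `resolve_date_type` is a name made up here, while the real logic lives in `_handle_possibly_date_vals` in the diff further down:

```python
# Simplified type-resolution logic; names mirror _handle_possibly_date_vals below.
# type_from_metadata is None when the first cell under the header was empty.
def resolve_date_type(type_from_metadata, col_name, col_hints):
    if type_from_metadata in ("timestamp", "date"):
        return type_from_metadata  # metadata from the first row wins when present
    hint = (col_hints or {}).get(col_name) or {}  # fall back to explicit column hints
    data_type = hint.get("data_type")
    return data_type if data_type in ("timestamp", "date") else None
```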

3 files changed: +99 −11 lines

sources/google_sheets/README.md

Lines changed: 2 additions & 0 deletions
````diff
@@ -35,6 +35,8 @@ When source detects any problems with headers or table layout **it will issue a
 `dlt` normalizer will use first row of data to infer types and will try to coerce following rows - creating variant columns if that is not possible. This is a standard behavior.
 **date time** and **date** types are also recognized and this happens via additional metadata that is retrieved for the first row.
 
+> For **date time** or **date** columns, provide a type hint if the first row is empty to ensure values are serialized correctly.
+
 ## Passing the spreadsheet id/url and explicit range names
 You can use both url of your spreadsheet that you can copy from the browser ie.
 ```
````
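For a consumer of the source, supplying the hint the new README note recommends looks roughly like this. The spreadsheet id, range name, and column name are placeholders; the `apply_hints` call mirrors the one in the test added by this commit:

```python
import dlt
from sources.google_sheets import google_spreadsheet

# Placeholder spreadsheet id and range name, for illustration only.
data = google_spreadsheet(
    "<spreadsheet_id>",
    range_names=["my_range"],
    get_named_ranges=False,
)
# Declare the column type up front so numeric serial values are converted
# to dates even when the first data row has an empty cell in that column.
data.my_range.apply_hints(columns={"Start Date": {"data_type": "date"}})

pipeline = dlt.pipeline(destination="duckdb", dataset_name="sheets_data")
pipeline.run(data)
```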

sources/google_sheets/helpers/data_processing.py

Lines changed: 54 additions & 10 deletions
```diff
@@ -1,12 +1,13 @@
 """This is a helper module that contains functions which validate and process data"""
 
 import re
-from typing import Any, Iterator, List, Tuple, Union, NamedTuple
+from typing import Any, Iterator, List, Tuple, Union, NamedTuple, Optional
 
 import dlt
 from dlt.common import logger, pendulum
-from dlt.common.typing import DictStrAny
+from dlt.common.typing import DictStrAny, TTableHintTemplate
 from dlt.common.data_types import TDataType
+from dlt.common.schema.typing import TTableSchemaColumns
 
 # this string comes before the id
 URL_ID_IDENTIFIER = "d"
@@ -229,8 +230,52 @@ def serial_date_to_datetime(
     return conv_datetime
 
 
+def _handle_possibly_date_vals(
+    val: Union[int, float],
+    col_name: str,
+    type_from_metadata: Optional[TDataType],
+    col_hints: Optional[TTableHintTemplate[TTableSchemaColumns]],
+) -> Any:
+    """
+    Convert numeric spreadsheet values to dates/timestamps when appropriate.
+    1. if val is a boolean, return it unchanged
+    2. if type_from_metadata is "timestamp" or "date", convert using serial_date_to_datetime
+    3. if type_from_metadata is not provided, look up the type hint in col_hints
+       and, if it is "timestamp" or "date", convert using serial_date_to_datetime
+    4. otherwise return the value unchanged
+
+    Args:
+        val (Union[int, float]): numeric cell value
+        col_name (str): name of the header the cell value corresponds to
+        type_from_metadata (Optional[TDataType]): "timestamp", "date" or None, based on the first row under the header
+        col_hints (Optional[TTableHintTemplate[TTableSchemaColumns]]): column hints, possibly with date/timestamp data type hints
+
+    Returns:
+        Any: the converted datetime object, or the original value
+    """
+    # bool is a subclass of int, no additional processing needed
+    if isinstance(val, bool):
+        return val
+    # data type is provided from the metadata
+    if type_from_metadata in ["timestamp", "date"]:
+        return serial_date_to_datetime(val, type_from_metadata)
+    # if no type is provided from the metadata, check col hints
+    if type_from_metadata is None:
+        # we only use non-dynamic hints that are dicts
+        if not col_hints or not isinstance(col_hints, dict):
+            return val
+        col_schema = col_hints.get(col_name, None)
+        if col_schema:
+            data_type = col_schema.get("data_type")
+            if data_type in ["timestamp", "date"]:
+                return serial_date_to_datetime(val, data_type)
+    return val
+
+
 def process_range(
-    sheet_values: List[List[Any]], headers: List[str], data_types: List[TDataType]
+    sheet_values: List[List[Any]],
+    headers: List[str],
+    data_types: List[TDataType],
 ) -> Iterator[DictStrAny]:
     """
     Yields lists of values as dictionaries, converts date times and handles empty rows and cells. Please note:
@@ -241,11 +286,14 @@ def process_range(
     Args:
         sheet_values (List[List[Any]]): range values without the header row
         headers (List[str]): names of the headers
-        data_types: List[TDataType]: "timestamp" and "date" or None for each column
+        data_types (List[TDataType]): "timestamp" and "date" or None for each column
 
     Yields:
         DictStrAny: A dictionary version of the table. It generates a dictionary of the type {header: value} for every row.
     """
+    # col_hints are used in case the data type was not produced due to empty trailing columns
+    current_resource = dlt.current.source().resources[dlt.current.resource_name()]
+    col_hints = current_resource.columns
 
     for row in sheet_values:
         # empty row; skip
@@ -261,12 +309,8 @@ def process_range(
         # handle null values properly. Null cell values are returned as empty strings, this will cause dlt to create new columns and fill them with empty strings
         if val == "":
             fill_val = None
-        elif data_type in ["timestamp", "date"]:
-            # the datetimes are inferred from first row of data. if next rows have inconsistent data types - pass the values to dlt to deal with them
-            if not isinstance(val, (int, float)) or isinstance(val, bool):
-                fill_val = val
-            else:
-                fill_val = serial_date_to_datetime(val, data_type)
+        elif isinstance(val, (int, float)):
+            fill_val = _handle_possibly_date_vals(val, header, data_type, col_hints)
         else:
             fill_val = val
         table_dict[header] = fill_val
```
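For reference, Google Sheets serial numbers count days from the epoch 1899-12-30. Below is a minimal sketch of the whole-day conversion, consistent with the values in the new test; the real `serial_date_to_datetime` also handles time-of-day fractions for timestamps, which this sketch ignores:

```python
from datetime import date, timedelta

SHEETS_EPOCH = date(1899, 12, 30)  # day 0 of the Sheets serial-date system

def serial_to_date(serial: int) -> date:
    """Convert a whole-day serial number to a calendar date."""
    return SHEETS_EPOCH + timedelta(days=serial)

# 46855 is the raw serial the new test expects when no hint is applied:
assert serial_to_date(46855) == date(2028, 4, 12)
```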

tests/google_sheets/test_google_sheets_source.py

Lines changed: 43 additions & 1 deletion
```diff
@@ -1,8 +1,8 @@
-import logging
 from typing import Tuple
 
 import pytest
 import dlt
+from datetime import date  # noqa: I251
 from dlt.common.pipeline import LoadInfo
 from sources.google_sheets import google_spreadsheet
 from tests.utils import (
@@ -37,6 +37,7 @@
     "trailing_empty_cols_1",
     "trailing_empty_cols_2",
     "trailing_empty_cols_3",
+    "trailing_empty_col_date",
 }
 
 SKIPPED_RANGES = {
@@ -73,6 +74,7 @@
     "trailing_empty_cols_1",
     "trailing_empty_cols_2",
     "trailing_empty_cols_3",
+    "trailing_empty_col_date",
 }
 
 
@@ -763,6 +765,46 @@ def test_trailing_empty_cols() -> None:
     assert rows == expected_rows
 
 
+@pytest.mark.parametrize("with_hints", [True, False])
+def test_trailing_empty_col_date(with_hints: bool) -> None:
+    pipeline = dlt.pipeline(
+        destination="duckdb",
+        dev_mode=True,
+        dataset_name="test_trailing_empty_col_date",
+    )
+    data = google_spreadsheet(
+        "1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580",
+        range_names=["trailing_empty_cols_1", "trailing_empty_col_date"],
+        get_named_ranges=False,
+    )
+    if with_hints:
+        data.trailing_empty_col_date.apply_hints(
+            columns={"Start Date": {"data_type": "date"}}
+        )
+    info = pipeline.run(data)
+    assert_load_info(info)
+
+    assert "trailing_empty_col_date" in pipeline.default_schema.tables
+    assert set(
+        pipeline.default_schema.get_table_columns("trailing_empty_col_date").keys()
+    ) == {"start_date", "end_date", "text", "_dlt_id", "_dlt_load_id"}
+
+    expected = [
+        (None, date(2027, 4, 12), "blablabla"),
+        (
+            date(2028, 4, 12) if with_hints else 46855,
+            date(2027, 4, 12),
+            "43432",
+        ),
+    ]
+    with pipeline.sql_client() as c:
+        sql_query = f"SELECT start_date, end_date, text FROM {pipeline.dataset_name}.trailing_empty_col_date;"
+        with c.execute_query(sql_query) as cur:
+            rows = list(cur.fetchall())
+            assert len(rows) == 2
+            assert rows == expected
+
+
 def _row_helper(row, destination_name):
     """
     Helper, unpacks the rows from different databases (Bigquery, Postgres, Redshift) to a tuple
```
