Skip to content

Commit 00e2eb2

Browse files
committed
add nanoseconds support to transforms
1 parent da6de99 commit 00e2eb2

File tree

4 files changed

+102
-21
lines changed

4 files changed

+102
-21
lines changed

pyiceberg/transforms.py

Lines changed: 65 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,9 @@
7373
IntegerType,
7474
LongType,
7575
StringType,
76+
TimestampNanoType,
7677
TimestampType,
78+
TimestamptzNanoType,
7779
TimestamptzType,
7880
TimeType,
7981
UUIDType,
@@ -457,13 +459,20 @@ def year_func(v: Any) -> int:
457459

458460
return datetime.micros_to_years(v)
459461

462+
elif isinstance(source, (TimestampNanoType, TimestamptzNanoType)):
463+
464+
def year_func(v: Any) -> int:
465+
# python datetime has no nanoseconds support.
466+
# nanosecond datetimes will be expressed as int as a workaround
467+
return datetime.nanos_to_years(v)
468+
460469
else:
461470
raise ValueError(f"Cannot apply year transform for type: {source}")
462471

463472
return lambda v: year_func(v) if v is not None else None
464473

465474
def can_transform(self, source: IcebergType) -> bool:
466-
return isinstance(source, (DateType, TimestampType, TimestamptzType))
475+
return isinstance(source, (DateType, TimestampType, TimestamptzType, TimestampNanoType, TimestamptzNanoType))
467476

468477
@property
469478
def granularity(self) -> TimeResolution:
@@ -481,15 +490,19 @@ def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Arr
481490
import pyarrow.compute as pc
482491

483492
if isinstance(source, DateType):
484-
epoch = datetime.EPOCH_DATE
493+
epoch = pa.scalar(datetime.EPOCH_DATE)
485494
elif isinstance(source, TimestampType):
486-
epoch = datetime.EPOCH_TIMESTAMP
495+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
487496
elif isinstance(source, TimestamptzType):
488-
epoch = datetime.EPOCH_TIMESTAMPTZ
497+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
498+
elif isinstance(source, TimestampNanoType):
499+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
500+
elif isinstance(source, TimestamptzNanoType):
501+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
489502
else:
490503
raise ValueError(f"Cannot apply year transform for type: {source}")
491504

492-
return lambda v: pc.years_between(pa.scalar(epoch), v) if v is not None else None
505+
return lambda v: pc.years_between(epoch, v) if v is not None else None
493506

494507

495508
class MonthTransform(TimeTransform[S]):
@@ -520,13 +533,20 @@ def month_func(v: Any) -> int:
520533

521534
return datetime.micros_to_months(v)
522535

536+
elif isinstance(source, (TimestampNanoType, TimestamptzNanoType)):
537+
538+
def month_func(v: Any) -> int:
539+
# python datetime has no nanoseconds support.
540+
# nanosecond datetimes will be expressed as int as a workaround
541+
return datetime.nanos_to_months(v)
542+
523543
else:
524544
raise ValueError(f"Cannot apply month transform for type: {source}")
525545

526546
return lambda v: month_func(v) if v is not None else None
527547

528548
def can_transform(self, source: IcebergType) -> bool:
529-
return isinstance(source, (DateType, TimestampType, TimestamptzType))
549+
return isinstance(source, (DateType, TimestampType, TimestamptzType, TimestampNanoType, TimestamptzNanoType))
530550

531551
@property
532552
def granularity(self) -> TimeResolution:
@@ -544,17 +564,21 @@ def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Arr
544564
import pyarrow.compute as pc
545565

546566
if isinstance(source, DateType):
547-
epoch = datetime.EPOCH_DATE
567+
epoch = pa.scalar(datetime.EPOCH_DATE)
548568
elif isinstance(source, TimestampType):
549-
epoch = datetime.EPOCH_TIMESTAMP
569+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
550570
elif isinstance(source, TimestamptzType):
551-
epoch = datetime.EPOCH_TIMESTAMPTZ
571+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
572+
elif isinstance(source, TimestampNanoType):
573+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
574+
elif isinstance(source, TimestamptzNanoType):
575+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
552576
else:
553577
raise ValueError(f"Cannot apply month transform for type: {source}")
554578

555579
def month_func(v: pa.Array) -> pa.Array:
556580
return pc.add(
557-
pc.multiply(pc.years_between(pa.scalar(epoch), v), pa.scalar(12)),
581+
pc.multiply(pc.years_between(epoch, v), pa.scalar(12)),
558582
pc.add(pc.month(v), pa.scalar(-1)),
559583
)
560584

@@ -589,13 +613,20 @@ def day_func(v: Any) -> int:
589613

590614
return datetime.micros_to_days(v)
591615

616+
elif isinstance(source, (TimestampNanoType, TimestamptzNanoType)):
617+
618+
def day_func(v: Any) -> int:
619+
# python datetime has no nanoseconds support.
620+
# nanosecond datetimes will be expressed as int as a workaround
621+
return datetime.nanos_to_days(v)
622+
592623
else:
593624
raise ValueError(f"Cannot apply day transform for type: {source}")
594625

595626
return lambda v: day_func(v) if v is not None else None
596627

597628
def can_transform(self, source: IcebergType) -> bool:
598-
return isinstance(source, (DateType, TimestampType, TimestamptzType))
629+
return isinstance(source, (DateType, TimestampType, TimestamptzType, TimestampNanoType, TimestamptzNanoType))
599630

600631
def result_type(self, source: IcebergType) -> IcebergType:
601632
"""Return the result type of a day transform.
@@ -621,15 +652,19 @@ def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Arr
621652
import pyarrow.compute as pc
622653

623654
if isinstance(source, DateType):
624-
epoch = datetime.EPOCH_DATE
655+
epoch = pa.scalar(datetime.EPOCH_DATE)
625656
elif isinstance(source, TimestampType):
626-
epoch = datetime.EPOCH_TIMESTAMP
657+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
627658
elif isinstance(source, TimestamptzType):
628-
epoch = datetime.EPOCH_TIMESTAMPTZ
659+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
660+
elif isinstance(source, TimestampNanoType):
661+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
662+
elif isinstance(source, TimestamptzNanoType):
663+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
629664
else:
630665
raise ValueError(f"Cannot apply day transform for type: {source}")
631666

632-
return lambda v: pc.days_between(pa.scalar(epoch), v) if v is not None else None
667+
return lambda v: pc.days_between(epoch, v) if v is not None else None
633668

634669

635670
class HourTransform(TimeTransform[S]):
@@ -652,13 +687,20 @@ def hour_func(v: Any) -> int:
652687

653688
return datetime.micros_to_hours(v)
654689

690+
elif isinstance(source, (TimestampNanoType, TimestamptzNanoType)):
691+
692+
def day_func(v: Any) -> int:
693+
# python datetime has no nanoseconds support.
694+
# nanosecond datetimes will be expressed as int as a workaround
695+
return datetime.nanos_to_hours(v)
696+
655697
else:
656698
raise ValueError(f"Cannot apply hour transform for type: {source}")
657699

658700
return lambda v: hour_func(v) if v is not None else None
659701

660702
def can_transform(self, source: IcebergType) -> bool:
661-
return isinstance(source, (TimestampType, TimestamptzType))
703+
return isinstance(source, (TimestampType, TimestamptzType, TimestampNanoType, TimestamptzNanoType))
662704

663705
@property
664706
def granularity(self) -> TimeResolution:
@@ -676,13 +718,17 @@ def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Arr
676718
import pyarrow.compute as pc
677719

678720
if isinstance(source, TimestampType):
679-
epoch = datetime.EPOCH_TIMESTAMP
721+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP)
680722
elif isinstance(source, TimestamptzType):
681-
epoch = datetime.EPOCH_TIMESTAMPTZ
723+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ)
724+
elif isinstance(source, TimestampNanoType):
725+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMP).cast(pa.timestamp("ns"))
726+
elif isinstance(source, TimestamptzNanoType):
727+
epoch = pa.scalar(datetime.EPOCH_TIMESTAMPTZ).cast(pa.timestamp("ns"))
682728
else:
683729
raise ValueError(f"Cannot apply hour transform for type: {source}")
684730

685-
return lambda v: pc.hours_between(pa.scalar(epoch), v) if v is not None else None
731+
return lambda v: pc.hours_between(epoch, v) if v is not None else None
686732

687733

688734
def _base64encode(buffer: bytes) -> str:

pyiceberg/utils/datetime.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,3 +217,38 @@ def days_to_years(days: int) -> int:
217217

218218
def micros_to_years(micros: int) -> int:
219219
return micros_to_timestamp(micros).year - EPOCH_TIMESTAMP.year
220+
221+
222+
def nanos_to_timestamp(nanos: int) -> datetime:
223+
"""Convert nanoseconds from epoch to a microsecond timestamp."""
224+
dt = timedelta(microseconds=nanos // 1000)
225+
return EPOCH_TIMESTAMP + dt
226+
227+
228+
def nanos_to_years(nanos: int) -> int:
229+
return nanos_to_timestamp(nanos).year - EPOCH_TIMESTAMP.year
230+
231+
232+
def nanos_to_months(nanos: int) -> int:
233+
dt = nanos_to_timestamp(nanos)
234+
return (dt.year - EPOCH_TIMESTAMP.year) * 12 + (dt.month - EPOCH_TIMESTAMP.month)
235+
236+
237+
def nanos_to_days(nanos: int) -> int:
238+
"""Convert a timestamp in nanoseconds to a date in days."""
239+
return timedelta(microseconds=nanos // 1000).days
240+
241+
242+
def nanos_to_time(nanos: int) -> time:
243+
"""Convert a timestamp in nanoseconds to a microsecond precision time."""
244+
micros = nanos // 1000
245+
micros, microseconds = divmod(micros, 1000000)
246+
micros, seconds = divmod(micros, 60)
247+
micros, minutes = divmod(micros, 60)
248+
hours = micros
249+
return time(hour=hours, minute=minutes, second=seconds, microsecond=microseconds)
250+
251+
252+
def nanos_to_hours(nanos: int) -> int:
253+
"""Convert a timestamp in nanoseconds to hours from 1970-01-01T00:00."""
254+
return nanos // 3_600_000_000_0000

tests/test_transforms.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1559,7 +1559,7 @@ def test_ymd_pyarrow_transforms(
15591559
]
15601560
else:
15611561
with pytest.raises(ValueError):
1562-
transform.pyarrow_transform(DateType())(arrow_table_date_timestamps[source_col])
1562+
transform.pyarrow_transform(source_type)(arrow_table_date_timestamps[source_col])
15631563

15641564

15651565
@pytest.mark.parametrize(

tests/utils/test_datetime.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def test_millis_to_datetime() -> None:
8080
assert millis_to_datetime(1690971805918) == datetime(2023, 8, 2, 10, 23, 25, 918000)
8181

8282

83-
@pytest.mark.parametrize("time_str, nanos", [("00:00:00Z", 0), ("20:21:44.375612-0500", 73304375612000)])
83+
@pytest.mark.parametrize("time_str, nanos", [("00:00:00Z", 0), ("20:21:44.375612-05:00", 73304375612000)])
8484
def test_time_str_to_nanos(time_str: str, nanos: int) -> None:
8585
assert nanos == time_str_to_nanos(time_str)
8686

0 commit comments

Comments
 (0)