Skip to content

Commit 629e2d5

Browse files
committed
rewrite the writer for support both bucket and identity transform
1 parent 245a1ef commit 629e2d5

File tree

4 files changed

+45
-20
lines changed

4 files changed

+45
-20
lines changed

pyiceberg/avro/writer.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
List,
3333
Optional,
3434
Tuple,
35+
Union,
3536
)
3637
from uuid import UUID
3738

@@ -121,8 +122,23 @@ def write(self, encoder: BinaryEncoder, val: Any) -> None:
121122

122123
@dataclass(frozen=True)
123124
class UUIDWriter(Writer):
124-
def write(self, encoder: BinaryEncoder, val: UUID) -> None:
125-
encoder.write(val.bytes)
125+
def write(self, encoder: BinaryEncoder, val: Union[UUID, str, bytes]) -> None:
126+
if isinstance(val, UUID):
127+
encoder.write(val.bytes)
128+
elif isinstance(val, bytes):
129+
encoder.write(val)
130+
elif isinstance(val, str):
131+
print(f"UUID string: {val=}")
132+
if val.startswith("b'") and val.endswith("'"):
133+
# Handle string representation of bytes
134+
# Convert the escaped string to actual bytes
135+
byte_string = val[2:-1].encode("utf-8").decode("unicode_escape").encode("latin1")
136+
encoder.write(UUID(bytes=byte_string).bytes)
137+
else:
138+
# Regular UUID string
139+
encoder.write(UUID(val).bytes)
140+
else:
141+
raise TypeError(f"Expected UUID, bytes, or string, got {type(val)}")
126142

127143

128144
@dataclass(frozen=True)

pyiceberg/conversions.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -267,10 +267,13 @@ def _(_: StringType, value: str) -> bytes:
267267

268268

269269
@to_bytes.register(UUIDType)
270-
def _(_: UUIDType, value: Union[uuid.UUID, bytes]) -> bytes:
270+
def _(_: UUIDType, value: Union[uuid.UUID, bytes, str]) -> bytes:
271271
if isinstance(value, bytes):
272-
return value
273-
return value.bytes
272+
return str(uuid.UUID(bytes=value)).encode(UTF8)
273+
elif isinstance(value, uuid.UUID):
274+
return str(value).encode(UTF8)
275+
else:
276+
return str(uuid.UUID(value)).encode(UTF8)
274277

275278

276279
@to_bytes.register(BinaryType)
@@ -355,11 +358,15 @@ def _(_: StringType, b: bytes) -> str:
355358

356359
@from_bytes.register(BinaryType)
357360
@from_bytes.register(FixedType)
358-
@from_bytes.register(UUIDType)
359361
def _(_: PrimitiveType, b: bytes) -> bytes:
360362
return b
361363

362364

365+
@from_bytes.register(UUIDType)
366+
def _(_: UUIDType, b: bytes) -> uuid.UUID:
367+
return uuid.UUID(b.decode(UTF8))
368+
369+
363370
@from_bytes.register(DecimalType)
364371
def _(primitive_type: DecimalType, buf: bytes) -> Decimal:
365372
unscaled = int.from_bytes(buf, "big", signed=True)

pyiceberg/expressions/literals.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def literal(value: L) -> Literal[L]:
144144
elif isinstance(value, str):
145145
return StringLiteral(value)
146146
elif isinstance(value, UUID):
147-
return UUIDLiteral(value.bytes) # type: ignore
147+
return UUIDLiteral(UUID(bytes=value.bytes))
148148
elif isinstance(value, bytes):
149149
return BinaryLiteral(value)
150150
elif isinstance(value, Decimal):
@@ -586,8 +586,8 @@ def _(self, _: TimestamptzType) -> Literal[int]:
586586
return TimestampLiteral(timestamptz_to_micros(self.value))
587587

588588
@to.register(UUIDType)
589-
def _(self, _: UUIDType) -> Literal[bytes]:
590-
return UUIDLiteral(UUID(self.value).bytes)
589+
def _(self, _: UUIDType) -> Literal[UUID]:
590+
return UUIDLiteral(UUID(self.value))
591591

592592
@to.register(DecimalType)
593593
def _(self, type_var: DecimalType) -> Literal[Decimal]:
@@ -631,30 +631,30 @@ def __repr__(self) -> str:
631631
return f"literal({repr(self.value)})"
632632

633633

634-
class UUIDLiteral(Literal[bytes]):
635-
def __init__(self, value: bytes) -> None:
636-
super().__init__(value, bytes)
634+
class UUIDLiteral(Literal[UUID]):
635+
def __init__(self, value: UUID) -> None:
636+
super().__init__(value, UUID)
637637

638638
@singledispatchmethod
639639
def to(self, type_var: IcebergType) -> Literal: # type: ignore
640640
raise TypeError(f"Cannot convert UUIDLiteral into {type_var}")
641641

642642
@to.register(UUIDType)
643-
def _(self, _: UUIDType) -> Literal[bytes]:
643+
def _(self, _: UUIDType) -> Literal[UUID]:
644644
return self
645645

646646
@to.register(FixedType)
647647
def _(self, type_var: FixedType) -> Literal[bytes]:
648648
if len(type_var) == UUID_BYTES_LENGTH:
649-
return FixedLiteral(self.value)
649+
return FixedLiteral(self.value.bytes)
650650
else:
651651
raise TypeError(
652652
f"Cannot convert UUIDLiteral into {type_var}, different length: {len(type_var)} <> {UUID_BYTES_LENGTH}"
653653
)
654654

655655
@to.register(BinaryType)
656656
def _(self, _: BinaryType) -> Literal[bytes]:
657-
return BinaryLiteral(self.value)
657+
return BinaryLiteral(self.value.bytes)
658658

659659

660660
class FixedLiteral(Literal[bytes]):
@@ -679,9 +679,9 @@ def _(self, _: BinaryType) -> Literal[bytes]:
679679
return BinaryLiteral(self.value)
680680

681681
@to.register(UUIDType)
682-
def _(self, type_var: UUIDType) -> Literal[bytes]:
682+
def _(self, type_var: UUIDType) -> Literal[UUID]:
683683
if len(self.value) == UUID_BYTES_LENGTH:
684-
return UUIDLiteral(self.value)
684+
return UUIDLiteral(UUID(bytes=self.value))
685685
else:
686686
raise TypeError(
687687
f"Could not convert {self.value!r} into a {type_var}, lengths differ {len(self.value)} <> {UUID_BYTES_LENGTH}"
@@ -710,9 +710,9 @@ def _(self, type_var: FixedType) -> Literal[bytes]:
710710
)
711711

712712
@to.register(UUIDType)
713-
def _(self, type_var: UUIDType) -> Literal[bytes]:
713+
def _(self, type_var: UUIDType) -> Literal[UUID]:
714714
if len(self.value) == UUID_BYTES_LENGTH:
715-
return UUIDLiteral(self.value)
715+
return UUIDLiteral(UUID(bytes=self.value))
716716
else:
717717
raise TypeError(
718718
f"Cannot convert BinaryLiteral into {type_var}, different length: {UUID_BYTES_LENGTH} <> {len(self.value)}"

pyiceberg/partitioning.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -467,9 +467,11 @@ def _(type: IcebergType, value: Optional[time]) -> Optional[int]:
467467

468468

469469
@_to_partition_representation.register(UUIDType)
470-
def _(type: IcebergType, value: Optional[Union[uuid.UUID, int]]) -> Optional[Union[str, int]]:
470+
def _(type: IcebergType, value: Optional[Union[uuid.UUID, int, bytes]]) -> Optional[Union[str, int]]:
471471
if value is None:
472472
return None
473+
elif isinstance(value, bytes):
474+
return str(uuid.UUID(bytes=value)) # IdentityTransform
473475
elif isinstance(value, uuid.UUID):
474476
return str(value) # IdentityTransform
475477
elif isinstance(value, int):

0 commit comments

Comments
 (0)