From 8c61199cee52c138c2df2a7c4c15bddf408f0e26 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 4 May 2024 13:14:47 -0400 Subject: [PATCH 1/6] add test --- tests/test_schema.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_schema.py b/tests/test_schema.py index 96109ce9c2..1fdfd66e63 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -123,6 +123,12 @@ def test_schema_raise_on_duplicate_names() -> None: assert "Invalid schema, multiple fields for name baz: 3 and 4" in str(exc_info.value) +def test_schema_field_order_irrelevant() -> None: + foo = NestedField(field_id=1, name="foo", field_type=StringType()) + bar = NestedField(field_id=2, name="bar", field_type=IntegerType(), required=False) + assert schema.Schema(foo, bar) == schema.Schema(bar, foo) + + def test_schema_index_by_id_visitor(table_schema_nested: Schema) -> None: """Test index_by_id visitor function""" index = schema.index_by_id(table_schema_nested) From 79c73eb35533686834839928ae6693a06cd6d78b Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 4 May 2024 14:12:57 -0400 Subject: [PATCH 2/6] compare fields with set --- pyiceberg/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index b2739d8618..e9177b41ce 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -121,7 +121,7 @@ def __eq__(self, other: Any) -> bool: return False identifier_field_ids_is_equal = self.identifier_field_ids == other.identifier_field_ids - schema_is_equal = all(lhs == rhs for lhs, rhs in zip(self.columns, other.columns)) + schema_is_equal = set(self.columns) == set(other.columns) return identifier_field_ids_is_equal and schema_is_equal From 9b27631169dcda7af6fe202a56d19127f1390d6f Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 4 May 2024 14:14:52 -0400 Subject: [PATCH 3/6] fix StructType eq --- pyiceberg/types.py | 2 +- tests/test_schema.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pyiceberg/types.py b/pyiceberg/types.py index 7e3862b7d3..6ef40c18df 100644 --- a/pyiceberg/types.py +++ b/pyiceberg/types.py @@ -393,7 +393,7 @@ def __hash__(self) -> int: def __eq__(self, other: Any) -> bool: """Compare the object if it is equal to another object.""" - return self.fields == other.fields if isinstance(other, StructType) else False + return set(self.fields) == set(other.fields) if isinstance(other, StructType) else False class ListType(IcebergType): diff --git a/tests/test_schema.py b/tests/test_schema.py index 1fdfd66e63..5f2626fd14 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -126,7 +126,10 @@ def test_schema_raise_on_duplicate_names() -> None: def test_schema_field_order_irrelevant() -> None: foo = NestedField(field_id=1, name="foo", field_type=StringType()) bar = NestedField(field_id=2, name="bar", field_type=IntegerType(), required=False) - assert schema.Schema(foo, bar) == schema.Schema(bar, foo) + left = schema.Schema(foo, bar) + right = schema.Schema(bar, foo) + assert left == right + assert left.as_struct() == right.as_struct() def test_schema_index_by_id_visitor(table_schema_nested: Schema) -> None: From eae620632a81d1f0f790ad04cc6add6bba01046c Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 4 May 2024 14:45:20 -0400 Subject: [PATCH 4/6] sort and compare --- pyiceberg/schema.py | 8 +++++++- pyiceberg/types.py | 13 ++++++++++++- tests/avro/test_resolver.py | 2 +- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index e9177b41ce..95efd350a6 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -121,7 +121,13 @@ def __eq__(self, other: Any) -> bool: return False identifier_field_ids_is_equal = self.identifier_field_ids == other.identifier_field_ids - schema_is_equal = set(self.columns) == set(other.columns) + + def order_by_field_id(field: NestedField) -> int: + return field.field_id + + left = sorted(self.columns, key=order_by_field_id) + right = sorted(other.columns, key=order_by_field_id) + schema_is_equal = all(lhs == rhs for lhs, rhs in zip(left, right)) return identifier_field_ids_is_equal and schema_is_equal diff --git a/pyiceberg/types.py b/pyiceberg/types.py index 6ef40c18df..a63c0b9e90 100644 --- a/pyiceberg/types.py +++ b/pyiceberg/types.py @@ -393,7 +393,18 @@ def __hash__(self) -> int: def __eq__(self, other: Any) -> bool: """Compare the object if it is equal to another object.""" - return set(self.fields) == set(other.fields) if isinstance(other, StructType) else False + if not isinstance(other, StructType): + return False + + if len(self.fields) != len(other.fields): + return False + + def order_by_field_id(field: NestedField) -> int: + return field.field_id + + left = sorted(self.fields, key=order_by_field_id) + right = sorted(other.fields, key=order_by_field_id) + return all(lhs == rhs for lhs, rhs in zip(left, right)) class ListType(IcebergType): diff --git a/tests/avro/test_resolver.py b/tests/avro/test_resolver.py index decd9060a4..ee1e444684 100644 --- a/tests/avro/test_resolver.py +++ b/tests/avro/test_resolver.py @@ -372,7 +372,7 @@ def test_writer_ordering() -> None: ), ) - expected = StructWriter(((1, DoubleWriter()), (0, StringWriter()))) + expected = StructWriter(((0, DoubleWriter()), (1, StringWriter()))) assert actual == expected From 00cc61a7c02f37c7af865134ee4c4816d65ea986 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 4 May 2024 14:50:35 -0400 Subject: [PATCH 5/6] inline function --- pyiceberg/schema.py | 7 ++----- pyiceberg/types.py | 7 ++----- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index 95efd350a6..984883a84d 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -122,11 +122,8 @@ def __eq__(self, other: Any) -> bool: identifier_field_ids_is_equal = self.identifier_field_ids == other.identifier_field_ids - def order_by_field_id(field: NestedField) -> int: - return field.field_id - - left = sorted(self.columns, key=order_by_field_id) - right = sorted(other.columns, key=order_by_field_id) + left = sorted(self.columns, key=lambda field: field.field_id) + right = sorted(other.columns, key=lambda field: field.field_id) schema_is_equal = all(lhs == rhs for lhs, rhs in zip(left, right)) return identifier_field_ids_is_equal and schema_is_equal diff --git a/pyiceberg/types.py b/pyiceberg/types.py index a63c0b9e90..98ce0a7c9c 100644 --- a/pyiceberg/types.py +++ b/pyiceberg/types.py @@ -399,11 +399,8 @@ def __eq__(self, other: Any) -> bool: if len(self.fields) != len(other.fields): return False - def order_by_field_id(field: NestedField) -> int: - return field.field_id - - left = sorted(self.fields, key=order_by_field_id) - right = sorted(other.fields, key=order_by_field_id) + left = sorted(self.fields, key=lambda field: field.field_id) + right = sorted(other.fields, key=lambda field: field.field_id) return all(lhs == rhs for lhs, rhs in zip(left, right)) From f16f284c94ed4b2d03cf0424bca0ecd18b4400d1 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 4 May 2024 14:57:01 -0400 Subject: [PATCH 6/6] fix integration test --- tests/integration/test_rest_schema.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/tests/integration/test_rest_schema.py b/tests/integration/test_rest_schema.py index ac5d1ce050..5aced3f128 100644 --- a/tests/integration/test_rest_schema.py +++ b/tests/integration/test_rest_schema.py @@ -1730,19 +1730,17 @@ def test_move_nested_field_after_first(catalog: Catalog) -> None: with tbl.update_schema() as schema_update: schema_update.move_before("struct.data", "struct.count") - assert str(tbl.schema()) == str( - Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=True), - NestedField( - field_id=2, - name="struct", - field_type=StructType( - NestedField(field_id=4, name="data", field_type=StringType(), required=True), - NestedField(field_id=3, name="count", field_type=LongType(), required=True), - ), - required=True, + assert tbl.schema() == Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + NestedField(field_id=3, name="count", field_type=LongType(), required=True), ), - ) + required=True, + ), )