From 25a267e0088f0292cd3a2e91156c737e6484b0ab Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Tue, 22 Jul 2025 12:48:18 -0700
Subject: [PATCH] add test for avro sanitization

---
 tests/test_schema.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/tests/test_schema.py b/tests/test_schema.py
index 3ca74c4027..6ee48299ef 100644
--- a/tests/test_schema.py
+++ b/tests/test_schema.py
@@ -563,6 +563,36 @@ def test_sanitize() -> None:
     assert sanitize_column_names(before_sanitized) == expected_schema
 
 
+def test_sanitize_special_chars() -> None:
+    """Test sanitizing schema with special characters in field names, using only StringType fields."""
+    # Names under test: leading digit ("9x"), unchanged control ("x_"), dot ("a.b"), non-ASCII ("☃"), hash ("a#b")
+    # Expected: leading digit gets a "_" prefix, "." -> _x2E, "☃" (U+2603) -> _x2603, "#" -> _x23; "x_" is unchanged
+    names = ["9x", "x_", "a.b", "☃", "a#b"]
+    expected_names = ["_9x", "x_", "a_x2Eb", "_x2603", "a_x23b"]
+
+    before_sanitized = Schema(
+        NestedField(field_id=1, name=names[0], field_type=StringType(), required=True),
+        NestedField(field_id=2, name=names[1], field_type=StringType(), required=True),
+        NestedField(field_id=3, name=names[2], field_type=StringType(), required=True),
+        NestedField(field_id=4, name=names[3], field_type=StringType(), required=True),
+        NestedField(field_id=5, name=names[4], field_type=StringType(), required=True),
+        schema_id=1,
+        identifier_field_ids=[1],
+    )
+
+    expected_schema = Schema(
+        NestedField(field_id=1, name=expected_names[0], field_type=StringType(), required=True),
+        NestedField(field_id=2, name=expected_names[1], field_type=StringType(), required=True),
+        NestedField(field_id=3, name=expected_names[2], field_type=StringType(), required=True),
+        NestedField(field_id=4, name=expected_names[3], field_type=StringType(), required=True),
+        NestedField(field_id=5, name=expected_names[4], field_type=StringType(), required=True),
+        schema_id=1,
+        identifier_field_ids=[1],
+    )
+
+    assert sanitize_column_names(before_sanitized) == expected_schema
+
+
 def test_prune_columns_string(table_schema_nested_with_struct_key_map: Schema) -> None:
     assert prune_columns(table_schema_nested_with_struct_key_map, {1}, False) == Schema(
         NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1]