|
| 1 | +"""Tests for Windows encoding handling.""" |
| 2 | + |
| 3 | +from pathlib import Path |
| 4 | +from unittest.mock import patch |
| 5 | + |
| 6 | +from gitingest.schemas.filesystem import FileSystemNode, FileSystemNodeType |
| 7 | +from gitingest.utils.file_utils import _CHUNK_SIZE |
| 8 | + |
| 9 | + |
def test_utf8_priority_on_windows(tmp_path: Path) -> None:
    """Check that a valid UTF-8 file is read intact under a simulated Windows/cp1252 locale.

    The regression scenario reproduced here:

    * The file on disk is valid UTF-8.
    * Its leading ``_CHUNK_SIZE + 50`` characters are plain ASCII, so the
      poison character lies beyond the initial chunk read and nothing in that
      first chunk rules out CP1252.
    * Further in, the file contains a right double quotation mark. Its UTF-8
      encoding includes byte 0x9D, which CP1252 leaves undefined, so decoding
      the whole file as CP1252 raises ``UnicodeDecodeError``.

    The test passes only if the node's content is returned without an error
    marker, still contains the special character, and is not truncated.
    """
    # U+201D RIGHT DOUBLE QUOTATION MARK:
    #   UTF-8  -> 0xE2 0x80 0x9D
    #   CP1252 -> 0x9D has no mapping, so a full-file CP1252 decode fails.
    poison_char = "”"

    # Pad with more than _CHUNK_SIZE ASCII characters so the poison character
    # sits past the portion inspected by the initial chunk read.
    ascii_padding = "a" * (_CHUNK_SIZE + 50)
    payload = ascii_padding + poison_char + "\nEnd."

    target = tmp_path / "test_encoding_crash.md"
    target.write_text(payload, encoding="utf-8")

    node = FileSystemNode(
        name=target.name,
        type=FileSystemNodeType.FILE,
        path_str=str(target),
        path=target,
        size=target.stat().st_size,
    )

    # Simulate a Windows host whose preferred locale encoding is CP1252.
    # The patches are only activated when entered by the ``with`` statements.
    fake_locale = patch("locale.getpreferredencoding", return_value="cp1252")
    fake_platform = patch("platform.system", return_value="Windows")
    with fake_locale:
        with fake_platform:
            decoded = node.content

    assert "Error reading file" not in decoded, "Failed to read valid UTF-8 file on Windows/CP1252 simulation"
    assert poison_char in decoded, "Failed to correctly decode the special character"
    assert decoded.endswith("End."), "Content appears truncated or malformed"
0 commit comments