Skip to content

Commit 78cebcb

Browse files
committed
fix: prioritize utf-8 over system locale to prevent crashes on Windows
1 parent 4e259a0 commit 78cebcb

File tree

2 files changed

+49
-1
lines changed

2 files changed

+49
-1
lines changed

src/gitingest/utils/file_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def _get_preferred_encodings() -> list[str]:
2727
platform's default encoding followed by common fallback encodings.
2828
2929
"""
30-
encodings = [locale.getpreferredencoding(), "utf-8", "utf-16", "utf-16le", "utf-8-sig", "latin"]
30+
encodings = ["utf-8", locale.getpreferredencoding(), "utf-16", "utf-16le", "utf-8-sig", "latin"]
3131
if platform.system() == "Windows":
3232
encodings += ["cp1252", "iso-8859-1"]
3333
return list(dict.fromkeys(encodings))

tests/test_windows_encoding.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""Tests for Windows encoding handling."""
2+
3+
from pathlib import Path
4+
from unittest.mock import patch
5+
6+
from gitingest.schemas.filesystem import FileSystemNode, FileSystemNodeType
7+
from gitingest.utils.file_utils import _CHUNK_SIZE
8+
9+
10+
def test_utf8_priority_on_windows(tmp_path: Path) -> None:
11+
"""Ensure UTF-8 files are read correctly even on Windows systems defaulting to cp1252.
12+
13+
This test reproduces a specific crash scenario where:
14+
1. A file is valid UTF-8.
15+
2. The first `_CHUNK_SIZE` bytes are plain ASCII (so the initial chunk check passes even under CP1252).
16+
3. Subsequent bytes contain a UTF-8 sequence with a byte that is unmapped in CP1252 (e.g., 0x9D in a right smart quote),
17+
causing a UnicodeDecodeError if CP1252 is preferred over UTF-8.
18+
"""
19+
file_path = tmp_path / "test_encoding_crash.md"
20+
21+
# The right double quotation mark (”) is:
22+
# - UTF-8: 0xE2 0x80 0x9D
23+
# - CP1252: byte 0x9D is unmapped, so decoding the full file as CP1252 raises UnicodeDecodeError.
24+
poison_char = "”"
25+
26+
# Pad with more than _CHUNK_SIZE ASCII characters so the non-ASCII character falls outside the initial chunk read
27+
content = ("a" * (_CHUNK_SIZE + 50)) + poison_char + "\nEnd."
28+
29+
file_path.write_text(content, encoding="utf-8")
30+
31+
node = FileSystemNode(
32+
name=file_path.name,
33+
type=FileSystemNodeType.FILE,
34+
path_str=str(file_path),
35+
path=file_path,
36+
size=file_path.stat().st_size,
37+
)
38+
39+
# Mock the environment to simulate Windows with CP1252 locale
40+
with patch("locale.getpreferredencoding", return_value="cp1252"), patch(
41+
"platform.system",
42+
return_value="Windows",
43+
):
44+
read_content = node.content
45+
46+
assert "Error reading file" not in read_content, "Failed to read valid UTF-8 file on Windows/CP1252 simulation"
47+
assert poison_char in read_content, "Failed to correctly decode the special character"
48+
assert read_content.endswith("End."), "Content appears truncated or malformed"

0 commit comments

Comments
 (0)