Skip to content

Commit 22e7c34

Browse files
committed
fix binary file check
1 parent 38c2317 commit 22e7c34

File tree

1 file changed

+13
-1
lines changed

1 file changed

+13
-1
lines changed

src/gitingest/schemas/filesystem.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ def content(self) -> str: # pylint: disable=too-many-return-statements
143143
if chunk == b"":
144144
return "[Empty file]"
145145

146-
if not _decodes(chunk, "utf-8"):
146+
if is_binary_file(chunk):
147147
return "[Binary file]"
148148

149149
# Find the first encoding that decodes the sample
@@ -160,3 +160,15 @@ def content(self) -> str: # pylint: disable=too-many-return-statements
160160
return fp.read()
161161
except (OSError, UnicodeDecodeError) as exc:
162162
return f"Error reading file with {good_enc!r}: {exc}"
163+
164+
165+
# Byte values treated as text: the control characters BEL, BS, HT, LF, FF, CR and ESC,
# plus every byte from 0x20 through 0xFF except DEL (0x7F). Built once at import time
# instead of on every call, since the table never changes.
_TEXT_BYTES = bytes({7, 8, 9, 10, 12, 13, 27} | (set(range(0x20, 0x100)) - {0x7F}))


def is_binary_file(file_contents: bytes | None) -> bool:
    """Return whether a byte sample looks like binary (non-text) data.

    This function performs no I/O: it only inspects the bytes it is given, so the
    caller decides how much of the file to sample.

    Parameters
    ----------
    file_contents : bytes | None
        The bytes to inspect. ``None`` and empty input are treated as non-binary.

    Returns
    -------
    bool
        ``True`` if the sample contains at least one byte outside the text set
        (for example NUL, DEL, or vertical tab 0x0B), otherwise ``False``.

    """
    if not file_contents:
        return False  # Empty files are not binary

    # Deleting every text byte leaves only the non-text bytes; any remainder
    # means the sample contains binary data.
    return bool(file_contents.translate(None, _TEXT_BYTES))

0 commit comments

Comments
 (0)