|
3 | 3 | from fnmatch import fnmatch |
4 | 4 | from pathlib import Path |
5 | 5 | from typing import Any |
| 6 | +import locale |
| 7 | +import os |
| 8 | +import platform |
6 | 9 |
|
7 | 10 | import tiktoken |
8 | 11 |
|
|
16 | 19 | from gitingest.notebook_utils import process_notebook |
17 | 20 | from gitingest.query_parser import ParsedQuery |
18 | 21 |
|
# Adopt the locale configured in the user's environment so that later calls
# such as ``locale.getpreferredencoding()`` reflect it.
try:
    locale.setlocale(locale.LC_ALL, "")
except locale.Error:
    # The configured locale could not be applied; fall back to the portable
    # "C" locale instead of failing at import time.
    locale.setlocale(locale.LC_ALL, "C")
| 26 | + |
| 27 | +def _normalize_path(path: Path) -> Path: |
| 28 | + """Normalize path for cross-platform compatibility.""" |
| 29 | + return Path(os.path.normpath(str(path))) |
| 30 | + |
| 31 | +def _normalize_path_str(path: str | Path) -> str: |
| 32 | + """Convert path to string with forward slashes for consistent output.""" |
| 33 | + return str(path).replace(os.sep, '/') |
| 34 | + |
| 35 | +def _get_encoding_list() -> list[str]: |
| 36 | + """Get list of encodings to try, prioritized for the current platform.""" |
| 37 | + encodings = ['utf-8', 'utf-8-sig'] |
| 38 | + if platform.system() == 'Windows': |
| 39 | + encodings.extend(['cp1252', 'iso-8859-1']) |
| 40 | + return encodings + [locale.getpreferredencoding()] |
19 | 41 |
|
20 | 42 | def _should_include(path: Path, base_path: Path, include_patterns: set[str]) -> bool: |
21 | 43 | """ |
@@ -107,9 +129,13 @@ def _is_safe_symlink(symlink_path: Path, base_path: Path) -> bool: |
107 | 129 | `True` if the symlink points within the base directory, `False` otherwise. |
108 | 130 | """ |
109 | 131 | try: |
110 | | - target_path = symlink_path.resolve() |
111 | | - base_resolved = base_path.resolve() |
112 | | - # It's "safe" if target_path == base_resolved or is inside base_resolved |
| 132 | + if platform.system() == 'Windows': |
| 133 | + if not os.path.islink(str(symlink_path)): |
| 134 | + return False |
| 135 | + |
| 136 | + target_path = _normalize_path(symlink_path.resolve()) |
| 137 | + base_resolved = _normalize_path(base_path.resolve()) |
| 138 | + |
113 | 139 | return base_resolved in target_path.parents or target_path == base_resolved |
114 | 140 | except (OSError, ValueError): |
115 | 141 | # If there's any error resolving the paths, consider it unsafe |
@@ -162,10 +188,22 @@ def _read_file_content(file_path: Path) -> str: |
162 | 188 | """ |
163 | 189 | try: |
164 | 190 | if file_path.suffix == ".ipynb": |
165 | | - return process_notebook(file_path) |
| 191 | + try: |
| 192 | + return process_notebook(file_path) |
| 193 | + except Exception as e: |
| 194 | + return f"Error processing notebook: {e}" |
| 195 | + |
| 196 | + for encoding in _get_encoding_list(): |
| 197 | + try: |
| 198 | + with open(file_path, encoding=encoding) as f: |
| 199 | + return f.read() |
| 200 | + except UnicodeDecodeError: |
| 201 | + continue |
| 202 | + except OSError as e: |
| 203 | + return f"Error reading file: {e}" |
| 204 | + |
| 205 | + return "Error: Unable to decode file with available encodings" |
166 | 206 |
|
167 | | - with open(file_path, encoding="utf-8", errors="ignore") as f: |
168 | | - return f.read() |
169 | 207 | except (OSError, InvalidNotebookError) as e: |
170 | 208 | return f"Error reading file: {e}" |
171 | 209 |
|
@@ -531,10 +569,10 @@ def _extract_files_content( |
531 | 569 | content = node["content"] |
532 | 570 |
|
533 | 571 | relative_path = Path(node["path"]).relative_to(query.local_path) |
534 | | - |
| 572 | + # Store paths with forward slashes |
535 | 573 | files.append( |
536 | 574 | { |
537 | | - "path": str(relative_path), |
| 575 | + "path": _normalize_path_str(relative_path), |
538 | 576 | "content": content, |
539 | 577 | "size": node["size"], |
540 | 578 | }, |
@@ -572,7 +610,8 @@ def _create_file_content_string(files: list[dict[str, Any]]) -> str: |
572 | 610 | continue |
573 | 611 |
|
574 | 612 | output += separator |
575 | | - output += f"File: {file['path']}\n" |
| 613 | + # Use forward slashes in output paths |
| 614 | + output += f"File: {_normalize_path_str(file['path'])}\n" |
576 | 615 | output += separator |
577 | 616 | output += f"{file['content']}\n\n" |
578 | 617 |
|
@@ -815,11 +854,13 @@ def run_ingest_query(query: ParsedQuery) -> tuple[str, str, str]: |
815 | 854 | ValueError |
816 | 855 | If the specified path cannot be found or if the file is not a text file. |
817 | 856 | """ |
818 | | - path = query.local_path / query.subpath.lstrip("/") |
| 857 | + subpath = _normalize_path(Path(query.subpath.strip("/"))).as_posix() |
| 858 | + path = _normalize_path(query.local_path / subpath) |
| 859 | + |
819 | 860 | if not path.exists(): |
820 | 861 | raise ValueError(f"{query.slug} cannot be found") |
821 | 862 |
|
822 | 863 | if query.type and query.type == "blob": |
823 | | - return _ingest_single_file(path, query) |
| 864 | + return _ingest_single_file(_normalize_path(path.resolve()), query) |
824 | 865 |
|
825 | | - return _ingest_directory(path, query) |
| 866 | + return _ingest_directory(_normalize_path(path.resolve()), query) |
0 commit comments