Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -173,3 +173,4 @@ Caddyfile

# ignore default output directory
tmp/*
digest.txt
2 changes: 1 addition & 1 deletion src/gitingest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from gitingest.clone import clone_repo
from gitingest.clone import clone_repo, CloneConfig
from gitingest.ingest import ingest
from gitingest.ingest_from_query import ingest_from_query
from gitingest.parse_query import parse_query
Expand Down
45 changes: 40 additions & 5 deletions src/gitingest/cli.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
import os
import pathlib
import click
import sys

from gitingest.ingest import ingest
from gitingest.ingest_from_query import MAX_FILE_SIZE
from .encoding import setup_encoding

# Setup encoding first
setup_encoding()

# Define constants
DEFAULT_IGNORE_PATTERNS = []

@click.command()
@click.argument("source", type=str, required=True)
Expand All @@ -19,14 +27,41 @@ def main(
) -> None:
"""Analyze a directory and create a text dump of its contents."""
try:
# Combine default and custom ignore patterns
from gitingest.ingest import ingest

# Convert paths to absolute with proper encoding
source = str(pathlib.Path(source).resolve())

# Handle patterns
exclude_patterns = list(exclude_pattern)
include_patterns = list(set(include_pattern))


# Set default output name
if not output:
output = "digest.txt"
summary, _, _ = ingest(source, max_size, include_patterns, exclude_patterns, output=output)

output = str(pathlib.Path(output).resolve())

# Call ingest with encoding awareness
summary, tree, content = ingest(
source,
max_size,
include_patterns,
exclude_patterns,
output=output
)

# Write output with explicit encoding
with open(output, 'w', encoding='utf-8', errors='replace') as f:
if isinstance(summary, bytes):
summary = summary.decode('utf-8', errors='replace')
if isinstance(tree, bytes):
tree = tree.decode('utf-8', errors='replace')
if isinstance(content, bytes):
content = content.decode('utf-8', errors='replace')

f.write(f"{summary}\n\n{tree}\n\n{content}")

# Print messages with encoding handling
click.echo(f"Analysis complete! Output written to: {output}")
click.echo("\nSummary:")
click.echo(summary)
Expand Down
17 changes: 17 additions & 0 deletions src/gitingest/encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import sys
import io
import codecs

def setup_encoding():
    """Make ``sys.stdout`` and ``sys.stderr`` emit UTF-8 where possible.

    Each stream is handled independently and on a best-effort basis:

    * A missing stream (``None``) is skipped.
    * A stream whose encoding already resolves to UTF-8 is left untouched.
      The comparison goes through ``codecs.lookup`` so spellings such as
      ``"UTF-8"`` or ``"utf8"`` are recognised.
    * A stream that supports ``reconfigure`` is switched in place to UTF-8
      with ``errors="replace"``; this keeps the existing stream object and
      its buffering settings.
    * Otherwise, if the stream exposes a binary ``buffer``, it is replaced
      by a new ``io.TextIOWrapper`` around that buffer.
    * Any other stream (for example ``io.StringIO``) is left as is, because
      there is no byte layer to re-encode.

    Returns:
        None. The function only has side effects on the ``sys`` module.
    """
    for stream_name in ("stdout", "stderr"):
        stream = getattr(sys, stream_name, None)
        if stream is None:
            continue

        current = getattr(stream, "encoding", None)
        try:
            if current and codecs.lookup(current).name == "utf-8":
                continue
        except (LookupError, TypeError):
            # Unknown or non-string codec name: treat as "not UTF-8".
            pass

        if hasattr(stream, "reconfigure"):
            # In-place switch; avoids stacking a second wrapper on the
            # same underlying buffer.
            stream.reconfigure(encoding="utf-8", errors="replace")
        elif hasattr(stream, "buffer"):
            wrapped = io.TextIOWrapper(
                stream.buffer,
                encoding="utf-8",
                errors="replace",
            )
            setattr(sys, stream_name, wrapped)
70 changes: 54 additions & 16 deletions src/gitingest/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,41 @@
import inspect
import shutil
from pathlib import Path
import io
import sys
from typing import Union

from gitingest.clone import CloneConfig, clone_repo
from gitingest.ingest_from_query import ingest_from_query
# Import other modules from the package
from gitingest.parse_query import parse_query
from gitingest.clone import clone_repo, CloneConfig
from gitingest.ingest_from_query import ingest_from_query

def setup_encoding():
    """Make ``sys.stdout`` and ``sys.stderr`` emit UTF-8 where possible.

    Best-effort and per stream: a missing stream or one that already uses
    UTF-8 (any spelling, checked via ``codecs.lookup``) is left untouched.
    A stream with ``reconfigure`` is switched in place to UTF-8 with
    ``errors="replace"``. Failing that, a stream exposing a binary
    ``buffer`` is replaced by a new ``io.TextIOWrapper``. Streams with
    neither (for example ``io.StringIO``) are left as they are.

    Returns:
        None. The function only has side effects on the ``sys`` module.
    """
    # Imported locally so the function does not depend on a module-level
    # import that this module may not have.
    import codecs

    for stream_name in ("stdout", "stderr"):
        stream = getattr(sys, stream_name, None)
        if stream is None:
            continue

        current = getattr(stream, "encoding", None)
        try:
            if current and codecs.lookup(current).name == "utf-8":
                continue
        except (LookupError, TypeError):
            # Unknown or non-string codec name: treat as "not UTF-8".
            pass

        if hasattr(stream, "reconfigure"):
            # In-place switch; keeps the stream object and its buffering.
            stream.reconfigure(encoding="utf-8", errors="replace")
        elif hasattr(stream, "buffer"):
            wrapped = io.TextIOWrapper(stream.buffer, encoding="utf-8", errors="replace")
            setattr(sys, stream_name, wrapped)

def ingest(
source: str,
max_file_size: int = 10 * 1024 * 1024, # 10 MB
include_patterns: list[str] | str | None = None,
exclude_patterns: list[str] | str | None = None,
output: str | None = None,
) -> tuple[str, str, str]:

def ingest(source: str, max_file_size: int = 10 * 1024 * 1024,
include_patterns: Union[list[str], str] = None,
exclude_patterns: Union[list[str], str] = None,
output: str = None) -> tuple[str, str, str]:
"""
Analyze and create a text dump of source contents.

Args:
source: Path to source directory or git URL
max_file_size: Maximum file size to process in bytes
include_patterns: Patterns to include in analysis
exclude_patterns: Patterns to exclude from analysis
output: Output file path

Returns:
Tuple of (summary, tree, content)
"""
setup_encoding()
query = None

try:
query = parse_query(
source=source,
Expand Down Expand Up @@ -43,14 +64,31 @@ def ingest(
summary, tree, content = ingest_from_query(query)

if output:
with open(f"{output}", "w") as f:
f.write(tree + "\n" + content)
# Write with explicit UTF-8 encoding
with open(output, "w", encoding='utf-8', errors='replace') as f:
# Ensure all content is properly encoded
tree = tree.encode('utf-8', errors='replace').decode('utf-8') if isinstance(tree, str) else tree
content = content.encode('utf-8', errors='replace').decode('utf-8') if isinstance(content, str) else content
f.write(f"{tree}\n{content}")

return summary, tree, content


except UnicodeEncodeError as e:
# Handle encoding errors specifically
error_msg = f"Encoding error while processing {source}: {str(e)}"
raise RuntimeError(error_msg)

except Exception as e:
# Handle other errors
error_msg = f"Error while processing {source}: {str(e)}"
raise RuntimeError(error_msg)

finally:
# Clean up the temporary directory if it was created
if query["url"]:
if query and query.get('url'):
# Get parent directory two levels up from local_path (../tmp)
cleanup_path = str(Path(query["local_path"]).parents[1])
shutil.rmtree(cleanup_path, ignore_errors=True)
cleanup_path = str(Path(query['local_path']).parents[1])
try:
shutil.rmtree(cleanup_path, ignore_errors=True)
except Exception as e:
print(f"Warning: Could not clean up temporary directory: {str(e)}", file=sys.stderr)
Loading