From 7ec79b41ace53749fbe8236659b177aa118791b6 Mon Sep 17 00:00:00 2001 From: Dai Hung PHAM Date: Fri, 27 Dec 2024 11:11:22 +0100 Subject: [PATCH 1/6] a --- digest.txt | 4155 +++++++++++++++++++++++++++++++++++++ src/gitingest/__init__.py | 8 +- src/gitingest/cli.py | 56 +- src/gitingest/encoding.py | 17 + src/gitingest/ingest.py | 62 +- 5 files changed, 4276 insertions(+), 22 deletions(-) create mode 100644 digest.txt create mode 100644 src/gitingest/encoding.py diff --git a/digest.txt b/digest.txt new file mode 100644 index 00000000..bbc85aa4 --- /dev/null +++ b/digest.txt @@ -0,0 +1,4155 @@ +Repository: __temp/gitingest +Files analyzed: 76 + +Estimated tokens: 35.5k + +Directory structure: +└── __temp/gitingest/ + ├── .dockerignore + ├── .env + ├── .git/ + │ ├── config + │ ├── description + │ ├── HEAD + │ ├── hooks/ + │ │ ├── applypatch-msg.sample + │ │ ├── commit-msg.sample + │ │ ├── fsmonitor-watchman.sample + │ │ ├── post-update.sample + │ │ ├── pre-applypatch.sample + │ │ ├── pre-commit.sample + │ │ ├── pre-merge-commit.sample + │ │ ├── pre-push.sample + │ │ ├── pre-rebase.sample + │ │ ├── pre-receive.sample + │ │ ├── prepare-commit-msg.sample + │ │ ├── push-to-checkout.sample + │ │ ├── sendemail-validate.sample + │ │ └── update.sample + │ ├── index + │ ├── info/ + │ │ └── exclude + │ ├── logs/ + │ │ ├── HEAD + │ │ └── refs/ + │ │ ├── heads/ + │ │ │ └── main + │ │ └── remotes/ + │ │ └── origin/ + │ │ └── HEAD + │ ├── objects/ + │ │ ├── info/ + │ │ └── pack/ + │ │ ├── pack-c963e3b12abfebe0a3e8789a988f3557eb073ce4.idx + │ │ ├── pack-c963e3b12abfebe0a3e8789a988f3557eb073ce4.pack + │ │ └── pack-c963e3b12abfebe0a3e8789a988f3557eb073ce4.rev + │ ├── packed-refs + │ └── refs/ + │ ├── heads/ + │ │ └── main + │ ├── remotes/ + │ │ └── origin/ + │ │ └── HEAD + │ └── tags/ + ├── .github/ + │ └── workflows/ + │ └── unitest.yml + ├── .gitignore + ├── .venv/ + │ ├── Lib/ + │ │ └── site-packages/ + │ │ ├── charset_normalizer/ + │ │ ├── httptools/ + │ │ │ └── parser/ + │ │ ├── markupsafe/ + │ │ ├── pydantic_core/ + │ │ ├── regex/ + │ │ ├── tiktoken/ + │ │ ├── watchfiles/ + │ │ ├── websockets/ + │ │ └── wrapt/ + │ └── Scripts/ + │ ├── python.exe + │ └── uvicorn.exe + ├── CODE_OF_CONDUCT.md + ├── digest.txt + ├── Dockerfile + ├── docs/ + ├── LICENSE + ├── pytest.ini + ├── README.md + ├── requirements.txt + ├── SECURITY.md + ├── setup.py + └── src/ + ├── config.py + ├── gitingest/ + │ ├── cli.py + │ ├── clone.py + │ ├── encoding.py + │ ├── ingest.py + │ ├── ingest_from_query.py + │ ├── parse_query.py + │ ├── tests/ + │ │ ├── conftest.py + │ │ ├── test_clone.py + │ │ ├── test_ingest.py + │ │ ├── test_parse_query.py + │ │ └── __init__.py + │ ├── utils.py + │ ├── __init__.py + │ └── __pycache__/ + ├── main.py + ├── process_query.py + ├── routers/ + │ ├── download.py + │ ├── dynamic.py + │ ├── index.py + │ ├── __init__.py + │ └── __pycache__/ + ├── server_utils.py + ├── static/ + │ ├── js/ + │ │ ├── snow.js + │ │ └── utils.js + │ └── robots.txt + ├── templates/ + │ ├── api.jinja + │ ├── base.jinja + │ ├── components/ + │ │ ├── footer.jinja + │ │ ├── github_form.jinja + │ │ ├── navbar.jinja + │ │ └── result.jinja + │ ├── github.jinja + │ └── index.jinja + ├── __init__.py + └── __pycache__/ + + +================================================ +File: /README.md +================================================ +[![Image](./docs/frontpage.png "GitIngest main page")](https://gitingest.com/) + +![License](https://img.shields.io/badge/license-MIT-blue.svg) + +# GitIngest 🔍 +Turn any Git repository into a 
prompt-friendly text ingest for LLMs.
+
+You can also replace `hub` with `ingest` in any GitHub URL to access the corresponding digest.
+
+[gitingest.com](https://gitingest.com/)
+
+
+## 🚀 Features
+
+- **Easy code context**: Get a text digest from a Git repository URL or a directory
+- **Smart Formatting**: Optimized output format for LLM prompts
+- **Statistics**:
+  - File and directory structure
+  - Size of the extract
+  - Token count
+- **CLI tool**: Run it as a command (currently Linux only)
+- **Python package**: Import it in your code
+
+
+## 📦 Installation
+
+```
+pip install gitingest
+```
+
+
+## 💡 Command line usage
+
+The `gitingest` command line tool allows you to analyze codebases and create a text dump of their contents.
+
+```bash
+# Basic usage
+gitingest /path/to/directory
+
+# From URL
+gitingest https://github.com/cyclotruc/gitingest
+
+# See more options
+gitingest --help
+```
+
+This will write the digest to a text file (default `digest.txt`) in your current working directory.
+
+
+## 🐛 Python package usage
+
+```python
+from gitingest import ingest
+
+summary, tree, content = ingest("path/to/directory")
+
+# or from URL
+summary, tree, content = ingest("https://github.com/cyclotruc/gitingest")
+```
+
+By default, this won't write a file; pass the `output` argument to write the digest to disk.
+
+
+## 🛠️ Built with
+- Tailwind CSS - Frontend
+- [FastAPI](https://github.com/fastapi/fastapi) - Backend framework
+- [tiktoken](https://github.com/openai/tiktoken) - Token estimation
+- [apianalytics.dev](https://www.apianalytics.dev/) - Simple Analytics
+
+
+## 🌐 Self-host
+1. Build the image:
+```
+docker build -t gitingest .
+```
+
+2. Run the container:
+```
+docker run -d --name gitingest -p 8000:8000 gitingest
+```
+The application will be available at `http://localhost:8000`.
+Ensure environment variables are set before running the application or deploying it via Docker.
+
+## ✔️ Contributing
+
+Contributions are welcome!
+
+Gitingest aims to be friendly to first-time contributors, with a simple Python and HTML codebase. If you need any help while working with the code, reach out to us on [Discord](https://discord.com/invite/zerRaGK9EC).
+
+### Ways to contribute
+
+1. Provide your feedback and ideas on Discord
+2. Open an issue on GitHub to report a bug
+3. Create a pull request
+   - Fork the repository
+   - Make your changes and test them locally
+   - Open a pull request for review and feedback
+
+### 🔧 Local dev
+
+#### Environment configuration
+You can configure the application using the following environment variables:
+
+- **`ALLOWED_HOSTS`**: Specify allowed hostnames for the application. Default: `"gitingest.com,*.gitingest.com,gitdigest.dev,localhost"`.
+
+```bash
+ALLOWED_HOSTS="gitingest.local,localhost"
+```
+
+#### Run locally
+1. Clone the repository
+```bash
+git clone https://github.com/cyclotruc/gitingest.git
+cd gitingest
+```
+
+2. Install dependencies
+```bash
+pip install -r requirements.txt
+```
+
+3. 
Run the application: +```bash +cd src +uvicorn main:app --reload +``` + +The frontend will be available at `localhost:8000` + + + + +================================================ +File: /.dockerignore +================================================ +# Git +.git +.gitignore + +# Python +__pycache__ +*.pyc +*.pyo +*.pyd +.Python +env +pip-log.txt +pip-delete-this-directory.txt +.tox +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.log + +# Virtual environment +venv +.env +.venv +ENV + +# IDE +.idea +.vscode +*.swp +*.swo + +# Project specific +docs/ +tests/ +*.md +LICENSE +pytest.ini +setup.py + + +================================================ +File: /.env +================================================ +ALLOWED_HOSTS="gitingest.local,localhost" + + +================================================ +File: /.git\config +================================================ +[core] + repositoryformatversion = 0 + filemode = false + bare = false + logallrefupdates = true + symlinks = false + ignorecase = true +[remote "origin"] + url = https://github.com/cyclotruc/gitingest.git + fetch = +refs/heads/*:refs/remotes/origin/* +[branch "main"] + remote = origin + merge = refs/heads/main + vscode-merge-base = origin/main + + +================================================ +File: /.git\description +================================================ +Unnamed repository; edit this file 'description' to name the repository. + + +================================================ +File: /.git\HEAD +================================================ +ref: refs/heads/main + + +================================================ +File: /.git\hooks\applypatch-msg.sample +================================================ +#!/bin/sh +# +# An example hook script to check the commit log message taken by +# applypatch from an e-mail message. +# +# The hook should exit with non-zero status after issuing an +# appropriate message if it wants to stop the commit. The hook is +# allowed to edit the commit message file. +# +# To enable this hook, rename this file to "applypatch-msg". + +. git-sh-setup +commitmsg="$(git rev-parse --git-path hooks/commit-msg)" +test -x "$commitmsg" && exec "$commitmsg" ${1+"$@"} +: + + +================================================ +File: /.git\hooks\commit-msg.sample +================================================ +#!/bin/sh +# +# An example hook script to check the commit log message. +# Called by "git commit" with one argument, the name of the file +# that has the commit message. The hook should exit with non-zero +# status after issuing an appropriate message if it wants to stop the +# commit. The hook is allowed to edit the commit message file. +# +# To enable this hook, rename this file to "commit-msg". + +# Uncomment the below to add a Signed-off-by line to the message. +# Doing this in a hook is a bad idea in general, but the prepare-commit-msg +# hook is more suited to it. +# +# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') +# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1" + +# This example catches duplicate Signed-off-by lines. + +test "" = "$(grep '^Signed-off-by: ' "$1" | + sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || { + echo >&2 Duplicate Signed-off-by lines. 
+ exit 1 +} + + +================================================ +File: /.git\hooks\fsmonitor-watchman.sample +================================================ +#!/usr/bin/perl + +use strict; +use warnings; +use IPC::Open2; + +# An example hook script to integrate Watchman +# (https://facebook.github.io/watchman/) with git to speed up detecting +# new and modified files. +# +# The hook is passed a version (currently 2) and last update token +# formatted as a string and outputs to stdout a new update token and +# all files that have been modified since the update token. Paths must +# be relative to the root of the working tree and separated by a single NUL. +# +# To enable this hook, rename this file to "query-watchman" and set +# 'git config core.fsmonitor .git/hooks/query-watchman' +# +my ($version, $last_update_token) = @ARGV; + +# Uncomment for debugging +# print STDERR "$0 $version $last_update_token\n"; + +# Check the hook interface version +if ($version ne 2) { + die "Unsupported query-fsmonitor hook version '$version'.\n" . + "Falling back to scanning...\n"; +} + +my $git_work_tree = get_working_dir(); + +my $retry = 1; + +my $json_pkg; +eval { + require JSON::XS; + $json_pkg = "JSON::XS"; + 1; +} or do { + require JSON::PP; + $json_pkg = "JSON::PP"; +}; + +launch_watchman(); + +sub launch_watchman { + my $o = watchman_query(); + if (is_work_tree_watched($o)) { + output_result($o->{clock}, @{$o->{files}}); + } +} + +sub output_result { + my ($clockid, @files) = @_; + + # Uncomment for debugging watchman output + # open (my $fh, ">", ".git/watchman-output.out"); + # binmode $fh, ":utf8"; + # print $fh "$clockid\n@files\n"; + # close $fh; + + binmode STDOUT, ":utf8"; + print $clockid; + print "\0"; + local $, = "\0"; + print @files; +} + +sub watchman_clock { + my $response = qx/watchman clock "$git_work_tree"/; + die "Failed to get clock id on '$git_work_tree'.\n" . + "Falling back to scanning...\n" if $? != 0; + + return $json_pkg->new->utf8->decode($response); +} + +sub watchman_query { + my $pid = open2(\*CHLD_OUT, \*CHLD_IN, 'watchman -j --no-pretty') + or die "open2() failed: $!\n" . + "Falling back to scanning...\n"; + + # In the query expression below we're asking for names of files that + # changed since $last_update_token but not from the .git folder. + # + # To accomplish this, we're using the "since" generator to use the + # recency index to select candidate nodes and "fields" to limit the + # output to file names only. Then we're using the "expression" term to + # further constrain the results. + my $last_update_line = ""; + if (substr($last_update_token, 0, 1) eq "c") { + $last_update_token = "\"$last_update_token\""; + $last_update_line = qq[\n"since": $last_update_token,]; + } + my $query = <<" END"; + ["query", "$git_work_tree", {$last_update_line + "fields": ["name"], + "expression": ["not", ["dirname", ".git"]] + }] + END + + # Uncomment for debugging the watchman query + # open (my $fh, ">", ".git/watchman-query.json"); + # print $fh $query; + # close $fh; + + print CHLD_IN $query; + close CHLD_IN; + my $response = do {local $/; }; + + # Uncomment for debugging the watch response + # open ($fh, ">", ".git/watchman-response.json"); + # print $fh $response; + # close $fh; + + die "Watchman: command returned no output.\n" . + "Falling back to scanning...\n" if $response eq ""; + die "Watchman: command returned invalid output: $response\n" . 
+ "Falling back to scanning...\n" unless $response =~ /^\{/; + + return $json_pkg->new->utf8->decode($response); +} + +sub is_work_tree_watched { + my ($output) = @_; + my $error = $output->{error}; + if ($retry > 0 and $error and $error =~ m/unable to resolve root .* directory (.*) is not watched/) { + $retry--; + my $response = qx/watchman watch "$git_work_tree"/; + die "Failed to make watchman watch '$git_work_tree'.\n" . + "Falling back to scanning...\n" if $? != 0; + $output = $json_pkg->new->utf8->decode($response); + $error = $output->{error}; + die "Watchman: $error.\n" . + "Falling back to scanning...\n" if $error; + + # Uncomment for debugging watchman output + # open (my $fh, ">", ".git/watchman-output.out"); + # close $fh; + + # Watchman will always return all files on the first query so + # return the fast "everything is dirty" flag to git and do the + # Watchman query just to get it over with now so we won't pay + # the cost in git to look up each individual file. + my $o = watchman_clock(); + $error = $output->{error}; + + die "Watchman: $error.\n" . + "Falling back to scanning...\n" if $error; + + output_result($o->{clock}, ("/")); + $last_update_token = $o->{clock}; + + eval { launch_watchman() }; + return 0; + } + + die "Watchman: $error.\n" . + "Falling back to scanning...\n" if $error; + + return 1; +} + +sub get_working_dir { + my $working_dir; + if ($^O =~ 'msys' || $^O =~ 'cygwin') { + $working_dir = Win32::GetCwd(); + $working_dir =~ tr/\\/\//; + } else { + require Cwd; + $working_dir = Cwd::cwd(); + } + + return $working_dir; +} + + +================================================ +File: /.git\hooks\post-update.sample +================================================ +#!/bin/sh +# +# An example hook script to prepare a packed repository for use over +# dumb transports. +# +# To enable this hook, rename this file to "post-update". + +exec git update-server-info + + +================================================ +File: /.git\hooks\pre-applypatch.sample +================================================ +#!/bin/sh +# +# An example hook script to verify what is about to be committed +# by applypatch from an e-mail message. +# +# The hook should exit with non-zero status after issuing an +# appropriate message if it wants to stop the commit. +# +# To enable this hook, rename this file to "pre-applypatch". + +. git-sh-setup +precommit="$(git rev-parse --git-path hooks/pre-commit)" +test -x "$precommit" && exec "$precommit" ${1+"$@"} +: + + +================================================ +File: /.git\hooks\pre-commit.sample +================================================ +#!/bin/sh +# +# An example hook script to verify what is about to be committed. +# Called by "git commit" with no arguments. The hook should +# exit with non-zero status after issuing an appropriate message if +# it wants to stop the commit. +# +# To enable this hook, rename this file to "pre-commit". + +if git rev-parse --verify HEAD >/dev/null 2>&1 +then + against=HEAD +else + # Initial commit: diff against an empty tree object + against=$(git hash-object -t tree /dev/null) +fi + +# If you want to allow non-ASCII filenames set this variable to true. +allownonascii=$(git config --type=bool hooks.allownonascii) + +# Redirect output to stderr. +exec 1>&2 + +# Cross platform projects tend to avoid non-ASCII filenames; prevent +# them from being added to the repository. We exploit the fact that the +# printable range starts at the space character and ends with tilde. 
+if [ "$allownonascii" != "true" ] && + # Note that the use of brackets around a tr range is ok here, (it's + # even required, for portability to Solaris 10's /usr/bin/tr), since + # the square bracket bytes happen to fall in the designated range. + test $(git diff-index --cached --name-only --diff-filter=A -z $against | + LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0 +then + cat <<\EOF +Error: Attempt to add a non-ASCII file name. + +This can cause problems if you want to work with people on other platforms. + +To be portable it is advisable to rename the file. + +If you know what you are doing you can disable this check using: + + git config hooks.allownonascii true +EOF + exit 1 +fi + +# If there are whitespace errors, print the offending file names and fail. +exec git diff-index --check --cached $against -- + + +================================================ +File: /.git\hooks\pre-merge-commit.sample +================================================ +#!/bin/sh +# +# An example hook script to verify what is about to be committed. +# Called by "git merge" with no arguments. The hook should +# exit with non-zero status after issuing an appropriate message to +# stderr if it wants to stop the merge commit. +# +# To enable this hook, rename this file to "pre-merge-commit". + +. git-sh-setup +test -x "$GIT_DIR/hooks/pre-commit" && + exec "$GIT_DIR/hooks/pre-commit" +: + + +================================================ +File: /.git\hooks\pre-push.sample +================================================ +#!/bin/sh + +# An example hook script to verify what is about to be pushed. Called by "git +# push" after it has checked the remote status, but before anything has been +# pushed. If this script exits with a non-zero status nothing will be pushed. +# +# This hook is called with the following parameters: +# +# $1 -- Name of the remote to which the push is being done +# $2 -- URL to which the push is being done +# +# If pushing without using a named remote those arguments will be equal. +# +# Information about the commits which are being pushed is supplied as lines to +# the standard input in the form: +# +# +# +# This sample shows how to prevent push of commits where the log message starts +# with "WIP" (work in progress). + +remote="$1" +url="$2" + +zero=$(git hash-object --stdin &2 "Found WIP commit in $local_ref, not pushing" + exit 1 + fi + fi +done + +exit 0 + + +================================================ +File: /.git\hooks\pre-rebase.sample +================================================ +#!/bin/sh +# +# Copyright (c) 2006, 2008 Junio C Hamano +# +# The "pre-rebase" hook is run just before "git rebase" starts doing +# its job, and can prevent the command from running by exiting with +# non-zero status. +# +# The hook is called with the following parameters: +# +# $1 -- the upstream the series was forked from. +# $2 -- the branch being rebased (or empty when rebasing the current branch). +# +# This sample shows how to prevent topic branches that are already +# merged to 'next' branch from getting rebased, because allowing it +# would result in rebasing already published history. + +publish=next +basebranch="$1" +if test "$#" = 2 +then + topic="refs/heads/$2" +else + topic=`git symbolic-ref HEAD` || + exit 0 ;# we do not interrupt rebasing detached HEAD +fi + +case "$topic" in +refs/heads/??/*) + ;; +*) + exit 0 ;# we do not interrupt others. + ;; +esac + +# Now we are dealing with a topic branch being rebased +# on top of master. Is it OK to rebase it? 
+ +# Does the topic really exist? +git show-ref -q "$topic" || { + echo >&2 "No such branch $topic" + exit 1 +} + +# Is topic fully merged to master? +not_in_master=`git rev-list --pretty=oneline ^master "$topic"` +if test -z "$not_in_master" +then + echo >&2 "$topic is fully merged to master; better remove it." + exit 1 ;# we could allow it, but there is no point. +fi + +# Is topic ever merged to next? If so you should not be rebasing it. +only_next_1=`git rev-list ^master "^$topic" ${publish} | sort` +only_next_2=`git rev-list ^master ${publish} | sort` +if test "$only_next_1" = "$only_next_2" +then + not_in_topic=`git rev-list "^$topic" master` + if test -z "$not_in_topic" + then + echo >&2 "$topic is already up to date with master" + exit 1 ;# we could allow it, but there is no point. + else + exit 0 + fi +else + not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"` + /usr/bin/perl -e ' + my $topic = $ARGV[0]; + my $msg = "* $topic has commits already merged to public branch:\n"; + my (%not_in_next) = map { + /^([0-9a-f]+) /; + ($1 => 1); + } split(/\n/, $ARGV[1]); + for my $elem (map { + /^([0-9a-f]+) (.*)$/; + [$1 => $2]; + } split(/\n/, $ARGV[2])) { + if (!exists $not_in_next{$elem->[0]}) { + if ($msg) { + print STDERR $msg; + undef $msg; + } + print STDERR " $elem->[1]\n"; + } + } + ' "$topic" "$not_in_next" "$not_in_master" + exit 1 +fi + +<<\DOC_END + +This sample hook safeguards topic branches that have been +published from being rewound. + +The workflow assumed here is: + + * Once a topic branch forks from "master", "master" is never + merged into it again (either directly or indirectly). + + * Once a topic branch is fully cooked and merged into "master", + it is deleted. If you need to build on top of it to correct + earlier mistakes, a new topic branch is created by forking at + the tip of the "master". This is not strictly necessary, but + it makes it easier to keep your history simple. + + * Whenever you need to test or publish your changes to topic + branches, merge them into "next" branch. + +The script, being an example, hardcodes the publish branch name +to be "next", but it is trivial to make it configurable via +$GIT_DIR/config mechanism. + +With this workflow, you would want to know: + +(1) ... if a topic branch has ever been merged to "next". Young + topic branches can have stupid mistakes you would rather + clean up before publishing, and things that have not been + merged into other branches can be easily rebased without + affecting other people. But once it is published, you would + not want to rewind it. + +(2) ... if a topic branch has been fully merged to "master". + Then you can delete it. More importantly, you should not + build on top of it -- other people may already want to + change things related to the topic as patches against your + "master", so if you need further changes, it is better to + fork the topic (perhaps with the same name) afresh from the + tip of "master". + +Let's look at this example: + + o---o---o---o---o---o---o---o---o---o "next" + / / / / + / a---a---b A / / + / / / / + / / c---c---c---c B / + / / / \ / + / / / b---b C \ / + / / / / \ / + ---o---o---o---o---o---o---o---o---o---o---o "master" + + +A, B and C are topic branches. + + * A has one fix since it was merged up to "next". + + * B has finished. It has been fully merged up to "master" and "next", + and is ready to be deleted. + + * C has not merged to "next" at all. + +We would want to allow C to be rebased, refuse A, and encourage +B to be deleted. 
+ +To compute (1): + + git rev-list ^master ^topic next + git rev-list ^master next + + if these match, topic has not merged in next at all. + +To compute (2): + + git rev-list master..topic + + if this is empty, it is fully merged to "master". + +DOC_END + + +================================================ +File: /.git\hooks\pre-receive.sample +================================================ +#!/bin/sh +# +# An example hook script to make use of push options. +# The example simply echoes all push options that start with 'echoback=' +# and rejects all pushes when the "reject" push option is used. +# +# To enable this hook, rename this file to "pre-receive". + +if test -n "$GIT_PUSH_OPTION_COUNT" +then + i=0 + while test "$i" -lt "$GIT_PUSH_OPTION_COUNT" + do + eval "value=\$GIT_PUSH_OPTION_$i" + case "$value" in + echoback=*) + echo "echo from the pre-receive-hook: ${value#*=}" >&2 + ;; + reject) + exit 1 + esac + i=$((i + 1)) + done +fi + + +================================================ +File: /.git\hooks\prepare-commit-msg.sample +================================================ +#!/bin/sh +# +# An example hook script to prepare the commit log message. +# Called by "git commit" with the name of the file that has the +# commit message, followed by the description of the commit +# message's source. The hook's purpose is to edit the commit +# message file. If the hook fails with a non-zero status, +# the commit is aborted. +# +# To enable this hook, rename this file to "prepare-commit-msg". + +# This hook includes three examples. The first one removes the +# "# Please enter the commit message..." help message. +# +# The second includes the output of "git diff --name-status -r" +# into the message, just before the "git status" output. It is +# commented because it doesn't cope with --amend or with squashed +# commits. +# +# The third example adds a Signed-off-by line to the message, that can +# still be edited. This is rarely a good idea. + +COMMIT_MSG_FILE=$1 +COMMIT_SOURCE=$2 +SHA1=$3 + +/usr/bin/perl -i.bak -ne 'print unless(m/^. Please enter the commit message/..m/^#$/)' "$COMMIT_MSG_FILE" + +# case "$COMMIT_SOURCE,$SHA1" in +# ,|template,) +# /usr/bin/perl -i.bak -pe ' +# print "\n" . `git diff --cached --name-status -r` +# if /^#/ && $first++ == 0' "$COMMIT_MSG_FILE" ;; +# *) ;; +# esac + +# SOB=$(git var GIT_COMMITTER_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') +# git interpret-trailers --in-place --trailer "$SOB" "$COMMIT_MSG_FILE" +# if test -z "$COMMIT_SOURCE" +# then +# /usr/bin/perl -i.bak -pe 'print "\n" if !$first_line++' "$COMMIT_MSG_FILE" +# fi + + +================================================ +File: /.git\hooks\push-to-checkout.sample +================================================ +#!/bin/sh + +# An example hook script to update a checked-out tree on a git push. +# +# This hook is invoked by git-receive-pack(1) when it reacts to git +# push and updates reference(s) in its repository, and when the push +# tries to update the branch that is currently checked out and the +# receive.denyCurrentBranch configuration variable is set to +# updateInstead. +# +# By default, such a push is refused if the working tree and the index +# of the remote repository has any difference from the currently +# checked out commit; when both the working tree and the index match +# the current commit, they are updated to match the newly pushed tip +# of the branch. 
This hook is to be used to override the default +# behaviour; however the code below reimplements the default behaviour +# as a starting point for convenient modification. +# +# The hook receives the commit with which the tip of the current +# branch is going to be updated: +commit=$1 + +# It can exit with a non-zero status to refuse the push (when it does +# so, it must not modify the index or the working tree). +die () { + echo >&2 "$*" + exit 1 +} + +# Or it can make any necessary changes to the working tree and to the +# index to bring them to the desired state when the tip of the current +# branch is updated to the new commit, and exit with a zero status. +# +# For example, the hook can simply run git read-tree -u -m HEAD "$1" +# in order to emulate git fetch that is run in the reverse direction +# with git push, as the two-tree form of git read-tree -u -m is +# essentially the same as git switch or git checkout that switches +# branches while keeping the local changes in the working tree that do +# not interfere with the difference between the branches. + +# The below is a more-or-less exact translation to shell of the C code +# for the default behaviour for git's push-to-checkout hook defined in +# the push_to_deploy() function in builtin/receive-pack.c. +# +# Note that the hook will be executed from the repository directory, +# not from the working tree, so if you want to perform operations on +# the working tree, you will have to adapt your code accordingly, e.g. +# by adding "cd .." or using relative paths. + +if ! git update-index -q --ignore-submodules --refresh +then + die "Up-to-date check failed" +fi + +if ! git diff-files --quiet --ignore-submodules -- +then + die "Working directory has unstaged changes" +fi + +# This is a rough translation of: +# +# head_has_history() ? "HEAD" : EMPTY_TREE_SHA1_HEX +if git cat-file -e HEAD 2>/dev/null +then + head=HEAD +else + head=$(git hash-object -t tree --stdin &2 + exit 1 +} + +unset GIT_DIR GIT_WORK_TREE +cd "$worktree" && + +if grep -q "^diff --git " "$1" +then + validate_patch "$1" +else + validate_cover_letter "$1" +fi && + +if test "$GIT_SENDEMAIL_FILE_COUNTER" = "$GIT_SENDEMAIL_FILE_TOTAL" +then + git config --unset-all sendemail.validateWorktree && + trap 'git worktree remove -ff "$worktree"' EXIT && + validate_series +fi + + +================================================ +File: /.git\hooks\update.sample +================================================ +#!/bin/sh +# +# An example hook script to block unannotated tags from entering. +# Called by "git receive-pack" with arguments: refname sha1-old sha1-new +# +# To enable this hook, rename this file to "update". +# +# Config +# ------ +# hooks.allowunannotated +# This boolean sets whether unannotated tags will be allowed into the +# repository. By default they won't be. +# hooks.allowdeletetag +# This boolean sets whether deleting tags will be allowed in the +# repository. By default they won't be. +# hooks.allowmodifytag +# This boolean sets whether a tag may be modified after creation. By default +# it won't be. +# hooks.allowdeletebranch +# This boolean sets whether deleting branches will be allowed in the +# repository. By default they won't be. +# hooks.denycreatebranch +# This boolean sets whether remotely creating branches will be denied +# in the repository. By default this is allowed. +# + +# --- Command line +refname="$1" +oldrev="$2" +newrev="$3" + +# --- Safety check +if [ -z "$GIT_DIR" ]; then + echo "Don't run this script from the command line." 
>&2 + echo " (if you want, you could supply GIT_DIR then run" >&2 + echo " $0 )" >&2 + exit 1 +fi + +if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then + echo "usage: $0 " >&2 + exit 1 +fi + +# --- Config +allowunannotated=$(git config --type=bool hooks.allowunannotated) +allowdeletebranch=$(git config --type=bool hooks.allowdeletebranch) +denycreatebranch=$(git config --type=bool hooks.denycreatebranch) +allowdeletetag=$(git config --type=bool hooks.allowdeletetag) +allowmodifytag=$(git config --type=bool hooks.allowmodifytag) + +# check for no description +projectdesc=$(sed -e '1q' "$GIT_DIR/description") +case "$projectdesc" in +"Unnamed repository"* | "") + echo "*** Project description file hasn't been set" >&2 + exit 1 + ;; +esac + +# --- Check types +# if $newrev is 0000...0000, it's a commit to delete a ref. +zero=$(git hash-object --stdin &2 + echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2 + exit 1 + fi + ;; + refs/tags/*,delete) + # delete tag + if [ "$allowdeletetag" != "true" ]; then + echo "*** Deleting a tag is not allowed in this repository" >&2 + exit 1 + fi + ;; + refs/tags/*,tag) + # annotated tag + if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1 + then + echo "*** Tag '$refname' already exists." >&2 + echo "*** Modifying a tag is not allowed in this repository." >&2 + exit 1 + fi + ;; + refs/heads/*,commit) + # branch + if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then + echo "*** Creating a branch is not allowed in this repository" >&2 + exit 1 + fi + ;; + refs/heads/*,delete) + # delete branch + if [ "$allowdeletebranch" != "true" ]; then + echo "*** Deleting a branch is not allowed in this repository" >&2 + exit 1 + fi + ;; + refs/remotes/*,commit) + # tracking branch + ;; + refs/remotes/*,delete) + # delete tracking branch + if [ "$allowdeletebranch" != "true" ]; then + echo "*** Deleting a tracking branch is not allowed in this repository" >&2 + exit 1 + fi + ;; + *) + # Anything else (is there anything else?) + echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2 + exit 1 + ;; +esac + +# --- Finished +exit 0 + + +================================================ +File: /.git\info\exclude +================================================ +# git ls-files --others --exclude-from=.git/info/exclude +# Lines that start with '#' are comments. 
+# For a project mostly in C, the following would be a good set of +# exclude patterns (uncomment them if you want to use them): +# *.[oa] +# *~ + + +================================================ +File: /.git\logs\HEAD +================================================ +0000000000000000000000000000000000000000 38f59ba682df7942661d0907e68d38eab52e2d74 Dai Hung PHAM 1735292676 +0100 clone: from https://github.com/cyclotruc/gitingest.git + + +================================================ +File: /.git\logs\refs\heads\main +================================================ +0000000000000000000000000000000000000000 38f59ba682df7942661d0907e68d38eab52e2d74 Dai Hung PHAM 1735292676 +0100 clone: from https://github.com/cyclotruc/gitingest.git + + +================================================ +File: /.git\logs\refs\remotes\origin\HEAD +================================================ +0000000000000000000000000000000000000000 38f59ba682df7942661d0907e68d38eab52e2d74 Dai Hung PHAM 1735292676 +0100 clone: from https://github.com/cyclotruc/gitingest.git + + +================================================ +File: /.git\packed-refs +================================================ +# pack-refs with: peeled fully-peeled sorted +38f59ba682df7942661d0907e68d38eab52e2d74 refs/remotes/origin/main + + +================================================ +File: /.git\refs\heads\main +================================================ +38f59ba682df7942661d0907e68d38eab52e2d74 + + +================================================ +File: /.git\refs\remotes\origin\HEAD +================================================ +ref: refs/remotes/origin/main + + +================================================ +File: /.github\workflows\unitest.yml +================================================ +name: Unit Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest pytest-asyncio + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install -e . + + - name: Run tests + run: | + pytest + +================================================ +File: /.gitignore +================================================ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +tmp/* + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ +.vscode/settings.json +.DS_Store + +# Project specific +history.txt +cleanup.py +Caddyfile + +# ignore default output directory +tmp/* + + +================================================ +File: /CODE_OF_CONDUCT.md +================================================ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. 
+ +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +romain@coderamp.io. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. 
Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. + + +================================================ +File: /Dockerfile +================================================ +# Build stage +FROM python:3.12-slim AS builder + +WORKDIR /build + +# Copy requirements first to leverage Docker cache +COPY requirements.txt . + +# Install build dependencies and Python packages +RUN apt-get update \ + && apt-get install -y --no-install-recommends gcc python3-dev \ + && pip install --no-cache-dir --upgrade pip \ + && pip install --no-cache-dir --timeout 1000 -r requirements.txt \ + && rm -rf /var/lib/apt/lists/* + +# Runtime stage +FROM python:3.12-slim + +# Set Python environment variables +ENV PYTHONUNBUFFERED=1 +ENV PYTHONDONTWRITEBYTECODE=1 + +# Install git +RUN apt-get update \ + && apt-get install -y --no-install-recommends git \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Create a non-root user +RUN useradd -m -u 1000 appuser + +COPY --from=builder /usr/local/lib/python3.12/site-packages/ /usr/local/lib/python3.12/site-packages/ +COPY src/ ./ + +# Change ownership of the application files +RUN chown -R appuser:appuser /app + +# Switch to non-root user +USER appuser + +EXPOSE 8000 + +CMD ["python", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] + + +================================================ +File: /LICENSE +================================================ +MIT License + +Copyright (c) 2024 Romain Courtois + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +================================================ +File: /pytest.ini +================================================ +[pytest] +pythonpath = src +testpaths = src/gitingest/tests +asyncio_mode = auto + + +python_files = test_*.py +python_classes = Test* +python_functions = test_* + +================================================ +File: /requirements.txt +================================================ +fastapi[standard] +uvicorn +fastapi-analytics +slowapi +tiktoken +pytest +pytest-asyncio +click>=8.0.0 + + +================================================ +File: /SECURITY.md +================================================ +# Security Policy + +## Reporting a Vulnerability + +If you have discovered a vulnerability inside the project, report it privately at romain@coderamp.io. This way the maintainer can work on a proper fix without disclosing the problem to the public before it has been solved. + + +================================================ +File: /setup.py +================================================ +from setuptools import setup, find_packages + +setup( + name="gitingest", + version="0.1.2", + packages=find_packages(where="src"), + package_dir={"": "src"}, + include_package_data=True, + install_requires=[ + "click>=8.0.0", + "tiktoken", + ], + entry_points={ + "console_scripts": [ + "gitingest=gitingest.cli:main", + ], + }, + python_requires=">=3.6", + author="Romain Courtois", + author_email="romain@coderamp.io", + description="CLI tool to analyze and create text dumps of codebases for LLMs", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", + url="https://github.com/cyclotruc/gitingest", + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + ], +) + +================================================ +File: /src\config.py +================================================ +MAX_DISPLAY_SIZE = 300000 +TMP_BASE_PATH = "../tmp" + +EXAMPLE_REPOS = [ + {"name": "Gitingest", "url": "https://github.com/cyclotruc/gitingest"}, + {"name": "FastAPI", "url": "https://github.com/tiangolo/fastapi"}, + {"name": "Flask", "url": "https://github.com/pallets/flask"}, + {"name": "Tldraw", "url": "https://github.com/tldraw/tldraw"}, + {"name": "ApiAnalytics", "url": "https://github.com/tom-draper/api-analytics"}, +] + + +================================================ +File: /src\gitingest\cli.py +================================================ +import os +import pathlib +import click +import sys +from .encoding import setup_encoding + +# Setup encoding first +setup_encoding() + +# Define constants +MAX_FILE_SIZE = 51200 # 50KB by default +DEFAULT_IGNORE_PATTERNS = [] + +def normalize_pattern(pattern: str) -> str: + pattern = pattern.strip() + pattern = pattern.lstrip(os.sep) + if pattern.endswith(os.sep): + pattern += "*" + return pattern + +@click.command() +@click.argument('source', type=str, required=True) +@click.option('--output', '-o', default=None, + 
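+              # A None default is intentional: main() below falls back to
+              # writing "digest.txt" in the current working directory.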
help='Output file path (default: digest.txt in current directory)')
+@click.option('--max-size', '-s', default=MAX_FILE_SIZE,
+              help='Maximum file size to process in bytes')
+@click.option('--exclude-pattern', '-e', multiple=True,
+              help='Patterns to exclude')
+@click.option('--include-pattern', '-i', multiple=True,
+              help='Patterns to include')
+def main(source, output, max_size, exclude_pattern, include_pattern):
+    """Analyze a directory and create a text dump of its contents."""
+    try:
+        from gitingest.ingest import ingest
+
+        # Convert paths to absolute with proper encoding
+        source = str(pathlib.Path(source).resolve())
+
+        # Handle patterns
+        exclude_patterns = list(exclude_pattern)
+        include_patterns = list(set(include_pattern))
+
+        # Set default output name
+        if not output:
+            output = "digest.txt"
+        output = str(pathlib.Path(output).resolve())
+
+        # Call ingest with encoding awareness
+        summary, tree, content = ingest(
+            source,
+            max_size,
+            include_patterns,
+            exclude_patterns,
+            output=output
+        )
+
+        # Write output with explicit encoding
+        with open(output, 'w', encoding='utf-8', errors='replace') as f:
+            if isinstance(summary, bytes):
+                summary = summary.decode('utf-8', errors='replace')
+            if isinstance(tree, bytes):
+                tree = tree.decode('utf-8', errors='replace')
+            if isinstance(content, bytes):
+                content = content.decode('utf-8', errors='replace')
+
+            f.write(f"{summary}\n\n{tree}\n\n{content}")
+
+        # Print messages with encoding handling
+        click.echo(f"Analysis complete! Output written to: {output}")
+        click.echo("\nSummary:")
+        click.echo(summary)
+
+    except Exception as e:
+        click.echo(f"Error: {str(e)}", err=True)
+        raise click.Abort()
+
+if __name__ == '__main__':
+    main()
+
+================================================
+File: /src\gitingest\clone.py
+================================================
+import asyncio
+from typing import Tuple
+
+from gitingest.utils import async_timeout
+
+CLONE_TIMEOUT = 20
+
+async def check_repo_exists(url: str) -> bool:
+    proc = await asyncio.create_subprocess_exec(
+        "curl",
+        "-I",
+        url,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE,
+    )
+    stdout, stderr = await proc.communicate()
+    if proc.returncode != 0:
+        return False
+    # Check if stdout contains "404" status code
+    stdout_str = stdout.decode()
+    return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str
+
+@async_timeout(CLONE_TIMEOUT)
+async def clone_repo(query: dict) -> Tuple[bytes, bytes]:
+    if not await check_repo_exists(query['url']):
+        raise ValueError("Repository not found, make sure it is public")
+
+    if query['commit']:
+        proc = await asyncio.create_subprocess_exec(
+            "git",
+            "clone",
+            "--single-branch",
+            query['url'],
+            query['local_path'],
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        stdout, stderr = await proc.communicate()
+
+        proc = await asyncio.create_subprocess_exec(
+            "git",
+            "-C",
+            query['local_path'],
+            "checkout",
+            query['branch'],
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        stdout, stderr = await proc.communicate()
+    elif query['branch'] != 'main' and query['branch'] != 'master' and query['branch']:
+        proc = await asyncio.create_subprocess_exec(
+            "git",
+            "clone",
+            "--depth=1",
+            "--single-branch",
+            "--branch",
+            query['branch'],
+            query['url'],
+            query['local_path'],
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+    else:
+        proc = await asyncio.create_subprocess_exec(
+            "git",
+            "clone",
+            "--depth=1",
+            "--single-branch",
+            query['url'],
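+            # positional args: repository URL, then the local clone destination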
query['local_path'], + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + stdout, stderr = await proc.communicate() + + return stdout, stderr + +================================================ +File: /src\gitingest\encoding.py +================================================ +import sys +import io +import codecs + +def setup_encoding(): + if sys.stdout.encoding != 'utf-8': + sys.stdout = io.TextIOWrapper( + sys.stdout.buffer, + encoding='utf-8', + errors='replace' + ) + if sys.stderr.encoding != 'utf-8': + sys.stderr = io.TextIOWrapper( + sys.stderr.buffer, + encoding='utf-8', + errors='replace' + ) + +================================================ +File: /src\gitingest\ingest.py +================================================ +import asyncio +import shutil +from typing import Union, List +from pathlib import Path +import io +import sys + +# Import other modules from the package +from gitingest.parse_query import parse_query +from gitingest.clone import clone_repo +from gitingest.ingest_from_query import ingest_from_query + +def setup_encoding(): + if sys.stdout.encoding != 'utf-8': + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') + if sys.stderr.encoding != 'utf-8': + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace') + +def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, + include_patterns: Union[List[str], str] = None, + exclude_patterns: Union[List[str], str] = None, + output: str = None) -> tuple[str, str, str]: + """ + Analyze and create a text dump of source contents. + + Args: + source: Path to source directory or git URL + max_file_size: Maximum file size to process in bytes + include_patterns: Patterns to include in analysis + exclude_patterns: Patterns to exclude from analysis + output: Output file path + + Returns: + Tuple of (summary, tree, content) + """ + setup_encoding() + query = None + + try: + query = parse_query(source, max_file_size, False, include_patterns, exclude_patterns) + if query['url']: + asyncio.run(clone_repo(query)) + + summary, tree, content = ingest_from_query(query) + + if output: + # Write with explicit UTF-8 encoding + with open(output, "w", encoding='utf-8', errors='replace') as f: + # Ensure all content is properly encoded + tree = tree.encode('utf-8', errors='replace').decode('utf-8') if isinstance(tree, str) else tree + content = content.encode('utf-8', errors='replace').decode('utf-8') if isinstance(content, str) else content + f.write(f"{tree}\n{content}") + + return summary, tree, content + + except UnicodeEncodeError as e: + # Handle encoding errors specifically + error_msg = f"Encoding error while processing {source}: {str(e)}" + raise RuntimeError(error_msg) + + except Exception as e: + # Handle other errors + error_msg = f"Error while processing {source}: {str(e)}" + raise RuntimeError(error_msg) + + finally: + # Clean up the temporary directory if it was created + if query and query.get('url'): + # Get parent directory two levels up from local_path (../tmp) + cleanup_path = str(Path(query['local_path']).parents[1]) + try: + shutil.rmtree(cleanup_path, ignore_errors=True) + except Exception as e: + print(f"Warning: Could not clean up temporary directory: {str(e)}", file=sys.stderr) + +================================================ +File: /src\gitingest\ingest_from_query.py +================================================ +import os +from fnmatch import fnmatch +from typing import Dict, List, Union +import tiktoken + + 
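+# Hard safety limits for the recursive directory scan below: they cap recursion
+# depth, file count, and total bytes read so a single ingest cannot run away on
+# a huge or deeply nested repository.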
+MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB +MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal +MAX_FILES = 10000 # Maximum number of files to process +MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500MB + + +def should_include(path: str, base_path: str, include_patterns: List[str]) -> bool: + rel_path = path.replace(base_path, "").lstrip(os.sep) + include = False + for pattern in include_patterns: + if fnmatch(rel_path, pattern): + include = True + return include + +def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> bool: + rel_path = path.replace(base_path, "").lstrip(os.sep) + for pattern in ignore_patterns: + if pattern == '': + continue + if fnmatch(rel_path, pattern): + return True + return False + +def is_safe_symlink(symlink_path: str, base_path: str) -> bool: + """Check if a symlink points to a location within the base directory.""" + try: + target_path = os.path.realpath(symlink_path) + base_path = os.path.realpath(base_path) + return os.path.commonpath([target_path, base_path]) == base_path + except (OSError, ValueError): + # If there's any error resolving the paths, consider it unsafe + return False + +def is_text_file(file_path: str) -> bool: + """Determines if a file is likely a text file based on its content.""" + try: + with open(file_path, 'rb') as file: + chunk = file.read(1024) + return not bool(chunk.translate(None, bytes([7, 8, 9, 10, 12, 13, 27] + list(range(0x20, 0x100))))) + except IOError: + return False + +def read_file_content(file_path: str) -> str: + try: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + return f.read() + except Exception as e: + return f"Error reading file: {str(e)}" + +def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = 0, stats: Dict = None) -> Dict: + """Recursively analyzes a directory and its contents with safety limits.""" + if seen_paths is None: + seen_paths = set() + if stats is None: + stats = {"total_files": 0, "total_size": 0} + + if depth > MAX_DIRECTORY_DEPTH: + print(f"Skipping deep directory: {path} (max depth {MAX_DIRECTORY_DEPTH} reached)") + return None + + if stats["total_files"] >= MAX_FILES: + print(f"Skipping further processing: maximum file limit ({MAX_FILES}) reached") + return None + + if stats["total_size"] >= MAX_TOTAL_SIZE_BYTES: + print(f"Skipping further processing: maximum total size ({MAX_TOTAL_SIZE_BYTES/1024/1024:.1f}MB) reached") + return None + + real_path = os.path.realpath(path) + if real_path in seen_paths: + print(f"Skipping already visited path: {path}") + return None + seen_paths.add(real_path) + + result = { + "name": os.path.basename(path), + "type": "directory", + "size": 0, + "children": [], + "file_count": 0, + "dir_count": 0, + "path": path, + "ignore_content": False + } + + ignore_patterns = query['ignore_patterns'] + base_path = query['local_path'] + include_patterns = query['include_patterns'] + + try: + for item in os.listdir(path): + item_path = os.path.join(path, item) + + if should_exclude(item_path, base_path, ignore_patterns): + continue + + is_file = os.path.isfile(item_path) + if is_file and query['include_patterns']: + if not should_include(item_path, base_path, include_patterns): + result["ignore_content"] = True + continue + + # Handle symlinks + if os.path.islink(item_path): + if not is_safe_symlink(item_path, base_path): + print(f"Skipping symlink that points outside base directory: {item_path}") + continue + real_path = os.path.realpath(item_path) + if real_path in seen_paths: + print(f"Skipping 
already visited symlink target: {item_path}") + continue + + if os.path.isfile(real_path): + file_size = os.path.getsize(real_path) + if stats["total_size"] + file_size > MAX_TOTAL_SIZE_BYTES: + print(f"Skipping file {item_path}: would exceed total size limit") + continue + + stats["total_files"] += 1 + stats["total_size"] += file_size + + if stats["total_files"] > MAX_FILES: + print(f"Maximum file limit ({MAX_FILES}) reached") + return result + + is_text = is_text_file(real_path) + content = read_file_content(real_path) if is_text else "[Non-text file]" + + child = { + "name": item, + "type": "file", + "size": file_size, + "content": content, + "path": item_path + } + result["children"].append(child) + result["size"] += file_size + result["file_count"] += 1 + + elif os.path.isdir(real_path): + subdir = scan_directory(real_path, query, seen_paths, depth + 1, stats) + if subdir and (not include_patterns or subdir["file_count"] > 0): + subdir["name"] = item + subdir["path"] = item_path + result["children"].append(subdir) + result["size"] += subdir["size"] + result["file_count"] += subdir["file_count"] + result["dir_count"] += 1 + subdir["dir_count"] + continue + + if os.path.isfile(item_path): + file_size = os.path.getsize(item_path) + if stats["total_size"] + file_size > MAX_TOTAL_SIZE_BYTES: + print(f"Skipping file {item_path}: would exceed total size limit") + continue + + stats["total_files"] += 1 + stats["total_size"] += file_size + + if stats["total_files"] > MAX_FILES: + print(f"Maximum file limit ({MAX_FILES}) reached") + return result + + is_text = is_text_file(item_path) + content = read_file_content(item_path) if is_text else "[Non-text file]" + + child = { + "name": item, + "type": "file", + "size": file_size, + "content": content, + "path": item_path + } + result["children"].append(child) + result["size"] += file_size + result["file_count"] += 1 + + elif os.path.isdir(item_path): + subdir = scan_directory(item_path, query, seen_paths, depth + 1, stats) + if subdir and (not include_patterns or subdir["file_count"] > 0): + result["children"].append(subdir) + result["size"] += subdir["size"] + result["file_count"] += subdir["file_count"] + result["dir_count"] += 1 + subdir["dir_count"] + + except PermissionError: + print(f"Permission denied: {path}") + + return result + +def extract_files_content(query: dict, node: Dict, max_file_size: int, files: List = None) -> List[Dict]: + """Recursively collects all text files with their contents.""" + if files is None: + files = [] + + if node["type"] == "file" and node["content"] != "[Non-text file]": + content = node["content"] + if node["size"] > max_file_size: + content = None + + files.append({ + "path": node["path"].replace(query['local_path'], ""), + "content": content, + "size": node["size"] + }) + elif node["type"] == "directory": + for child in node["children"]: + extract_files_content(query, child, max_file_size, files) + return files + +def create_file_content_string(files: List[Dict]) -> str: + """Creates a formatted string of file contents with separators.""" + output = "" + separator = "=" * 48 + "\n" + + # First add README.md if it exists + for file in files: + if not file['content']: + continue + if file['path'].lower() == '/readme.md': + output += separator + output += f"File: {file['path']}\n" + output += separator + output += f"{file['content']}\n\n" + break + + # Then add all other files in their original order + for file in files: + if not file['content'] or file['path'].lower() == '/readme.md': + continue + output += 
separator + output += f"File: {file['path']}\n" + output += separator + output += f"{file['content']}\n\n" + + return output + +def create_summary_string(query: dict, nodes: Dict, files: List[Dict]) -> str: + """Creates a summary string with file counts and content size.""" + if "user_name" in query: + summary = f"Repository: {query['user_name']}/{query['repo_name']}\n" + else: + summary = f"Repository: {query['slug']}\n" + summary += f"Files analyzed: {nodes['file_count']}\n" + + if 'subpath' in query and query['subpath'] != '/': + summary += f"Subpath: {query['subpath']}\n" + if 'commit' in query and query['commit']: + summary += f"Commit: {query['commit']}\n" + elif 'branch' in query and query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: + summary += f"Branch: {query['branch']}\n" + return summary + +def create_tree_structure(query: dict, node: Dict, prefix: str = "", is_last: bool = True) -> str: + """Creates a tree-like string representation of the file structure.""" + tree = "" + if not node["name"]: + node["name"] = query['slug'] + + if node["name"]: + current_prefix = "└── " if is_last else "├── " + name = node["name"] + "/" if node["type"] == "directory" else node["name"] + tree += prefix + current_prefix + name + "\n" + if node["type"] == "directory": + # Adjust prefix only if we added a node name + new_prefix = prefix + (" " if is_last else "│ ") if node["name"] else prefix + children = node["children"] + for i, child in enumerate(children): + tree += create_tree_structure(query, child, new_prefix, i == len(children) - 1) + + return tree + +def generate_token_string(context_string: str) -> str: + """Returns the number of tokens in a text string.""" + formatted_tokens = "" + try: + encoding = tiktoken.get_encoding("cl100k_base", ) + total_tokens = len(encoding.encode(context_string, disallowed_special=())) + + except Exception as e: + print(e) + return None + if total_tokens > 1000000: + formatted_tokens = f"{total_tokens/1000000:.1f}M" + elif total_tokens > 1000: + formatted_tokens = f"{total_tokens/1000:.1f}k" + else: + formatted_tokens = f"{total_tokens}" + return formatted_tokens + +def ingest_single_file(path: str, query: dict) -> Dict: + if not os.path.isfile(path): + raise ValueError(f"Path {path} is not a file") + + file_size = os.path.getsize(path) + is_text = is_text_file(path) + if not is_text: + raise ValueError(f"File {path} is not a text file") + + content = read_file_content(path) + if file_size > query['max_file_size']: + content = "[Content ignored: file too large]" + + file_info = { + "path": path.replace(query['local_path'], ""), + "content": content, + "size": file_size + } + + summary = ( + f"Repository: {query['user_name']}/{query['repo_name']}\n" + f"File: {os.path.basename(path)}\n" + f"Size: {file_size:,} bytes\n" + f"Lines: {len(content.splitlines()):,}\n" + ) + + files_content = create_file_content_string([file_info]) + tree = "Directory structure:\n└── " + os.path.basename(path) + + formatted_tokens = generate_token_string(files_content) + if formatted_tokens: + summary += f"\nEstimated tokens: {formatted_tokens}" + return (summary, tree, files_content) + +def ingest_directory(path: str, query: dict) -> Dict: + nodes = scan_directory(path, query) + files = extract_files_content(query, nodes, query['max_file_size']) + summary = create_summary_string(query, nodes, files) + tree = "Directory structure:\n" + create_tree_structure(query, nodes) + files_content = create_file_content_string(files) + + formatted_tokens = 
generate_token_string(tree + files_content) + if formatted_tokens: + summary += f"\nEstimated tokens: {formatted_tokens}" + return (summary, tree, files_content) + +def ingest_from_query(query: dict) -> Dict: + """Main entry point for analyzing a codebase directory or single file.""" + path = f"{query['local_path']}{query['subpath']}" + if not os.path.exists(path): + raise ValueError(f"{query['slug']} cannot be found") + + if query.get('type') == 'blob': + return ingest_single_file(path, query) + else: + return ingest_directory(path, query) + + + +================================================ +File: /src\gitingest\parse_query.py +================================================ +from typing import List, Union +import uuid +import os + + +DEFAULT_IGNORE_PATTERNS = [ + # Python + '*.pyc', '*.pyo', '*.pyd', '__pycache__', '.pytest_cache', '.coverage', + '.tox', '.nox', '.mypy_cache', '.ruff_cache', '.hypothesis', + 'poetry.lock', 'Pipfile.lock', + + # JavaScript/Node + 'node_modules', 'bower_components', 'package-lock.json', 'yarn.lock', + '.npm', '.yarn', '.pnpm-store', + + # Version control + '.git', '.svn', '.hg', '.gitignore', '.gitattributes', '.gitmodules', + + # Images and media + '*.svg', '*.png', '*.jpg', '*.jpeg', '*.gif', '*.ico', '*.pdf', + '*.mov', '*.mp4', '*.mp3', '*.wav', + + # Virtual environments + 'venv', '.venv', 'env', '.env', 'virtualenv', + + # IDEs and editors + '.idea', '.vscode', '.vs', '*.swp', '*.swo', '*.swn', + '.settings', '.project', '.classpath', '*.sublime-*', + + # Temporary and cache files + '*.log', '*.bak', '*.swp', '*.tmp', '*.temp', + '.cache', '.sass-cache', '.eslintcache', + '.DS_Store', 'Thumbs.db', 'desktop.ini', + + # Build directories and artifacts + 'build', 'dist', 'target', 'out', + '*.egg-info', '*.egg', '*.whl', + '*.so', '*.dylib', '*.dll', '*.class', + + # Documentation + 'site-packages', '.docusaurus', '.next', '.nuxt', + + # Other common patterns + '*.min.js', '*.min.css', # Minified files + '*.map', # Source maps + '.terraform', '*.tfstate*', # Terraform + 'vendor/', # Dependencies in various languages +] + +TMP_BASE_PATH = "../tmp" + +def parse_url(url: str) -> dict: + parsed = { + "user_name": None, + "repo_name": None, + "type": None, + "branch": None, + "commit": None, + "subpath": "/", + "local_path": None, + "url": None, + "slug": None, + "id": None, + } + + url = url.split(" ")[0] + if not url.startswith('https://'): + url = 'https://' + url + + # Extract domain and path + url_parts = url.split('/') + domain = url_parts[2] + path_parts = url_parts[3:] + + if len(path_parts) < 2: + raise ValueError("Invalid repository URL. 
Please provide a valid Git repository URL.") + + parsed["user_name"] = path_parts[0] + parsed["repo_name"] = path_parts[1] + + # Keep original URL format + parsed["url"] = f"https://{domain}/{parsed['user_name']}/{parsed['repo_name']}" + parsed['slug'] = f"{parsed['user_name']}-{parsed['repo_name']}" + parsed["id"] = str(uuid.uuid4()) + parsed["local_path"] = f"{TMP_BASE_PATH}/{parsed['id']}/{parsed['slug']}" + + if len(path_parts) > 3: + parsed["type"] = path_parts[2] + parsed["branch"] = path_parts[3] + if len(parsed['branch']) == 40 and all(c in '0123456789abcdefABCDEF' for c in parsed['branch']): + parsed["commit"] = parsed['branch'] + + parsed["subpath"] = "/" + "/".join(path_parts[4:]) + return parsed + +def normalize_pattern(pattern: str) -> str: + pattern = pattern.strip() + pattern = pattern.lstrip(os.sep) + if pattern.endswith(os.sep): + pattern += "*" + return pattern + +def parse_patterns(pattern: Union[List[str], str]) -> List[str]: + if isinstance(pattern, list): + pattern = ",".join(pattern) + + for p in pattern.split(","): + if not all(c.isalnum() or c in "-_./+*" for c in p.strip()): + raise ValueError(f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed.") + patterns = [normalize_pattern(p) for p in pattern.split(",")] + return patterns + +def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[str]) -> List[str]: + for pattern in include_patterns: + if pattern in ignore_patterns: + ignore_patterns.remove(pattern) + return ignore_patterns + + +def parse_path(path: str) -> dict: + + query = { + "local_path": os.path.abspath(path), + "slug": os.path.basename(os.path.dirname(path)) + "/" + os.path.basename(path), + "subpath": "/", + "id": str(uuid.uuid4()), + "url": None, + } + return query + +def parse_query(source: str, max_file_size: int, from_web: bool, include_patterns: Union[List[str], str] = None, ignore_patterns: Union[List[str], str] = None) -> dict: + if from_web: + query = parse_url(source) + else: + if source.startswith("https://") or "github.com" in source: + query = parse_url(source) + else: + query = parse_path(source) + query['max_file_size'] = max_file_size + + if ignore_patterns and ignore_patterns != "": + ignore_patterns = DEFAULT_IGNORE_PATTERNS + parse_patterns(ignore_patterns) + else: + ignore_patterns = DEFAULT_IGNORE_PATTERNS + + if include_patterns and include_patterns != "": + include_patterns = parse_patterns(include_patterns) + ignore_patterns = override_ignore_patterns(ignore_patterns, include_patterns) + else: + include_patterns = None + + query['ignore_patterns'] = ignore_patterns + query['include_patterns'] = include_patterns + + return query + + + +================================================ +File: /src\gitingest\tests\conftest.py +================================================ +import os +import sys + +# Get the absolute path of the project root directory (one level up from tests) +project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + +# Add both the project root and src directory to PYTHONPATH +sys.path.insert(0, project_root) +sys.path.insert(0, os.path.join(project_root, 'src')) + +================================================ +File: /src\gitingest\tests\test_clone.py +================================================ +import pytest +from clone import clone_repo, check_repo_exists +from unittest.mock import patch, AsyncMock + +@pytest.mark.asyncio +async def 
test_clone_repo_with_commit(): + query = { + 'commit': 'a' * 40, # Simulating a valid commit hash + 'branch': 'main', + 'url': 'https://github.com/user/repo', + 'local_path': '/tmp/repo' + } + + with patch('clone.check_repo_exists', return_value=True) as mock_check: + with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: + mock_process = AsyncMock() + mock_process.communicate.return_value = (b'output', b'error') + mock_exec.return_value = mock_process + + await clone_repo(query) + mock_check.assert_called_once_with(query['url']) + assert mock_exec.call_count == 2 # Clone and checkout calls + +@pytest.mark.asyncio +async def test_clone_repo_without_commit(): + query = { + 'commit': None, + 'branch': 'main', + 'url': 'https://github.com/user/repo', + 'local_path': '/tmp/repo' + } + + with patch('clone.check_repo_exists', return_value=True) as mock_check: + with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: + mock_process = AsyncMock() + mock_process.communicate.return_value = (b'output', b'error') + mock_exec.return_value = mock_process + + await clone_repo(query) + mock_check.assert_called_once_with(query['url']) + assert mock_exec.call_count == 1 # Only clone call + +@pytest.mark.asyncio +async def test_clone_repo_nonexistent_repository(): + query = { + 'commit': None, + 'branch': 'main', + 'url': 'https://github.com/user/nonexistent-repo', + 'local_path': '/tmp/repo' + } + + with patch('gitingest.clone.check_repo_exists', return_value=False) as mock_check: + with pytest.raises(ValueError, match="Repository not found"): + await clone_repo(query) + mock_check.assert_called_once_with(query['url']) + +@pytest.mark.asyncio +async def test_check_repo_exists(): + url = "https://github.com/user/repo" + + with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: + mock_process = AsyncMock() + mock_process.communicate.return_value = (b'HTTP/1.1 200 OK\n', b'') + mock_exec.return_value = mock_process + + # Test existing repository + mock_process.returncode = 0 + assert await check_repo_exists(url) is True + + # Test non-existing repository (404 response) + mock_process.communicate.return_value = (b'HTTP/1.1 404 Not Found\n', b'') + mock_process.returncode = 0 + assert await check_repo_exists(url) is False + + # Test failed request + mock_process.returncode = 1 + assert await check_repo_exists(url) is False + +================================================ +File: /src\gitingest\tests\test_ingest.py +================================================ +import pytest +from src.gitingest.ingest_from_query import ( + scan_directory, + extract_files_content, +) + +# Test fixtures +@pytest.fixture +def sample_query(): + return { + 'user_name': 'test_user', + 'repo_name': 'test_repo', + 'local_path': '/tmp/test_repo', + 'subpath': '/', + 'branch': 'main', + 'commit': None, + 'max_file_size': 1000000, + 'slug': 'test_user/test_repo', + 'ignore_patterns': ['*.pyc', '__pycache__', '.git'], + 'include_patterns': None, + 'pattern_type': 'exclude' + + } + +@pytest.fixture +def temp_directory(tmp_path): + # Creates the following structure: + # test_repo/ + # ├── file1.txt + # ├── file2.py + # └── src/ + # | ├── subfile1.txt + # | └── subfile2.py + # | └── subdir/ + # | └── file_subdir.txt + # | └── file_subdir.py + # └── dir1/ + # | └── file_dir1.txt + # └── dir2/ + # └── file_dir2.txt + + test_dir = tmp_path / "test_repo" + test_dir.mkdir() + + # Root files + (test_dir / "file1.txt").write_text("Hello World") + (test_dir / 
"file2.py").write_text("print('Hello')") + + # src directory and its files + src_dir = test_dir / "src" + src_dir.mkdir() + (src_dir / "subfile1.txt").write_text("Hello from src") + (src_dir / "subfile2.py").write_text("print('Hello from src')") + + # src/subdir and its files + subdir = src_dir / "subdir" + subdir.mkdir() + (subdir / "file_subdir.txt").write_text("Hello from subdir") + (subdir / "file_subdir.py").write_text("print('Hello from subdir')") + + # dir1 and its file + dir1 = test_dir / "dir1" + dir1.mkdir() + (dir1 / "file_dir1.txt").write_text("Hello from dir1") + + # dir2 and its file + dir2 = test_dir / "dir2" + dir2.mkdir() + (dir2 / "file_dir2.txt").write_text("Hello from dir2") + + return test_dir + +def test_scan_directory(temp_directory, sample_query): + result = scan_directory( + str(temp_directory), + query=sample_query + ) + + assert result['type'] == 'directory' + assert result['file_count'] == 8 # All .txt and .py files + assert result['dir_count'] == 4 # src, src/subdir, dir1, dir2 + assert len(result['children']) == 5 # file1.txt, file2.py, src, dir1, dir2 + +def test_extract_files_content(temp_directory, sample_query): + nodes = scan_directory( + str(temp_directory), + query=sample_query + ) + + files = extract_files_content(sample_query, nodes, max_file_size=1000000) + assert len(files) == 8 # All .txt and .py files + + # Check for presence of key files + paths = [f['path'] for f in files] + assert any('file1.txt' in p for p in paths) + assert any('subfile1.txt' in p for p in paths) + assert any('file2.py' in p for p in paths) + assert any('subfile2.py' in p for p in paths) + assert any('file_subdir.txt' in p for p in paths) + assert any('file_dir1.txt' in p for p in paths) + assert any('file_dir2.txt' in p for p in paths) + + + +# TODO: test with include patterns: ['*.txt'] +# TODO: test with wrong include patterns: ['*.qwerty'] + + +#single folder patterns +# TODO: test with include patterns: ['src/*'] +# TODO: test with include patterns: ['/src/*'] +# TODO: test with include patterns: ['/src/'] +# TODO: test with include patterns: ['/src*'] + +#multiple patterns +# TODO: test with multiple include patterns: ['*.txt', '*.py'] +# TODO: test with multiple include patterns: ['/src/*', '*.txt'] +# TODO: test with multiple include patterns: ['/src*', '*.txt'] + + + + + + +================================================ +File: /src\gitingest\tests\test_parse_query.py +================================================ +import pytest +from gitingest.parse_query import parse_query, parse_url, DEFAULT_IGNORE_PATTERNS + + +def test_parse_url_valid(): + test_cases = [ + "https://github.com/user/repo", + "https://gitlab.com/user/repo", + "https://bitbucket.org/user/repo" + ] + for url in test_cases: + result = parse_url(url) + assert result["user_name"] == "user" + assert result["repo_name"] == "repo" + assert result["url"] == url + +def test_parse_url_invalid(): + url = "https://only-domain.com" + with pytest.raises(ValueError, match="Invalid repository URL"): + parse_url(url) + +def test_parse_query_basic(): + test_cases = [ + "https://github.com/user/repo", + "https://gitlab.com/user/repo" + ] + for url in test_cases: + result = parse_query(url, max_file_size=50, from_web=True, ignore_patterns='*.txt') + assert result["user_name"] == "user" + assert result["repo_name"] == "repo" + assert result["url"] == url + assert "*.txt" in result["ignore_patterns"] + +def test_parse_query_include_pattern(): + url = "https://github.com/user/repo" + result = parse_query(url, 
max_file_size=50, from_web=True, include_patterns='*.py') + assert result["include_patterns"] == ["*.py"] + assert result["ignore_patterns"] == DEFAULT_IGNORE_PATTERNS + +def test_parse_query_invalid_pattern(): + url = "https://github.com/user/repo" + with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): + parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py;rm -rf') + +================================================ +File: /src\gitingest\utils.py +================================================ + +## Async Timeout decorator +import asyncio +import functools +from typing import TypeVar, Callable + +T = TypeVar("T") + +class AsyncTimeoutError(Exception): + """Raised when an async operation exceeds its timeout limit.""" + pass + +def async_timeout(seconds: int = 10): + def decorator(func: Callable[..., T]) -> Callable[..., T]: + @functools.wraps(func) + async def wrapper(*args, **kwargs) -> T: + try: + return await asyncio.wait_for(func(*args, **kwargs), timeout=seconds) + except asyncio.TimeoutError: + raise AsyncTimeoutError(f"Clone timed out after {seconds} seconds") + return wrapper + return decorator + +================================================ +File: /src\gitingest\__init__.py +================================================ +from .ingest import ingest +from .parse_query import parse_query +from .clone import clone_repo +from .ingest_from_query import ingest_from_query + +__all__ = ['ingest', 'parse_query', 'clone_repo', 'ingest_from_query'] + +================================================ +File: /src\main.py +================================================ +import os +from dotenv import load_dotenv + +from fastapi import FastAPI, Request +from fastapi.templating import Jinja2Templates +from fastapi.responses import HTMLResponse, FileResponse, Response +from fastapi.staticfiles import StaticFiles +from starlette.middleware.trustedhost import TrustedHostMiddleware +from api_analytics.fastapi import Analytics +from slowapi import _rate_limit_exceeded_handler +from slowapi.errors import RateLimitExceeded + +from server_utils import limiter +from routers import download, dynamic, index + + +load_dotenv() + +app = FastAPI() +app.state.limiter = limiter +app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler) + +app.mount("/static", StaticFiles(directory="static"), name="static") +app.add_middleware(Analytics, api_key=os.getenv('API_ANALYTICS_KEY')) + +# Define the default allowed hosts +default_allowed_hosts = ["gitingest.com", "*.gitingest.com", "localhost", "127.0.0.1"] + +# Fetch allowed hosts from the environment variable or use the default +allowed_hosts = os.getenv("ALLOWED_HOSTS") +if allowed_hosts: + allowed_hosts = allowed_hosts.split(",") +else: + allowed_hosts = default_allowed_hosts + +app.add_middleware(TrustedHostMiddleware, allowed_hosts=allowed_hosts) +templates = Jinja2Templates(directory="templates") + +@app.get("/health") +async def health_check(): + return {"status": "healthy"} + +@app.head("/") +async def head_root(): + """Mirror the headers and status code of the index page""" + return HTMLResponse( + content=None, + headers={ + "content-type": "text/html; charset=utf-8" + } + ) + +@app.get("/api/", response_class=HTMLResponse) +@app.get("/api", response_class=HTMLResponse) +async def api_docs(request: Request): + return templates.TemplateResponse( + "api.jinja", {"request": request} + ) + +@app.get("/robots.txt") +async def robots(): + return FileResponse('static/robots.txt') + 
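+# Note: inclusion order matters below. `dynamic` registers a catch-all
+# "/{full_path:path}" route, so it is added after `index` and `download`;
+# registered first, it would shadow "/" and "/download/{digest_id}".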
+app.include_router(index) +app.include_router(download) +app.include_router(dynamic) + +================================================ +File: /src\process_query.py +================================================ +from typing import List +from fastapi.templating import Jinja2Templates +from fastapi import Request + +from config import MAX_DISPLAY_SIZE, EXAMPLE_REPOS +from gitingest import ingest_from_query, clone_repo, parse_query +from server_utils import logSliderToSize, Colors + +templates = Jinja2Templates(directory="templates") + +def print_query(query, request, max_file_size, pattern_type, pattern): + print(f"{Colors.WHITE}{query['url']:<20}{Colors.END}", end="") + if int(max_file_size/1024) != 50: + print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") + if pattern_type == "include" and pattern != "": + print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") + elif pattern_type == "exclude" and pattern != "": + print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") + + +def print_error(query, request, e, max_file_size, pattern_type, pattern): + print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") + print_query(query, request, max_file_size, pattern_type, pattern) + print(f" | {Colors.RED}{e}{Colors.END}") + +def print_success(query, request, max_file_size, pattern_type, pattern, summary): + estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] + print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") + print_query(query, request, max_file_size, pattern_type, pattern) + print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") + + + +async def process_query(request: Request, input_text: str, slider_position: int, pattern_type: str = "exclude", pattern: str = "", is_index: bool = False) -> str: + template = "index.jinja" if is_index else "github.jinja" + max_file_size = logSliderToSize(slider_position) + if pattern_type == "include": + include_patterns = pattern + exclude_patterns = None + elif pattern_type == "exclude": + exclude_patterns = pattern + include_patterns = None + try: + query = parse_query(input_text, max_file_size, True, include_patterns, exclude_patterns) + await clone_repo(query) + summary, tree, content = ingest_from_query(query) + with open(f"{query['local_path']}.txt", "w") as f: + f.write(tree + "\n" + content) + + + + except Exception as e: + #hack to print error message when query is not defined + if 'query' in locals() and query is not None and isinstance(query, dict): + print_error(query, request, e, max_file_size, pattern_type, pattern) + else: + print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") + print(f"{Colors.RED}{e}{Colors.END}") + return templates.TemplateResponse( + template, + { + "request": request, + "github_url": input_text, + "error_message": f"Error: {e}", + "examples": EXAMPLE_REPOS if is_index else [], + "default_file_size": slider_position, + "pattern_type": pattern_type, + "pattern": pattern, + } + ) + + if len(content) > MAX_DISPLAY_SIZE: + content = f"(Files content cropped to {int(MAX_DISPLAY_SIZE/1000)}k characters, download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] + print_success(query, request, max_file_size, pattern_type, pattern, summary) + return templates.TemplateResponse( + template, + { + "request": request, + "github_url": input_text, + "result": True, + "summary": summary, + "tree": tree, + "content": content, + "examples": EXAMPLE_REPOS if is_index else 
[], + "ingest_id": query['id'], + "default_file_size": slider_position, + "pattern_type": pattern_type, + "pattern": pattern, + } + ) + + +================================================ +File: /src\routers\download.py +================================================ +from fastapi import HTTPException, APIRouter +from fastapi.responses import Response +from config import TMP_BASE_PATH +import os + +router = APIRouter() + +@router.get("/download/{digest_id}") +async def download_ingest(digest_id: str): + try: + # Find the first .txt file in the directory + directory = f"{TMP_BASE_PATH}/{digest_id}" + txt_files = [f for f in os.listdir(directory) if f.endswith('.txt')] + + if not txt_files: + raise FileNotFoundError("No .txt file found") + + with open(f"{directory}/{txt_files[0]}", "r") as f: + content = f.read() + + return Response( + content=content, + media_type="text/plain", + headers={ + "Content-Disposition": f"attachment; filename={txt_files[0]}" + } + ) + except FileNotFoundError: + raise HTTPException(status_code=404, detail="Digest not found") + +================================================ +File: /src\routers\dynamic.py +================================================ +from fastapi import APIRouter, Request, Form +from fastapi.responses import HTMLResponse +from fastapi.templating import Jinja2Templates + +from process_query import process_query +from server_utils import limiter + +router = APIRouter() +templates = Jinja2Templates(directory="templates") + +@router.get("/{full_path:path}") +async def catch_all(request: Request, full_path: str): + return templates.TemplateResponse( + "github.jinja", + { + "request": request, + "github_url": f"https://github.com/{full_path}", + "loading": True, + "default_file_size": 243 + } + ) + +@router.post("/{full_path:path}", response_class=HTMLResponse) +@limiter.limit("10/minute") +async def process_catch_all( + request: Request, + input_text: str = Form(...), + max_file_size: int = Form(...), + pattern_type: str = Form(...), + pattern: str = Form(...) +): + return await process_query(request, input_text, max_file_size, pattern_type, pattern, is_index=False) + + +================================================ +File: /src\routers\index.py +================================================ +from fastapi import APIRouter, Request, Form +from fastapi.responses import HTMLResponse +from fastapi.templating import Jinja2Templates + +from server_utils import limiter +from process_query import process_query +from config import EXAMPLE_REPOS + + +router = APIRouter() +templates = Jinja2Templates(directory="templates") + + +@router.get("/", response_class=HTMLResponse) +async def home(request: Request): + return templates.TemplateResponse( + "index.jinja", + { + "request": request, + "examples": EXAMPLE_REPOS, + "default_file_size": 243 + } + ) + + +@router.post("/", response_class=HTMLResponse) +@limiter.limit("10/minute") +async def index_post( + request: Request, + input_text: str = Form(...), + max_file_size: int = Form(...), + pattern_type: str = Form(...), + pattern: str = Form(...) 
+): + return await process_query(request, input_text, max_file_size, pattern_type, pattern, is_index=True) + + + + + + + +================================================ +File: /src\routers\__init__.py +================================================ +from .download import router as download +from .dynamic import router as dynamic +from .index import router as index + +__all__ = ["download", "dynamic", "index"] + +================================================ +File: /src\server_utils.py +================================================ +## Rate Limiter +from slowapi import Limiter +from slowapi.util import get_remote_address +limiter = Limiter(key_func=get_remote_address) + +## Logarithmic slider to file size +import math +def logSliderToSize(position): + """Convert slider position to file size in KB""" + maxp = 500 + minv = math.log(1) + maxv = math.log(102400) + + return round(math.exp(minv + (maxv - minv) * pow(position / maxp, 1.5))) * 1024 + +## Color printing utility +class Colors: + """ANSI color codes""" + BLACK = "\033[0;30m" + RED = "\033[0;31m" + GREEN = "\033[0;32m" + BROWN = "\033[0;33m" + BLUE = "\033[0;34m" + PURPLE = "\033[0;35m" + CYAN = "\033[0;36m" + LIGHT_GRAY = "\033[0;37m" + DARK_GRAY = "\033[1;30m" + LIGHT_RED = "\033[1;31m" + LIGHT_GREEN = "\033[1;32m" + YELLOW = "\033[1;33m" + LIGHT_BLUE = "\033[1;34m" + LIGHT_PURPLE = "\033[1;35m" + LIGHT_CYAN = "\033[1;36m" + WHITE = "\033[1;37m" + BOLD = "\033[1m" + FAINT = "\033[2m" + ITALIC = "\033[3m" + UNDERLINE = "\033[4m" + BLINK = "\033[5m" + NEGATIVE = "\033[7m" + CROSSED = "\033[9m" + END = "\033[0m" + + +================================================ +File: /src\static\js\snow.js +================================================ +// Snow effect initialization +function initSnow() { + const snowCanvas = document.getElementById('snow-canvas'); + const ctx = snowCanvas.getContext('2d'); + + // Configure snow + const snowflakes = []; + const maxSnowflakes = 50; + const spawnInterval = 200; + let currentSnowflakes = 0; + let lastSpawnTime = 0; + + // Resize canvas to window size + function resizeCanvas() { + snowCanvas.width = window.innerWidth; + snowCanvas.height = window.innerHeight; + } + + // Initial setup + resizeCanvas(); + window.addEventListener('resize', resizeCanvas); + + // Snowflake class definition + class Snowflake { + constructor() { + this.reset(); + } + + reset() { + this.x = Math.random() * snowCanvas.width; + this.y = 0; + this.size = Math.random() * 3 + 2; + this.speed = Math.random() * 1 + 0.5; + this.wind = Math.random() * 0.5 - 0.25; + } + + update() { + this.y += this.speed; + this.x += this.wind; + + if (this.y > snowCanvas.height) { + this.reset(); + } + } + + draw() { + ctx.save(); + + ctx.shadowColor = 'rgba(0, 0, 0, 0.3)'; + ctx.shadowBlur = 5; + ctx.shadowOffsetX = 2; + ctx.shadowOffsetY = 2; + + ctx.beginPath(); + ctx.arc(this.x, this.y, this.size, 0, Math.PI * 2); + ctx.fillStyle = 'rgba(255, 255, 255, 1)'; + ctx.fill(); + + ctx.strokeStyle = 'rgba(200, 200, 200, 0.8)'; + ctx.lineWidth = 0.5; + ctx.stroke(); + + ctx.restore(); + } + } + + function animate(currentTime) { + ctx.clearRect(0, 0, snowCanvas.width, snowCanvas.height); + + if (currentSnowflakes < maxSnowflakes && currentTime - lastSpawnTime > spawnInterval) { + snowflakes.push(new Snowflake()); + currentSnowflakes++; + lastSpawnTime = currentTime; + } + + snowflakes.forEach(snowflake => { + snowflake.update(); + snowflake.draw(); + }); + + requestAnimationFrame(animate); + } + + requestAnimationFrame(animate); +} + +// 
Initialize snow when DOM content is loaded +document.addEventListener('DOMContentLoaded', initSnow); + +// Also initialize when the HTMX content is swapped +document.addEventListener('htmx:afterSettle', initSnow); + +================================================ +File: /src\static\js\utils.js +================================================ +// Copy functionality +function copyText(className) { + const textarea = document.querySelector('.' + className); + const button = document.querySelector(`button[onclick="copyText('${className}')"]`); + if (!textarea || !button) return; + + // Copy text + navigator.clipboard.writeText(textarea.value) + .then(() => { + // Store original content + const originalContent = button.innerHTML; + + // Change button content + button.innerHTML = 'Copied!'; + + // Reset after 1 second + setTimeout(() => { + button.innerHTML = originalContent; + }, 1000); + }) + .catch(err => { + // Show error in button + const originalContent = button.innerHTML; + button.innerHTML = 'Failed to copy'; + setTimeout(() => { + button.innerHTML = originalContent; + }, 1000); + }); +} + + +function handleSubmit(event, showLoading = false) { + event.preventDefault(); + const form = event.target || document.getElementById('ingestForm'); + if (!form) return; + + const submitButton = form.querySelector('button[type="submit"]'); + if (!submitButton) return; + + const formData = new FormData(form); + + // Update file size + const slider = document.getElementById('file_size'); + if (slider) { + formData.delete('max_file_size'); + formData.append('max_file_size', slider.value); + } + + // Update pattern type and pattern + const patternType = document.getElementById('pattern_type'); + const pattern = document.getElementById('pattern'); + if (patternType && pattern) { + formData.delete('pattern_type'); + formData.delete('pattern'); + formData.append('pattern_type', patternType.value); + formData.append('pattern', pattern.value); + } + + const originalContent = submitButton.innerHTML; + const currentStars = document.getElementById('github-stars')?.textContent; + + if (showLoading) { + submitButton.disabled = true; + submitButton.innerHTML = ` +
+ <!-- [spinner SVG markup stripped during ingestion] --> Processing...
+ `; + submitButton.classList.add('bg-[#ffb14d]'); + } + + // Submit the form + fetch(form.action, { + method: 'POST', + body: formData + }) + .then(response => response.text()) + .then(html => { + // Store the star count before updating the DOM + const starCount = currentStars; + + + // TEMPORARY SNOW LOGIC // + const parser = new DOMParser(); + const newDoc = parser.parseFromString(html, 'text/html'); + + const existingCanvas = document.getElementById('snow-canvas'); + document.body.innerHTML = newDoc.body.innerHTML; + if (existingCanvas) { + document.body.insertBefore(existingCanvas, document.body.firstChild); + } + // END TEMPORARY SNOW LOGIC // + + // Wait for next tick to ensure DOM is updated + setTimeout(() => { + // Reinitialize slider functionality + initializeSlider(); + + const starsElement = document.getElementById('github-stars'); + if (starsElement && starCount) { + starsElement.textContent = starCount; + } + + // Scroll to results if they exist + const resultsSection = document.querySelector('[data-results]'); + if (resultsSection) { + resultsSection.scrollIntoView({ behavior: 'smooth', block: 'start' }); + } + }, 0); + }) + .catch(error => { + submitButton.disabled = false; + submitButton.innerHTML = originalContent; + }); +} + +function copyFullDigest() { + const directoryStructure = document.querySelector('.directory-structure').value; + const filesContent = document.querySelector('.result-text').value; + const fullDigest = `${directoryStructure}\n\nFiles Content:\n\n${filesContent}`; + const button = document.querySelector('[onclick="copyFullDigest()"]'); + const originalText = button.innerHTML; + + navigator.clipboard.writeText(fullDigest).then(() => { + button.innerHTML = ` + + + + Copied! + `; + + setTimeout(() => { + button.innerHTML = originalText; + }, 2000); + }).catch(err => { + console.error('Failed to copy text: ', err); + }); +} + +// Add the logSliderToSize helper function +function logSliderToSize(position) { + const minp = 0; + const maxp = 500; + const minv = Math.log(1); + const maxv = Math.log(102400); + + const value = Math.exp(minv + (maxv - minv) * Math.pow(position / maxp, 1.5)); + return Math.round(value); +} + +// Move slider initialization to a separate function +function initializeSlider() { + const slider = document.getElementById('file_size'); + const sizeValue = document.getElementById('size_value'); + + if (!slider || !sizeValue) return; + + function updateSlider() { + const value = logSliderToSize(slider.value); + sizeValue.textContent = formatSize(value); + slider.style.backgroundSize = `${(slider.value / slider.max) * 100}% 100%`; + } + + // Update on slider change + slider.addEventListener('input', updateSlider); + + // Initialize slider position + updateSlider(); +} + +// Add helper function for formatting size +function formatSize(sizeInKB) { + if (sizeInKB >= 1024) { + return Math.round(sizeInKB / 1024) + 'mb'; + } + return Math.round(sizeInKB) + 'kb'; +} + +// Initialize slider on page load +document.addEventListener('DOMContentLoaded', initializeSlider); + +// Make sure these are available globally +window.copyText = copyText; + +window.handleSubmit = handleSubmit; +window.initializeSlider = initializeSlider; +window.formatSize = formatSize; + +// Add this new function +function setupGlobalEnterHandler() { + document.addEventListener('keydown', function (event) { + if (event.key === 'Enter' && !event.target.matches('textarea')) { + const form = document.getElementById('ingestForm'); + if (form) { + handleSubmit(new Event('submit'), 
true); + } + } + }); +} + +// Add to the DOMContentLoaded event listener +document.addEventListener('DOMContentLoaded', () => { + initializeSlider(); + setupGlobalEnterHandler(); +}); + + +================================================ +File: /src\static\robots.txt +================================================ +User-agent: * +Allow: / +Allow: /api/ +Allow: /cyclotruc/gitingest/ + + + +================================================ +File: /src\templates\api.jinja +================================================ +{% extends "base.jinja" %} + +{% block title %}Git ingest API{% endblock %} + +{% block content %} +
+[Markup stripped during ingestion; surviving text, in order:]
+API Documentation
+The API is currently under development.
+We're working on making our API available to the public.
+In the meantime, you can open an issue on github to suggest features.
+{% endblock %}
+
+================================================
+File: /src\templates\base.jinja
+================================================
+[HTML skeleton stripped during ingestion; surviving Jinja directives, in order:]
+{% block title %}Git ingest{% endblock %}
+{% block extra_head %}{% endblock %}
+{% include 'components/navbar.jinja' %}
+{% block content %}{% endblock %}
+{% include 'components/footer.jinja' %}
+{% block extra_scripts %}{% endblock %}
+
+
+================================================
+File: /src\templates\components\footer.jinja
+================================================
+[Footer markup stripped during ingestion; no text content recovered.]
+
+
+================================================
+File: /src\templates\components\github_form.jinja
+================================================
+[Form markup stripped during ingestion; surviving Jinja logic and text, in order:]
+{% if show_examples %}
+Try these example repositories:
+{% for example in examples %}
+{% endfor %}
+{% endif %}
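+The form above posts four fields consumed by the routers in src/routers
+(`input_text`, `max_file_size`, `pattern_type`, `pattern`). A hedged sketch of
+driving the same endpoint over plain HTTP, using the third-party `requests`
+library (an assumption; any HTTP client works). Note that `max_file_size`
+carries the slider position, which the server converts to a byte cap via
+`logSliderToSize`:
+
+```python
+import requests
+
+resp = requests.post(
+    "http://localhost:8000/",
+    data={
+        "input_text": "https://github.com/cyclotruc/gitingest",
+        "max_file_size": 243,       # slider position (the form default), not bytes
+        "pattern_type": "exclude",
+        "pattern": "",
+    },
+)
+print(resp.status_code)  # the response body is the rendered results page
+```
+
+The rendered page embeds an `ingest_id`; `GET /download/{digest_id}` then
+returns the digest as plain text (see src/routers/download.py).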
+ +================================================ +File: /src\templates\components\navbar.jinja +================================================ + + +
+[Navbar markup stripped during ingestion; no text content recovered.]
+ +================================================ +File: /src\templates\components\result.jinja +================================================ +{% if result %} +
+[Result view markup stripped during ingestion; surviving Jinja logic and text, in order:]
+Summary
+{% if ingest_id %}
+{% endif %}
+Directory Structure
+Files Content
+{% endif %} + +================================================ +File: /src\templates\github.jinja +================================================ +{% extends "base.jinja" %} + +{% block content %} +{% if error_message %} +
+ {{ error_message }} +
+{% endif %} + +{% with is_index=true, show_examples=false %} + {% include 'components/github_form.jinja' %} +{% endwith %} + +{% if loading %} +
+[Loading indicator markup stripped during ingestion; surviving text: "Loading..."]
+{% endif %} + +{% include 'components/result.jinja' %} +{% endblock content %} + +{% block extra_scripts %} + +{% endblock extra_scripts %} + +================================================ +File: /src\templates\index.jinja +================================================ +{% extends "base.jinja" %} + +{% block extra_head %} + +{% endblock %} + +{% block content %} +
+[Hero section markup stripped during ingestion; surviving text, in order:]
+Prompt-friendly codebase
+Turn any Git repository into a simple text ingest of its codebase.
+This is useful for feeding a codebase into any LLM.
+You can also replace 'hub' with 'ingest' in any GitHub URL
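+The 'hub' -> 'ingest' swap mentioned above is a literal substring replacement
+on the URL. A trivial sketch (the repository URL is just an example):
+
+```python
+url = "https://github.com/cyclotruc/gitingest"
+digest_url = url.replace("hub", "ingest", 1)
+# -> "https://gitingest.com/cyclotruc/gitingest"
+```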
+ +{% if error_message %} +
+ {{ error_message }} +
+{% endif %} + +{% with is_index=true, show_examples=true %} + {% include 'components/github_form.jinja' %} +{% endwith %} + +{% include 'components/result.jinja' %} + + + + +{% endblock %} + diff --git a/src/gitingest/__init__.py b/src/gitingest/__init__.py index ed84b214..07417b94 100644 --- a/src/gitingest/__init__.py +++ b/src/gitingest/__init__.py @@ -1,6 +1,6 @@ -from .ingest_from_query import ingest_from_query -from .clone import clone_repo -from .parse_query import parse_query from .ingest import ingest +from .parse_query import parse_query +from .clone import clone_repo +from .ingest_from_query import ingest_from_query -__all__ = ["ingest_from_query", "clone_repo", "parse_query", "ingest"] \ No newline at end of file +__all__ = ['ingest', 'parse_query', 'clone_repo', 'ingest_from_query'] \ No newline at end of file diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 81823e63..0886c638 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -1,10 +1,15 @@ import os import pathlib import click +import sys +from .encoding import setup_encoding -from gitingest.ingest import ingest -from gitingest.ingest_from_query import MAX_FILE_SIZE -from gitingest.parse_query import DEFAULT_IGNORE_PATTERNS +# Setup encoding first +setup_encoding() + +# Define constants +MAX_FILE_SIZE = 51200 # 50KB by default +DEFAULT_IGNORE_PATTERNS = [] def normalize_pattern(pattern: str) -> str: pattern = pattern.strip() @@ -15,21 +20,52 @@ def normalize_pattern(pattern: str) -> str: @click.command() @click.argument('source', type=str, required=True) -@click.option('--output', '-o', default=None, help='Output file path (default: .txt in current directory)') -@click.option('--max-size', '-s', default=MAX_FILE_SIZE, help='Maximum file size to process in bytes') -@click.option('--exclude-pattern', '-e', multiple=True, help='Patterns to exclude') -@click.option('--include-pattern', '-i', multiple=True, help='Patterns to include') +@click.option('--output', '-o', default=None, + help='Output file path (default: .txt in current directory)') +@click.option('--max-size', '-s', default=MAX_FILE_SIZE, + help='Maximum file size to process in bytes') +@click.option('--exclude-pattern', '-e', multiple=True, + help='Patterns to exclude') +@click.option('--include-pattern', '-i', multiple=True, + help='Patterns to include') def main(source, output, max_size, exclude_pattern, include_pattern): """Analyze a directory and create a text dump of its contents.""" try: - # Combine default and custom ignore patterns + from gitingest.ingest import ingest + + # Convert paths to absolute with proper encoding + source = str(pathlib.Path(source).resolve()) + + # Handle patterns exclude_patterns = list(exclude_pattern) include_patterns = list(set(include_pattern)) + # Set default output name if not output: output = "digest.txt" - summary, tree, content = ingest(source, max_size, include_patterns, exclude_patterns, output=output) + output = str(pathlib.Path(output).resolve()) + + # Call ingest with encoding awareness + summary, tree, content = ingest( + source, + max_size, + include_patterns, + exclude_patterns, + output=output + ) + + # Write output with explicit encoding + with open(output, 'w', encoding='utf-8', errors='replace') as f: + if isinstance(summary, bytes): + summary = summary.decode('utf-8', errors='replace') + if isinstance(tree, bytes): + tree = tree.decode('utf-8', errors='replace') + if isinstance(content, bytes): + content = content.decode('utf-8', errors='replace') + + 
f.write(f"{summary}\n\n{tree}\n\n{content}") + # Print messages with encoding handling click.echo(f"Analysis complete! Output written to: {output}") click.echo("\nSummary:") click.echo(summary) @@ -39,4 +75,4 @@ def main(source, output, max_size, exclude_pattern, include_pattern): raise click.Abort() if __name__ == '__main__': - main() \ No newline at end of file + main() \ No newline at end of file diff --git a/src/gitingest/encoding.py b/src/gitingest/encoding.py new file mode 100644 index 00000000..f4e10578 --- /dev/null +++ b/src/gitingest/encoding.py @@ -0,0 +1,17 @@ +import sys +import io +import codecs + +def setup_encoding(): + if sys.stdout.encoding != 'utf-8': + sys.stdout = io.TextIOWrapper( + sys.stdout.buffer, + encoding='utf-8', + errors='replace' + ) + if sys.stderr.encoding != 'utf-8': + sys.stderr = io.TextIOWrapper( + sys.stderr.buffer, + encoding='utf-8', + errors='replace' + ) \ No newline at end of file diff --git a/src/gitingest/ingest.py b/src/gitingest/ingest.py index eac20818..6b3e957a 100644 --- a/src/gitingest/ingest.py +++ b/src/gitingest/ingest.py @@ -2,12 +2,40 @@ import shutil from typing import Union, List from pathlib import Path +import io +import sys -from .ingest_from_query import ingest_from_query -from .clone import clone_repo -from .parse_query import parse_query +# Import other modules from the package +from gitingest.parse_query import parse_query +from gitingest.clone import clone_repo +from gitingest.ingest_from_query import ingest_from_query -def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, include_patterns: Union[List[str], str] = None, exclude_patterns: Union[List[str], str] = None, output: str = None) -> str: +def setup_encoding(): + if sys.stdout.encoding != 'utf-8': + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') + if sys.stderr.encoding != 'utf-8': + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace') + +def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, + include_patterns: Union[List[str], str] = None, + exclude_patterns: Union[List[str], str] = None, + output: str = None) -> tuple[str, str, str]: + """ + Analyze and create a text dump of source contents. 
+ + Args: + source: Path to source directory or git URL + max_file_size: Maximum file size to process in bytes + include_patterns: Patterns to include in analysis + exclude_patterns: Patterns to exclude from analysis + output: Output file path + + Returns: + Tuple of (summary, tree, content) + """ + setup_encoding() + query = None + try: query = parse_query(source, max_file_size, False, include_patterns, exclude_patterns) if query['url']: @@ -16,13 +44,31 @@ def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, include_patterns: summary, tree, content = ingest_from_query(query) if output: - with open(f"{output}", "w") as f: - f.write(tree + "\n" + content) + # Write with explicit UTF-8 encoding + with open(output, "w", encoding='utf-8', errors='replace') as f: + # Ensure all content is properly encoded + tree = tree.encode('utf-8', errors='replace').decode('utf-8') if isinstance(tree, str) else tree + content = content.encode('utf-8', errors='replace').decode('utf-8') if isinstance(content, str) else content + f.write(f"{tree}\n{content}") return summary, tree, content + + except UnicodeEncodeError as e: + # Handle encoding errors specifically + error_msg = f"Encoding error while processing {source}: {str(e)}" + raise RuntimeError(error_msg) + + except Exception as e: + # Handle other errors + error_msg = f"Error while processing {source}: {str(e)}" + raise RuntimeError(error_msg) + finally: # Clean up the temporary directory if it was created - if query['url']: + if query and query.get('url'): # Get parent directory two levels up from local_path (../tmp) cleanup_path = str(Path(query['local_path']).parents[1]) - shutil.rmtree(cleanup_path, ignore_errors=True) \ No newline at end of file + try: + shutil.rmtree(cleanup_path, ignore_errors=True) + except Exception as e: + print(f"Warning: Could not clean up temporary directory: {str(e)}", file=sys.stderr) \ No newline at end of file From a193a5584bd72a14d348702c9779ad432a98b075 Mon Sep 17 00:00:00 2001 From: Dai Hung PHAM Date: Fri, 27 Dec 2024 11:11:22 +0100 Subject: [PATCH 2/6] encoding='utf-8' correction --- digest.txt | 4155 +++++++++++++++++++++++++++++++++++++ src/gitingest/__init__.py | 8 +- src/gitingest/cli.py | 56 +- src/gitingest/encoding.py | 17 + src/gitingest/ingest.py | 62 +- 5 files changed, 4276 insertions(+), 22 deletions(-) create mode 100644 digest.txt create mode 100644 src/gitingest/encoding.py diff --git a/digest.txt b/digest.txt new file mode 100644 index 00000000..bbc85aa4 --- /dev/null +++ b/digest.txt @@ -0,0 +1,4155 @@ +Repository: __temp/gitingest +Files analyzed: 76 + +Estimated tokens: 35.5k + +Directory structure: +└── __temp/gitingest/ + ├── .dockerignore + ├── .env + ├── .git/ + │ ├── config + │ ├── description + │ ├── HEAD + │ ├── hooks/ + │ │ ├── applypatch-msg.sample + │ │ ├── commit-msg.sample + │ │ ├── fsmonitor-watchman.sample + │ │ ├── post-update.sample + │ │ ├── pre-applypatch.sample + │ │ ├── pre-commit.sample + │ │ ├── pre-merge-commit.sample + │ │ ├── pre-push.sample + │ │ ├── pre-rebase.sample + │ │ ├── pre-receive.sample + │ │ ├── prepare-commit-msg.sample + │ │ ├── push-to-checkout.sample + │ │ ├── sendemail-validate.sample + │ │ └── update.sample + │ ├── index + │ ├── info/ + │ │ └── exclude + │ ├── logs/ + │ │ ├── HEAD + │ │ └── refs/ + │ │ ├── heads/ + │ │ │ └── main + │ │ └── remotes/ + │ │ └── origin/ + │ │ └── HEAD + │ ├── objects/ + │ │ ├── info/ + │ │ └── pack/ + │ │ ├── pack-c963e3b12abfebe0a3e8789a988f3557eb073ce4.idx + │ │ ├── 
pack-c963e3b12abfebe0a3e8789a988f3557eb073ce4.pack + │ │ └── pack-c963e3b12abfebe0a3e8789a988f3557eb073ce4.rev + │ ├── packed-refs + │ └── refs/ + │ ├── heads/ + │ │ └── main + │ ├── remotes/ + │ │ └── origin/ + │ │ └── HEAD + │ └── tags/ + ├── .github/ + │ └── workflows/ + │ └── unitest.yml + ├── .gitignore + ├── .venv/ + │ ├── Lib/ + │ │ └── site-packages/ + │ │ ├── charset_normalizer/ + │ │ ├── httptools/ + │ │ │ └── parser/ + │ │ ├── markupsafe/ + │ │ ├── pydantic_core/ + │ │ ├── regex/ + │ │ ├── tiktoken/ + │ │ ├── watchfiles/ + │ │ ├── websockets/ + │ │ └── wrapt/ + │ └── Scripts/ + │ ├── python.exe + │ └── uvicorn.exe + ├── CODE_OF_CONDUCT.md + ├── digest.txt + ├── Dockerfile + ├── docs/ + ├── LICENSE + ├── pytest.ini + ├── README.md + ├── requirements.txt + ├── SECURITY.md + ├── setup.py + └── src/ + ├── config.py + ├── gitingest/ + │ ├── cli.py + │ ├── clone.py + │ ├── encoding.py + │ ├── ingest.py + │ ├── ingest_from_query.py + │ ├── parse_query.py + │ ├── tests/ + │ │ ├── conftest.py + │ │ ├── test_clone.py + │ │ ├── test_ingest.py + │ │ ├── test_parse_query.py + │ │ └── __init__.py + │ ├── utils.py + │ ├── __init__.py + │ └── __pycache__/ + ├── main.py + ├── process_query.py + ├── routers/ + │ ├── download.py + │ ├── dynamic.py + │ ├── index.py + │ ├── __init__.py + │ └── __pycache__/ + ├── server_utils.py + ├── static/ + │ ├── js/ + │ │ ├── snow.js + │ │ └── utils.js + │ └── robots.txt + ├── templates/ + │ ├── api.jinja + │ ├── base.jinja + │ ├── components/ + │ │ ├── footer.jinja + │ │ ├── github_form.jinja + │ │ ├── navbar.jinja + │ │ └── result.jinja + │ ├── github.jinja + │ └── index.jinja + ├── __init__.py + └── __pycache__/ + + +================================================ +File: /README.md +================================================ +[![Image](./docs/frontpage.png "GitIngest main page")](https://gitingest.com/) + +![License](https://img.shields.io/badge/license-MIT-blue.svg) + +# GitIngest 🔍 +Turn any Git repository into a prompt-friendly text ingest for LLMs. + +You can also replace `hub` with `ingest` in any github url to access the coresponding digest + +[gitingest.com](https://gitingest.com/) + + +## 🚀 Features + +- **Easy code context**: Get a text digest from a git repository URL or a directory +- **Smart Formatting**: Optimized output format for LLM prompts +- **Statistics about**: : + - File and directory structure + - Size of the extract + - Token count +- **CLI tool**: Run it as a command (Currently on Linux only) +- **Python package**: Import it in your code + + +## 📦 Installation + +``` +pip install gitingest +``` + + +## 💡 Command Line usage + +The `gitingest` command line tool allows you to analyze codebases and create a text dump of their contents. + +```bash +# Basic usage +gitingest /path/to/directory + +# From url +gitingest https://github.com/cyclotruc/gitingest + +# See more options +gitingest --help +``` + +This will write the digest in a text file (default `digest.txt`) in your current working directory. 
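+For completeness, the same flags can also be exercised from Python through
+click's test runner (a sketch with hypothetical arguments; the flags mirror
+the options declared in src/gitingest/cli.py):
+
+```python
+from click.testing import CliRunner
+from gitingest.cli import main
+
+# Equivalent to: gitingest . -o digest.txt -i "*.py" -e "tests/*"
+result = CliRunner().invoke(main, [".", "-o", "digest.txt", "-i", "*.py", "-e", "tests/*"])
+print(result.output)
+```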
+
+
+## 🐛 Python package usage
+
+
+```python
+from gitingest import ingest
+
+summary, tree, content = ingest("path/to/directory")
+
+# or from URL
+summary, tree, content = ingest("https://github.com/cyclotruc/gitingest")
+```
+
+By default, this won't write a file; writing can be enabled with the `output` argument (a short usage sketch follows the next few file listings below).
+
+
+## 🛠️ Using
+- Tailwind CSS - Frontend
+- [FastAPI](https://github.com/fastapi/fastapi) - Backend framework
+- [tiktoken](https://github.com/openai/tiktoken) - Token estimation
+- [apianalytics.dev](https://www.apianalytics.dev/) - Simple Analytics
+
+
+## 🌐 Self-host
+1. Build the image:
+```
+docker build -t gitingest .
+```
+
+2. Run the container:
+```
+docker run -d --name gitingest -p 8000:8000 gitingest
+```
+The application will be available at `http://localhost:8000`.
+Ensure environment variables are set before running the application or deploying it via Docker.
+
+## ✔️ Contributing
+
+Contributions are welcome!
+
+Gitingest aims to be friendly for first-time contributors, with a simple Python and HTML codebase. If you need any help while working with the code, reach out to us on [Discord](https://discord.com/invite/zerRaGK9EC)
+
+### Ways to contribute
+
+1. Provide your feedback and ideas on Discord
+2. Open an issue on GitHub to report a bug
+3. Create a pull request
+    - Fork the repository
+    - Make your changes and test them locally
+    - Open a pull request for review and feedback
+
+### 🔧 Local dev
+
+#### Environment Configuration
+You can configure the application using the following environment variables:
+- **`ALLOWED_HOSTS`**: Specify allowed hostnames for the application. Default: `"gitingest.com,*.gitingest.com,gitdigest.dev,localhost"`.
+
+```bash
+ALLOWED_HOSTS="gitingest.local,localhost"
+```
+
+#### Run locally
+1. Clone the repository
+```bash
+git clone https://github.com/cyclotruc/gitingest.git
+cd gitingest
+```
+
+2. Install dependencies
+```bash
+pip install -r requirements.txt
+```
+
+3. Run the application:
+```bash
+cd src
+uvicorn main:app --reload
+```
+
+The frontend will be available at `localhost:8000`
+
+
+
+
+================================================
+File: /.dockerignore
+================================================
+# Git
+.git
+.gitignore
+
+# Python
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+env
+pip-log.txt
+pip-delete-this-directory.txt
+.tox
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.log
+
+# Virtual environment
+venv
+.env
+.venv
+ENV
+
+# IDE
+.idea
+.vscode
+*.swp
+*.swo
+
+# Project specific
+docs/
+tests/
+*.md
+LICENSE
+pytest.ini
+setup.py
+
+
+================================================
+File: /.env
+================================================
+ALLOWED_HOSTS="gitingest.local,localhost"
+
+
+================================================
+File: /.git\config
+================================================
+[core]
+	repositoryformatversion = 0
+	filemode = false
+	bare = false
+	logallrefupdates = true
+	symlinks = false
+	ignorecase = true
+[remote "origin"]
+	url = https://github.com/cyclotruc/gitingest.git
+	fetch = +refs/heads/*:refs/remotes/origin/*
+[branch "main"]
+	remote = origin
+	merge = refs/heads/main
+	vscode-merge-base = origin/main
+
+
+================================================
+File: /.git\description
+================================================
+Unnamed repository; edit this file 'description' to name the repository.
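+
+As an aside to the Python package usage documented in the README above, here
+is a minimal sketch of calling `ingest` with its optional arguments (the URL,
+size limit, and patterns are only examples; the signature matches
+`src/gitingest/ingest.py` shown later in this digest):
+
+```python
+from gitingest import ingest
+
+summary, tree, content = ingest(
+    "https://github.com/cyclotruc/gitingest",
+    max_file_size=50 * 1024,        # skip contents of files larger than 50KB
+    include_patterns=["*.py"],      # only ingest Python sources
+    output="digest.txt",            # also write tree + content to this file
+)
+print(summary)
+```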
+ + +================================================ +File: /.git\HEAD +================================================ +ref: refs/heads/main + + +================================================ +File: /.git\hooks\applypatch-msg.sample +================================================ +#!/bin/sh +# +# An example hook script to check the commit log message taken by +# applypatch from an e-mail message. +# +# The hook should exit with non-zero status after issuing an +# appropriate message if it wants to stop the commit. The hook is +# allowed to edit the commit message file. +# +# To enable this hook, rename this file to "applypatch-msg". + +. git-sh-setup +commitmsg="$(git rev-parse --git-path hooks/commit-msg)" +test -x "$commitmsg" && exec "$commitmsg" ${1+"$@"} +: + + +================================================ +File: /.git\hooks\commit-msg.sample +================================================ +#!/bin/sh +# +# An example hook script to check the commit log message. +# Called by "git commit" with one argument, the name of the file +# that has the commit message. The hook should exit with non-zero +# status after issuing an appropriate message if it wants to stop the +# commit. The hook is allowed to edit the commit message file. +# +# To enable this hook, rename this file to "commit-msg". + +# Uncomment the below to add a Signed-off-by line to the message. +# Doing this in a hook is a bad idea in general, but the prepare-commit-msg +# hook is more suited to it. +# +# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') +# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1" + +# This example catches duplicate Signed-off-by lines. + +test "" = "$(grep '^Signed-off-by: ' "$1" | + sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || { + echo >&2 Duplicate Signed-off-by lines. + exit 1 +} + + +================================================ +File: /.git\hooks\fsmonitor-watchman.sample +================================================ +#!/usr/bin/perl + +use strict; +use warnings; +use IPC::Open2; + +# An example hook script to integrate Watchman +# (https://facebook.github.io/watchman/) with git to speed up detecting +# new and modified files. +# +# The hook is passed a version (currently 2) and last update token +# formatted as a string and outputs to stdout a new update token and +# all files that have been modified since the update token. Paths must +# be relative to the root of the working tree and separated by a single NUL. +# +# To enable this hook, rename this file to "query-watchman" and set +# 'git config core.fsmonitor .git/hooks/query-watchman' +# +my ($version, $last_update_token) = @ARGV; + +# Uncomment for debugging +# print STDERR "$0 $version $last_update_token\n"; + +# Check the hook interface version +if ($version ne 2) { + die "Unsupported query-fsmonitor hook version '$version'.\n" . 
+ "Falling back to scanning...\n"; +} + +my $git_work_tree = get_working_dir(); + +my $retry = 1; + +my $json_pkg; +eval { + require JSON::XS; + $json_pkg = "JSON::XS"; + 1; +} or do { + require JSON::PP; + $json_pkg = "JSON::PP"; +}; + +launch_watchman(); + +sub launch_watchman { + my $o = watchman_query(); + if (is_work_tree_watched($o)) { + output_result($o->{clock}, @{$o->{files}}); + } +} + +sub output_result { + my ($clockid, @files) = @_; + + # Uncomment for debugging watchman output + # open (my $fh, ">", ".git/watchman-output.out"); + # binmode $fh, ":utf8"; + # print $fh "$clockid\n@files\n"; + # close $fh; + + binmode STDOUT, ":utf8"; + print $clockid; + print "\0"; + local $, = "\0"; + print @files; +} + +sub watchman_clock { + my $response = qx/watchman clock "$git_work_tree"/; + die "Failed to get clock id on '$git_work_tree'.\n" . + "Falling back to scanning...\n" if $? != 0; + + return $json_pkg->new->utf8->decode($response); +} + +sub watchman_query { + my $pid = open2(\*CHLD_OUT, \*CHLD_IN, 'watchman -j --no-pretty') + or die "open2() failed: $!\n" . + "Falling back to scanning...\n"; + + # In the query expression below we're asking for names of files that + # changed since $last_update_token but not from the .git folder. + # + # To accomplish this, we're using the "since" generator to use the + # recency index to select candidate nodes and "fields" to limit the + # output to file names only. Then we're using the "expression" term to + # further constrain the results. + my $last_update_line = ""; + if (substr($last_update_token, 0, 1) eq "c") { + $last_update_token = "\"$last_update_token\""; + $last_update_line = qq[\n"since": $last_update_token,]; + } + my $query = <<" END"; + ["query", "$git_work_tree", {$last_update_line + "fields": ["name"], + "expression": ["not", ["dirname", ".git"]] + }] + END + + # Uncomment for debugging the watchman query + # open (my $fh, ">", ".git/watchman-query.json"); + # print $fh $query; + # close $fh; + + print CHLD_IN $query; + close CHLD_IN; + my $response = do {local $/; }; + + # Uncomment for debugging the watch response + # open ($fh, ">", ".git/watchman-response.json"); + # print $fh $response; + # close $fh; + + die "Watchman: command returned no output.\n" . + "Falling back to scanning...\n" if $response eq ""; + die "Watchman: command returned invalid output: $response\n" . + "Falling back to scanning...\n" unless $response =~ /^\{/; + + return $json_pkg->new->utf8->decode($response); +} + +sub is_work_tree_watched { + my ($output) = @_; + my $error = $output->{error}; + if ($retry > 0 and $error and $error =~ m/unable to resolve root .* directory (.*) is not watched/) { + $retry--; + my $response = qx/watchman watch "$git_work_tree"/; + die "Failed to make watchman watch '$git_work_tree'.\n" . + "Falling back to scanning...\n" if $? != 0; + $output = $json_pkg->new->utf8->decode($response); + $error = $output->{error}; + die "Watchman: $error.\n" . + "Falling back to scanning...\n" if $error; + + # Uncomment for debugging watchman output + # open (my $fh, ">", ".git/watchman-output.out"); + # close $fh; + + # Watchman will always return all files on the first query so + # return the fast "everything is dirty" flag to git and do the + # Watchman query just to get it over with now so we won't pay + # the cost in git to look up each individual file. + my $o = watchman_clock(); + $error = $output->{error}; + + die "Watchman: $error.\n" . 
+ "Falling back to scanning...\n" if $error; + + output_result($o->{clock}, ("/")); + $last_update_token = $o->{clock}; + + eval { launch_watchman() }; + return 0; + } + + die "Watchman: $error.\n" . + "Falling back to scanning...\n" if $error; + + return 1; +} + +sub get_working_dir { + my $working_dir; + if ($^O =~ 'msys' || $^O =~ 'cygwin') { + $working_dir = Win32::GetCwd(); + $working_dir =~ tr/\\/\//; + } else { + require Cwd; + $working_dir = Cwd::cwd(); + } + + return $working_dir; +} + + +================================================ +File: /.git\hooks\post-update.sample +================================================ +#!/bin/sh +# +# An example hook script to prepare a packed repository for use over +# dumb transports. +# +# To enable this hook, rename this file to "post-update". + +exec git update-server-info + + +================================================ +File: /.git\hooks\pre-applypatch.sample +================================================ +#!/bin/sh +# +# An example hook script to verify what is about to be committed +# by applypatch from an e-mail message. +# +# The hook should exit with non-zero status after issuing an +# appropriate message if it wants to stop the commit. +# +# To enable this hook, rename this file to "pre-applypatch". + +. git-sh-setup +precommit="$(git rev-parse --git-path hooks/pre-commit)" +test -x "$precommit" && exec "$precommit" ${1+"$@"} +: + + +================================================ +File: /.git\hooks\pre-commit.sample +================================================ +#!/bin/sh +# +# An example hook script to verify what is about to be committed. +# Called by "git commit" with no arguments. The hook should +# exit with non-zero status after issuing an appropriate message if +# it wants to stop the commit. +# +# To enable this hook, rename this file to "pre-commit". + +if git rev-parse --verify HEAD >/dev/null 2>&1 +then + against=HEAD +else + # Initial commit: diff against an empty tree object + against=$(git hash-object -t tree /dev/null) +fi + +# If you want to allow non-ASCII filenames set this variable to true. +allownonascii=$(git config --type=bool hooks.allownonascii) + +# Redirect output to stderr. +exec 1>&2 + +# Cross platform projects tend to avoid non-ASCII filenames; prevent +# them from being added to the repository. We exploit the fact that the +# printable range starts at the space character and ends with tilde. +if [ "$allownonascii" != "true" ] && + # Note that the use of brackets around a tr range is ok here, (it's + # even required, for portability to Solaris 10's /usr/bin/tr), since + # the square bracket bytes happen to fall in the designated range. + test $(git diff-index --cached --name-only --diff-filter=A -z $against | + LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0 +then + cat <<\EOF +Error: Attempt to add a non-ASCII file name. + +This can cause problems if you want to work with people on other platforms. + +To be portable it is advisable to rename the file. + +If you know what you are doing you can disable this check using: + + git config hooks.allownonascii true +EOF + exit 1 +fi + +# If there are whitespace errors, print the offending file names and fail. +exec git diff-index --check --cached $against -- + + +================================================ +File: /.git\hooks\pre-merge-commit.sample +================================================ +#!/bin/sh +# +# An example hook script to verify what is about to be committed. +# Called by "git merge" with no arguments. 
The hook should
+# exit with non-zero status after issuing an appropriate message to
+# stderr if it wants to stop the merge commit.
+#
+# To enable this hook, rename this file to "pre-merge-commit".
+
+. git-sh-setup
+test -x "$GIT_DIR/hooks/pre-commit" &&
+        exec "$GIT_DIR/hooks/pre-commit"
+:
+
+
+================================================
+File: /.git\hooks\pre-push.sample
+================================================
+#!/bin/sh
+
+# An example hook script to verify what is about to be pushed. Called by "git
+# push" after it has checked the remote status, but before anything has been
+# pushed. If this script exits with a non-zero status nothing will be pushed.
+#
+# This hook is called with the following parameters:
+#
+# $1 -- Name of the remote to which the push is being done
+# $2 -- URL to which the push is being done
+#
+# If pushing without using a named remote those arguments will be equal.
+#
+# Information about the commits which are being pushed is supplied as lines to
+# the standard input in the form:
+#
+#   <local ref> <local oid> <remote ref> <remote oid>
+#
+# This sample shows how to prevent push of commits where the log message starts
+# with "WIP" (work in progress).
+
+remote="$1"
+url="$2"
+
+zero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')
+
+while read local_ref local_oid remote_ref remote_oid
+do
+	if test "$local_oid" = "$zero"
+	then
+		# Handle delete
+		:
+	else
+		if test "$remote_oid" = "$zero"
+		then
+			# New branch, examine all commits
+			range="$local_oid"
+		else
+			# Update to existing branch, examine new commits
+			range="$remote_oid..$local_oid"
+		fi
+
+		# Check for WIP commit
+		commit=$(git rev-list -n 1 --grep '^WIP' "$range")
+		if test -n "$commit"
+		then
+			echo >&2 "Found WIP commit in $local_ref, not pushing"
+			exit 1
+		fi
+	fi
+done
+
+exit 0
+
+
+================================================
+File: /.git\hooks\pre-rebase.sample
+================================================
+#!/bin/sh
+#
+# Copyright (c) 2006, 2008 Junio C Hamano
+#
+# The "pre-rebase" hook is run just before "git rebase" starts doing
+# its job, and can prevent the command from running by exiting with
+# non-zero status.
+#
+# The hook is called with the following parameters:
+#
+# $1 -- the upstream the series was forked from.
+# $2 -- the branch being rebased (or empty when rebasing the current branch).
+#
+# This sample shows how to prevent topic branches that are already
+# merged to 'next' branch from getting rebased, because allowing it
+# would result in rebasing already published history.
+
+publish=next
+basebranch="$1"
+if test "$#" = 2
+then
+	topic="refs/heads/$2"
+else
+	topic=`git symbolic-ref HEAD` ||
+	exit 0 ;# we do not interrupt rebasing detached HEAD
+fi
+
+case "$topic" in
+refs/heads/??/*)
+	;;
+*)
+	exit 0 ;# we do not interrupt others.
+	;;
+esac
+
+# Now we are dealing with a topic branch being rebased
+# on top of master. Is it OK to rebase it?
+
+# Does the topic really exist?
+git show-ref -q "$topic" || {
+	echo >&2 "No such branch $topic"
+	exit 1
+}
+
+# Is topic fully merged to master?
+not_in_master=`git rev-list --pretty=oneline ^master "$topic"`
+if test -z "$not_in_master"
+then
+	echo >&2 "$topic is fully merged to master; better remove it."
+	exit 1 ;# we could allow it, but there is no point.
+fi
+
+# Is topic ever merged to next? If so you should not be rebasing it.
+only_next_1=`git rev-list ^master "^$topic" ${publish} | sort`
+only_next_2=`git rev-list ^master ${publish} | sort`
+if test "$only_next_1" = "$only_next_2"
+then
+	not_in_topic=`git rev-list "^$topic" master`
+	if test -z "$not_in_topic"
+	then
+		echo >&2 "$topic is already up to date with master"
+		exit 1 ;# we could allow it, but there is no point.
+ else + exit 0 + fi +else + not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"` + /usr/bin/perl -e ' + my $topic = $ARGV[0]; + my $msg = "* $topic has commits already merged to public branch:\n"; + my (%not_in_next) = map { + /^([0-9a-f]+) /; + ($1 => 1); + } split(/\n/, $ARGV[1]); + for my $elem (map { + /^([0-9a-f]+) (.*)$/; + [$1 => $2]; + } split(/\n/, $ARGV[2])) { + if (!exists $not_in_next{$elem->[0]}) { + if ($msg) { + print STDERR $msg; + undef $msg; + } + print STDERR " $elem->[1]\n"; + } + } + ' "$topic" "$not_in_next" "$not_in_master" + exit 1 +fi + +<<\DOC_END + +This sample hook safeguards topic branches that have been +published from being rewound. + +The workflow assumed here is: + + * Once a topic branch forks from "master", "master" is never + merged into it again (either directly or indirectly). + + * Once a topic branch is fully cooked and merged into "master", + it is deleted. If you need to build on top of it to correct + earlier mistakes, a new topic branch is created by forking at + the tip of the "master". This is not strictly necessary, but + it makes it easier to keep your history simple. + + * Whenever you need to test or publish your changes to topic + branches, merge them into "next" branch. + +The script, being an example, hardcodes the publish branch name +to be "next", but it is trivial to make it configurable via +$GIT_DIR/config mechanism. + +With this workflow, you would want to know: + +(1) ... if a topic branch has ever been merged to "next". Young + topic branches can have stupid mistakes you would rather + clean up before publishing, and things that have not been + merged into other branches can be easily rebased without + affecting other people. But once it is published, you would + not want to rewind it. + +(2) ... if a topic branch has been fully merged to "master". + Then you can delete it. More importantly, you should not + build on top of it -- other people may already want to + change things related to the topic as patches against your + "master", so if you need further changes, it is better to + fork the topic (perhaps with the same name) afresh from the + tip of "master". + +Let's look at this example: + + o---o---o---o---o---o---o---o---o---o "next" + / / / / + / a---a---b A / / + / / / / + / / c---c---c---c B / + / / / \ / + / / / b---b C \ / + / / / / \ / + ---o---o---o---o---o---o---o---o---o---o---o "master" + + +A, B and C are topic branches. + + * A has one fix since it was merged up to "next". + + * B has finished. It has been fully merged up to "master" and "next", + and is ready to be deleted. + + * C has not merged to "next" at all. + +We would want to allow C to be rebased, refuse A, and encourage +B to be deleted. + +To compute (1): + + git rev-list ^master ^topic next + git rev-list ^master next + + if these match, topic has not merged in next at all. + +To compute (2): + + git rev-list master..topic + + if this is empty, it is fully merged to "master". + +DOC_END + + +================================================ +File: /.git\hooks\pre-receive.sample +================================================ +#!/bin/sh +# +# An example hook script to make use of push options. +# The example simply echoes all push options that start with 'echoback=' +# and rejects all pushes when the "reject" push option is used. +# +# To enable this hook, rename this file to "pre-receive". 
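+#
+# As a hypothetical usage sketch (assuming the server side has
+# receive.advertisePushOptions enabled and this sample installed as
+# "pre-receive"), a client can exercise both branches below with:
+#
+#   git push -o echoback=hello origin main   # option is echoed back
+#   git push -o reject origin main           # push is rejected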
+ +if test -n "$GIT_PUSH_OPTION_COUNT" +then + i=0 + while test "$i" -lt "$GIT_PUSH_OPTION_COUNT" + do + eval "value=\$GIT_PUSH_OPTION_$i" + case "$value" in + echoback=*) + echo "echo from the pre-receive-hook: ${value#*=}" >&2 + ;; + reject) + exit 1 + esac + i=$((i + 1)) + done +fi + + +================================================ +File: /.git\hooks\prepare-commit-msg.sample +================================================ +#!/bin/sh +# +# An example hook script to prepare the commit log message. +# Called by "git commit" with the name of the file that has the +# commit message, followed by the description of the commit +# message's source. The hook's purpose is to edit the commit +# message file. If the hook fails with a non-zero status, +# the commit is aborted. +# +# To enable this hook, rename this file to "prepare-commit-msg". + +# This hook includes three examples. The first one removes the +# "# Please enter the commit message..." help message. +# +# The second includes the output of "git diff --name-status -r" +# into the message, just before the "git status" output. It is +# commented because it doesn't cope with --amend or with squashed +# commits. +# +# The third example adds a Signed-off-by line to the message, that can +# still be edited. This is rarely a good idea. + +COMMIT_MSG_FILE=$1 +COMMIT_SOURCE=$2 +SHA1=$3 + +/usr/bin/perl -i.bak -ne 'print unless(m/^. Please enter the commit message/..m/^#$/)' "$COMMIT_MSG_FILE" + +# case "$COMMIT_SOURCE,$SHA1" in +# ,|template,) +# /usr/bin/perl -i.bak -pe ' +# print "\n" . `git diff --cached --name-status -r` +# if /^#/ && $first++ == 0' "$COMMIT_MSG_FILE" ;; +# *) ;; +# esac + +# SOB=$(git var GIT_COMMITTER_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') +# git interpret-trailers --in-place --trailer "$SOB" "$COMMIT_MSG_FILE" +# if test -z "$COMMIT_SOURCE" +# then +# /usr/bin/perl -i.bak -pe 'print "\n" if !$first_line++' "$COMMIT_MSG_FILE" +# fi + + +================================================ +File: /.git\hooks\push-to-checkout.sample +================================================ +#!/bin/sh + +# An example hook script to update a checked-out tree on a git push. +# +# This hook is invoked by git-receive-pack(1) when it reacts to git +# push and updates reference(s) in its repository, and when the push +# tries to update the branch that is currently checked out and the +# receive.denyCurrentBranch configuration variable is set to +# updateInstead. +# +# By default, such a push is refused if the working tree and the index +# of the remote repository has any difference from the currently +# checked out commit; when both the working tree and the index match +# the current commit, they are updated to match the newly pushed tip +# of the branch. This hook is to be used to override the default +# behaviour; however the code below reimplements the default behaviour +# as a starting point for convenient modification. +# +# The hook receives the commit with which the tip of the current +# branch is going to be updated: +commit=$1 + +# It can exit with a non-zero status to refuse the push (when it does +# so, it must not modify the index or the working tree). +die () { + echo >&2 "$*" + exit 1 +} + +# Or it can make any necessary changes to the working tree and to the +# index to bring them to the desired state when the tip of the current +# branch is updated to the new commit, and exit with a zero status. 
+# +# For example, the hook can simply run git read-tree -u -m HEAD "$1" +# in order to emulate git fetch that is run in the reverse direction +# with git push, as the two-tree form of git read-tree -u -m is +# essentially the same as git switch or git checkout that switches +# branches while keeping the local changes in the working tree that do +# not interfere with the difference between the branches. + +# The below is a more-or-less exact translation to shell of the C code +# for the default behaviour for git's push-to-checkout hook defined in +# the push_to_deploy() function in builtin/receive-pack.c. +# +# Note that the hook will be executed from the repository directory, +# not from the working tree, so if you want to perform operations on +# the working tree, you will have to adapt your code accordingly, e.g. +# by adding "cd .." or using relative paths. + +if ! git update-index -q --ignore-submodules --refresh +then + die "Up-to-date check failed" +fi + +if ! git diff-files --quiet --ignore-submodules -- +then + die "Working directory has unstaged changes" +fi + +# This is a rough translation of: +# +# head_has_history() ? "HEAD" : EMPTY_TREE_SHA1_HEX +if git cat-file -e HEAD 2>/dev/null +then + head=HEAD +else + head=$(git hash-object -t tree --stdin &2 + exit 1 +} + +unset GIT_DIR GIT_WORK_TREE +cd "$worktree" && + +if grep -q "^diff --git " "$1" +then + validate_patch "$1" +else + validate_cover_letter "$1" +fi && + +if test "$GIT_SENDEMAIL_FILE_COUNTER" = "$GIT_SENDEMAIL_FILE_TOTAL" +then + git config --unset-all sendemail.validateWorktree && + trap 'git worktree remove -ff "$worktree"' EXIT && + validate_series +fi + + +================================================ +File: /.git\hooks\update.sample +================================================ +#!/bin/sh +# +# An example hook script to block unannotated tags from entering. +# Called by "git receive-pack" with arguments: refname sha1-old sha1-new +# +# To enable this hook, rename this file to "update". +# +# Config +# ------ +# hooks.allowunannotated +# This boolean sets whether unannotated tags will be allowed into the +# repository. By default they won't be. +# hooks.allowdeletetag +# This boolean sets whether deleting tags will be allowed in the +# repository. By default they won't be. +# hooks.allowmodifytag +# This boolean sets whether a tag may be modified after creation. By default +# it won't be. +# hooks.allowdeletebranch +# This boolean sets whether deleting branches will be allowed in the +# repository. By default they won't be. +# hooks.denycreatebranch +# This boolean sets whether remotely creating branches will be denied +# in the repository. By default this is allowed. +# + +# --- Command line +refname="$1" +oldrev="$2" +newrev="$3" + +# --- Safety check +if [ -z "$GIT_DIR" ]; then + echo "Don't run this script from the command line." 
>&2 + echo " (if you want, you could supply GIT_DIR then run" >&2 + echo " $0 )" >&2 + exit 1 +fi + +if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then + echo "usage: $0 " >&2 + exit 1 +fi + +# --- Config +allowunannotated=$(git config --type=bool hooks.allowunannotated) +allowdeletebranch=$(git config --type=bool hooks.allowdeletebranch) +denycreatebranch=$(git config --type=bool hooks.denycreatebranch) +allowdeletetag=$(git config --type=bool hooks.allowdeletetag) +allowmodifytag=$(git config --type=bool hooks.allowmodifytag) + +# check for no description +projectdesc=$(sed -e '1q' "$GIT_DIR/description") +case "$projectdesc" in +"Unnamed repository"* | "") + echo "*** Project description file hasn't been set" >&2 + exit 1 + ;; +esac + +# --- Check types +# if $newrev is 0000...0000, it's a commit to delete a ref. +zero=$(git hash-object --stdin &2 + echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2 + exit 1 + fi + ;; + refs/tags/*,delete) + # delete tag + if [ "$allowdeletetag" != "true" ]; then + echo "*** Deleting a tag is not allowed in this repository" >&2 + exit 1 + fi + ;; + refs/tags/*,tag) + # annotated tag + if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1 + then + echo "*** Tag '$refname' already exists." >&2 + echo "*** Modifying a tag is not allowed in this repository." >&2 + exit 1 + fi + ;; + refs/heads/*,commit) + # branch + if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then + echo "*** Creating a branch is not allowed in this repository" >&2 + exit 1 + fi + ;; + refs/heads/*,delete) + # delete branch + if [ "$allowdeletebranch" != "true" ]; then + echo "*** Deleting a branch is not allowed in this repository" >&2 + exit 1 + fi + ;; + refs/remotes/*,commit) + # tracking branch + ;; + refs/remotes/*,delete) + # delete tracking branch + if [ "$allowdeletebranch" != "true" ]; then + echo "*** Deleting a tracking branch is not allowed in this repository" >&2 + exit 1 + fi + ;; + *) + # Anything else (is there anything else?) + echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2 + exit 1 + ;; +esac + +# --- Finished +exit 0 + + +================================================ +File: /.git\info\exclude +================================================ +# git ls-files --others --exclude-from=.git/info/exclude +# Lines that start with '#' are comments. 
+# For a project mostly in C, the following would be a good set of +# exclude patterns (uncomment them if you want to use them): +# *.[oa] +# *~ + + +================================================ +File: /.git\logs\HEAD +================================================ +0000000000000000000000000000000000000000 38f59ba682df7942661d0907e68d38eab52e2d74 Dai Hung PHAM 1735292676 +0100 clone: from https://github.com/cyclotruc/gitingest.git + + +================================================ +File: /.git\logs\refs\heads\main +================================================ +0000000000000000000000000000000000000000 38f59ba682df7942661d0907e68d38eab52e2d74 Dai Hung PHAM 1735292676 +0100 clone: from https://github.com/cyclotruc/gitingest.git + + +================================================ +File: /.git\logs\refs\remotes\origin\HEAD +================================================ +0000000000000000000000000000000000000000 38f59ba682df7942661d0907e68d38eab52e2d74 Dai Hung PHAM 1735292676 +0100 clone: from https://github.com/cyclotruc/gitingest.git + + +================================================ +File: /.git\packed-refs +================================================ +# pack-refs with: peeled fully-peeled sorted +38f59ba682df7942661d0907e68d38eab52e2d74 refs/remotes/origin/main + + +================================================ +File: /.git\refs\heads\main +================================================ +38f59ba682df7942661d0907e68d38eab52e2d74 + + +================================================ +File: /.git\refs\remotes\origin\HEAD +================================================ +ref: refs/remotes/origin/main + + +================================================ +File: /.github\workflows\unitest.yml +================================================ +name: Unit Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest pytest-asyncio + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install -e . + + - name: Run tests + run: | + pytest + +================================================ +File: /.gitignore +================================================ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +tmp/* + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ +.vscode/settings.json +.DS_Store + +# Project specific +history.txt +cleanup.py +Caddyfile + +# ignore default output directory +tmp/* + + +================================================ +File: /CODE_OF_CONDUCT.md +================================================ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. 
+ +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +romain@coderamp.io. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. 
Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. + + +================================================ +File: /Dockerfile +================================================ +# Build stage +FROM python:3.12-slim AS builder + +WORKDIR /build + +# Copy requirements first to leverage Docker cache +COPY requirements.txt . + +# Install build dependencies and Python packages +RUN apt-get update \ + && apt-get install -y --no-install-recommends gcc python3-dev \ + && pip install --no-cache-dir --upgrade pip \ + && pip install --no-cache-dir --timeout 1000 -r requirements.txt \ + && rm -rf /var/lib/apt/lists/* + +# Runtime stage +FROM python:3.12-slim + +# Set Python environment variables +ENV PYTHONUNBUFFERED=1 +ENV PYTHONDONTWRITEBYTECODE=1 + +# Install git +RUN apt-get update \ + && apt-get install -y --no-install-recommends git \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Create a non-root user +RUN useradd -m -u 1000 appuser + +COPY --from=builder /usr/local/lib/python3.12/site-packages/ /usr/local/lib/python3.12/site-packages/ +COPY src/ ./ + +# Change ownership of the application files +RUN chown -R appuser:appuser /app + +# Switch to non-root user +USER appuser + +EXPOSE 8000 + +CMD ["python", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] + + +================================================ +File: /LICENSE +================================================ +MIT License + +Copyright (c) 2024 Romain Courtois + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +================================================ +File: /pytest.ini +================================================ +[pytest] +pythonpath = src +testpaths = src/gitingest/tests +asyncio_mode = auto + + +python_files = test_*.py +python_classes = Test* +python_functions = test_* + +================================================ +File: /requirements.txt +================================================ +fastapi[standard] +uvicorn +fastapi-analytics +slowapi +tiktoken +pytest +pytest-asyncio +click>=8.0.0 + + +================================================ +File: /SECURITY.md +================================================ +# Security Policy + +## Reporting a Vulnerability + +If you have discovered a vulnerability inside the project, report it privately at romain@coderamp.io. This way the maintainer can work on a proper fix without disclosing the problem to the public before it has been solved. + + +================================================ +File: /setup.py +================================================ +from setuptools import setup, find_packages + +setup( + name="gitingest", + version="0.1.2", + packages=find_packages(where="src"), + package_dir={"": "src"}, + include_package_data=True, + install_requires=[ + "click>=8.0.0", + "tiktoken", + ], + entry_points={ + "console_scripts": [ + "gitingest=gitingest.cli:main", + ], + }, + python_requires=">=3.6", + author="Romain Courtois", + author_email="romain@coderamp.io", + description="CLI tool to analyze and create text dumps of codebases for LLMs", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", + url="https://github.com/cyclotruc/gitingest", + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + ], +) + +================================================ +File: /src\config.py +================================================ +MAX_DISPLAY_SIZE = 300000 +TMP_BASE_PATH = "../tmp" + +EXAMPLE_REPOS = [ + {"name": "Gitingest", "url": "https://github.com/cyclotruc/gitingest"}, + {"name": "FastAPI", "url": "https://github.com/tiangolo/fastapi"}, + {"name": "Flask", "url": "https://github.com/pallets/flask"}, + {"name": "Tldraw", "url": "https://github.com/tldraw/tldraw"}, + {"name": "ApiAnalytics", "url": "https://github.com/tom-draper/api-analytics"}, +] + + +================================================ +File: /src\gitingest\cli.py +================================================ +import os +import pathlib +import click +import sys +from .encoding import setup_encoding + +# Setup encoding first +setup_encoding() + +# Define constants +MAX_FILE_SIZE = 51200 # 50KB by default +DEFAULT_IGNORE_PATTERNS = [] + +def normalize_pattern(pattern: str) -> str: + pattern = pattern.strip() + pattern = pattern.lstrip(os.sep) + if pattern.endswith(os.sep): + pattern += "*" + return pattern + +@click.command() +@click.argument('source', type=str, required=True) +@click.option('--output', '-o', default=None, + 
help='Output file path (default: .txt in current directory)') +@click.option('--max-size', '-s', default=MAX_FILE_SIZE, + help='Maximum file size to process in bytes') +@click.option('--exclude-pattern', '-e', multiple=True, + help='Patterns to exclude') +@click.option('--include-pattern', '-i', multiple=True, + help='Patterns to include') +def main(source, output, max_size, exclude_pattern, include_pattern): + """Analyze a directory and create a text dump of its contents.""" + try: + from gitingest.ingest import ingest + + # Convert paths to absolute with proper encoding + source = str(pathlib.Path(source).resolve()) + + # Handle patterns + exclude_patterns = list(exclude_pattern) + include_patterns = list(set(include_pattern)) + + # Set default output name + if not output: + output = "digest.txt" + output = str(pathlib.Path(output).resolve()) + + # Call ingest with encoding awareness + summary, tree, content = ingest( + source, + max_size, + include_patterns, + exclude_patterns, + output=output + ) + + # Write output with explicit encoding + with open(output, 'w', encoding='utf-8', errors='replace') as f: + if isinstance(summary, bytes): + summary = summary.decode('utf-8', errors='replace') + if isinstance(tree, bytes): + tree = tree.decode('utf-8', errors='replace') + if isinstance(content, bytes): + content = content.decode('utf-8', errors='replace') + + f.write(f"{summary}\n\n{tree}\n\n{content}") + + # Print messages with encoding handling + click.echo(f"Analysis complete! Output written to: {output}") + click.echo("\nSummary:") + click.echo(summary) + + except Exception as e: + click.echo(f"Error: {str(e)}", err=True) + raise click.Abort() + +if __name__ == '__main__': + main() + +================================================ +File: /src\gitingest\clone.py +================================================ +import asyncio +from typing import Tuple + +from gitingest.utils import async_timeout + +CLONE_TIMEOUT = 20 + +async def check_repo_exists(url: str) -> bool: + proc = await asyncio.create_subprocess_exec( + "curl", + "-I", + url, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + if proc.returncode != 0: + return False + # Check if stdout contains "404" status code + stdout_str = stdout.decode() + return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str + +@async_timeout(CLONE_TIMEOUT) +async def clone_repo(query: dict) -> str: + if not await check_repo_exists(query['url']): + raise ValueError("Repository not found, make sure it is public") + + if query['commit']: + proc = await asyncio.create_subprocess_exec( + "git", + "clone", + "--single-branch", + query['url'], + query['local_path'], + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + + proc = await asyncio.create_subprocess_exec( + "git", + "-C", + query['local_path'], + "checkout", + query['branch'], + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + elif query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: + proc = await asyncio.create_subprocess_exec( + "git", + "clone", + "--depth=1", + "--single-branch", + "--branch", + query['branch'], + query['url'], + query['local_path'], + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + else: + proc = await asyncio.create_subprocess_exec( + "git", + "clone", + "--depth=1", + "--single-branch", + query['url'], + 
query['local_path'], + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + stdout, stderr = await proc.communicate() + + return stdout, stderr + +================================================ +File: /src\gitingest\encoding.py +================================================ +import sys +import io +import codecs + +def setup_encoding(): + if sys.stdout.encoding != 'utf-8': + sys.stdout = io.TextIOWrapper( + sys.stdout.buffer, + encoding='utf-8', + errors='replace' + ) + if sys.stderr.encoding != 'utf-8': + sys.stderr = io.TextIOWrapper( + sys.stderr.buffer, + encoding='utf-8', + errors='replace' + ) + +================================================ +File: /src\gitingest\ingest.py +================================================ +import asyncio +import shutil +from typing import Union, List +from pathlib import Path +import io +import sys + +# Import other modules from the package +from gitingest.parse_query import parse_query +from gitingest.clone import clone_repo +from gitingest.ingest_from_query import ingest_from_query + +def setup_encoding(): + if sys.stdout.encoding != 'utf-8': + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') + if sys.stderr.encoding != 'utf-8': + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace') + +def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, + include_patterns: Union[List[str], str] = None, + exclude_patterns: Union[List[str], str] = None, + output: str = None) -> tuple[str, str, str]: + """ + Analyze and create a text dump of source contents. + + Args: + source: Path to source directory or git URL + max_file_size: Maximum file size to process in bytes + include_patterns: Patterns to include in analysis + exclude_patterns: Patterns to exclude from analysis + output: Output file path + + Returns: + Tuple of (summary, tree, content) + """ + setup_encoding() + query = None + + try: + query = parse_query(source, max_file_size, False, include_patterns, exclude_patterns) + if query['url']: + asyncio.run(clone_repo(query)) + + summary, tree, content = ingest_from_query(query) + + if output: + # Write with explicit UTF-8 encoding + with open(output, "w", encoding='utf-8', errors='replace') as f: + # Ensure all content is properly encoded + tree = tree.encode('utf-8', errors='replace').decode('utf-8') if isinstance(tree, str) else tree + content = content.encode('utf-8', errors='replace').decode('utf-8') if isinstance(content, str) else content + f.write(f"{tree}\n{content}") + + return summary, tree, content + + except UnicodeEncodeError as e: + # Handle encoding errors specifically + error_msg = f"Encoding error while processing {source}: {str(e)}" + raise RuntimeError(error_msg) + + except Exception as e: + # Handle other errors + error_msg = f"Error while processing {source}: {str(e)}" + raise RuntimeError(error_msg) + + finally: + # Clean up the temporary directory if it was created + if query and query.get('url'): + # Get parent directory two levels up from local_path (../tmp) + cleanup_path = str(Path(query['local_path']).parents[1]) + try: + shutil.rmtree(cleanup_path, ignore_errors=True) + except Exception as e: + print(f"Warning: Could not clean up temporary directory: {str(e)}", file=sys.stderr) + +================================================ +File: /src\gitingest\ingest_from_query.py +================================================ +import os +from fnmatch import fnmatch +from typing import Dict, List, Union +import tiktoken + + 
+MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB +MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal +MAX_FILES = 10000 # Maximum number of files to process +MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500MB + + +def should_include(path: str, base_path: str, include_patterns: List[str]) -> bool: + rel_path = path.replace(base_path, "").lstrip(os.sep) + include = False + for pattern in include_patterns: + if fnmatch(rel_path, pattern): + include = True + return include + +def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> bool: + rel_path = path.replace(base_path, "").lstrip(os.sep) + for pattern in ignore_patterns: + if pattern == '': + continue + if fnmatch(rel_path, pattern): + return True + return False + +def is_safe_symlink(symlink_path: str, base_path: str) -> bool: + """Check if a symlink points to a location within the base directory.""" + try: + target_path = os.path.realpath(symlink_path) + base_path = os.path.realpath(base_path) + return os.path.commonpath([target_path, base_path]) == base_path + except (OSError, ValueError): + # If there's any error resolving the paths, consider it unsafe + return False + +def is_text_file(file_path: str) -> bool: + """Determines if a file is likely a text file based on its content.""" + try: + with open(file_path, 'rb') as file: + chunk = file.read(1024) + return not bool(chunk.translate(None, bytes([7, 8, 9, 10, 12, 13, 27] + list(range(0x20, 0x100))))) + except IOError: + return False + +def read_file_content(file_path: str) -> str: + try: + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + return f.read() + except Exception as e: + return f"Error reading file: {str(e)}" + +def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = 0, stats: Dict = None) -> Dict: + """Recursively analyzes a directory and its contents with safety limits.""" + if seen_paths is None: + seen_paths = set() + if stats is None: + stats = {"total_files": 0, "total_size": 0} + + if depth > MAX_DIRECTORY_DEPTH: + print(f"Skipping deep directory: {path} (max depth {MAX_DIRECTORY_DEPTH} reached)") + return None + + if stats["total_files"] >= MAX_FILES: + print(f"Skipping further processing: maximum file limit ({MAX_FILES}) reached") + return None + + if stats["total_size"] >= MAX_TOTAL_SIZE_BYTES: + print(f"Skipping further processing: maximum total size ({MAX_TOTAL_SIZE_BYTES/1024/1024:.1f}MB) reached") + return None + + real_path = os.path.realpath(path) + if real_path in seen_paths: + print(f"Skipping already visited path: {path}") + return None + seen_paths.add(real_path) + + result = { + "name": os.path.basename(path), + "type": "directory", + "size": 0, + "children": [], + "file_count": 0, + "dir_count": 0, + "path": path, + "ignore_content": False + } + + ignore_patterns = query['ignore_patterns'] + base_path = query['local_path'] + include_patterns = query['include_patterns'] + + try: + for item in os.listdir(path): + item_path = os.path.join(path, item) + + if should_exclude(item_path, base_path, ignore_patterns): + continue + + is_file = os.path.isfile(item_path) + if is_file and query['include_patterns']: + if not should_include(item_path, base_path, include_patterns): + result["ignore_content"] = True + continue + + # Handle symlinks + if os.path.islink(item_path): + if not is_safe_symlink(item_path, base_path): + print(f"Skipping symlink that points outside base directory: {item_path}") + continue + real_path = os.path.realpath(item_path) + if real_path in seen_paths: + print(f"Skipping 
already visited symlink target: {item_path}") + continue + + if os.path.isfile(real_path): + file_size = os.path.getsize(real_path) + if stats["total_size"] + file_size > MAX_TOTAL_SIZE_BYTES: + print(f"Skipping file {item_path}: would exceed total size limit") + continue + + stats["total_files"] += 1 + stats["total_size"] += file_size + + if stats["total_files"] > MAX_FILES: + print(f"Maximum file limit ({MAX_FILES}) reached") + return result + + is_text = is_text_file(real_path) + content = read_file_content(real_path) if is_text else "[Non-text file]" + + child = { + "name": item, + "type": "file", + "size": file_size, + "content": content, + "path": item_path + } + result["children"].append(child) + result["size"] += file_size + result["file_count"] += 1 + + elif os.path.isdir(real_path): + subdir = scan_directory(real_path, query, seen_paths, depth + 1, stats) + if subdir and (not include_patterns or subdir["file_count"] > 0): + subdir["name"] = item + subdir["path"] = item_path + result["children"].append(subdir) + result["size"] += subdir["size"] + result["file_count"] += subdir["file_count"] + result["dir_count"] += 1 + subdir["dir_count"] + continue + + if os.path.isfile(item_path): + file_size = os.path.getsize(item_path) + if stats["total_size"] + file_size > MAX_TOTAL_SIZE_BYTES: + print(f"Skipping file {item_path}: would exceed total size limit") + continue + + stats["total_files"] += 1 + stats["total_size"] += file_size + + if stats["total_files"] > MAX_FILES: + print(f"Maximum file limit ({MAX_FILES}) reached") + return result + + is_text = is_text_file(item_path) + content = read_file_content(item_path) if is_text else "[Non-text file]" + + child = { + "name": item, + "type": "file", + "size": file_size, + "content": content, + "path": item_path + } + result["children"].append(child) + result["size"] += file_size + result["file_count"] += 1 + + elif os.path.isdir(item_path): + subdir = scan_directory(item_path, query, seen_paths, depth + 1, stats) + if subdir and (not include_patterns or subdir["file_count"] > 0): + result["children"].append(subdir) + result["size"] += subdir["size"] + result["file_count"] += subdir["file_count"] + result["dir_count"] += 1 + subdir["dir_count"] + + except PermissionError: + print(f"Permission denied: {path}") + + return result + +def extract_files_content(query: dict, node: Dict, max_file_size: int, files: List = None) -> List[Dict]: + """Recursively collects all text files with their contents.""" + if files is None: + files = [] + + if node["type"] == "file" and node["content"] != "[Non-text file]": + content = node["content"] + if node["size"] > max_file_size: + content = None + + files.append({ + "path": node["path"].replace(query['local_path'], ""), + "content": content, + "size": node["size"] + }) + elif node["type"] == "directory": + for child in node["children"]: + extract_files_content(query, child, max_file_size, files) + return files + +def create_file_content_string(files: List[Dict]) -> str: + """Creates a formatted string of file contents with separators.""" + output = "" + separator = "=" * 48 + "\n" + + # First add README.md if it exists + for file in files: + if not file['content']: + continue + if file['path'].lower() == '/readme.md': + output += separator + output += f"File: {file['path']}\n" + output += separator + output += f"{file['content']}\n\n" + break + + # Then add all other files in their original order + for file in files: + if not file['content'] or file['path'].lower() == '/readme.md': + continue + output += 
separator + output += f"File: {file['path']}\n" + output += separator + output += f"{file['content']}\n\n" + + return output + +def create_summary_string(query: dict, nodes: Dict, files: List[Dict]) -> str: + """Creates a summary string with file counts and content size.""" + if "user_name" in query: + summary = f"Repository: {query['user_name']}/{query['repo_name']}\n" + else: + summary = f"Repository: {query['slug']}\n" + summary += f"Files analyzed: {nodes['file_count']}\n" + + if 'subpath' in query and query['subpath'] != '/': + summary += f"Subpath: {query['subpath']}\n" + if 'commit' in query and query['commit']: + summary += f"Commit: {query['commit']}\n" + elif 'branch' in query and query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: + summary += f"Branch: {query['branch']}\n" + return summary + +def create_tree_structure(query: dict, node: Dict, prefix: str = "", is_last: bool = True) -> str: + """Creates a tree-like string representation of the file structure.""" + tree = "" + if not node["name"]: + node["name"] = query['slug'] + + if node["name"]: + current_prefix = "└── " if is_last else "├── " + name = node["name"] + "/" if node["type"] == "directory" else node["name"] + tree += prefix + current_prefix + name + "\n" + if node["type"] == "directory": + # Adjust prefix only if we added a node name + new_prefix = prefix + (" " if is_last else "│ ") if node["name"] else prefix + children = node["children"] + for i, child in enumerate(children): + tree += create_tree_structure(query, child, new_prefix, i == len(children) - 1) + + return tree + +def generate_token_string(context_string: str) -> str: + """Returns the number of tokens in a text string.""" + formatted_tokens = "" + try: + encoding = tiktoken.get_encoding("cl100k_base", ) + total_tokens = len(encoding.encode(context_string, disallowed_special=())) + + except Exception as e: + print(e) + return None + if total_tokens > 1000000: + formatted_tokens = f"{total_tokens/1000000:.1f}M" + elif total_tokens > 1000: + formatted_tokens = f"{total_tokens/1000:.1f}k" + else: + formatted_tokens = f"{total_tokens}" + return formatted_tokens + +def ingest_single_file(path: str, query: dict) -> Dict: + if not os.path.isfile(path): + raise ValueError(f"Path {path} is not a file") + + file_size = os.path.getsize(path) + is_text = is_text_file(path) + if not is_text: + raise ValueError(f"File {path} is not a text file") + + content = read_file_content(path) + if file_size > query['max_file_size']: + content = "[Content ignored: file too large]" + + file_info = { + "path": path.replace(query['local_path'], ""), + "content": content, + "size": file_size + } + + summary = ( + f"Repository: {query['user_name']}/{query['repo_name']}\n" + f"File: {os.path.basename(path)}\n" + f"Size: {file_size:,} bytes\n" + f"Lines: {len(content.splitlines()):,}\n" + ) + + files_content = create_file_content_string([file_info]) + tree = "Directory structure:\n└── " + os.path.basename(path) + + formatted_tokens = generate_token_string(files_content) + if formatted_tokens: + summary += f"\nEstimated tokens: {formatted_tokens}" + return (summary, tree, files_content) + +def ingest_directory(path: str, query: dict) -> Dict: + nodes = scan_directory(path, query) + files = extract_files_content(query, nodes, query['max_file_size']) + summary = create_summary_string(query, nodes, files) + tree = "Directory structure:\n" + create_tree_structure(query, nodes) + files_content = create_file_content_string(files) + + formatted_tokens = 
generate_token_string(tree + files_content) + if formatted_tokens: + summary += f"\nEstimated tokens: {formatted_tokens}" + return (summary, tree, files_content) + +def ingest_from_query(query: dict) -> Dict: + """Main entry point for analyzing a codebase directory or single file.""" + path = f"{query['local_path']}{query['subpath']}" + if not os.path.exists(path): + raise ValueError(f"{query['slug']} cannot be found") + + if query.get('type') == 'blob': + return ingest_single_file(path, query) + else: + return ingest_directory(path, query) + + + +================================================ +File: /src\gitingest\parse_query.py +================================================ +from typing import List, Union +import uuid +import os + + +DEFAULT_IGNORE_PATTERNS = [ + # Python + '*.pyc', '*.pyo', '*.pyd', '__pycache__', '.pytest_cache', '.coverage', + '.tox', '.nox', '.mypy_cache', '.ruff_cache', '.hypothesis', + 'poetry.lock', 'Pipfile.lock', + + # JavaScript/Node + 'node_modules', 'bower_components', 'package-lock.json', 'yarn.lock', + '.npm', '.yarn', '.pnpm-store', + + # Version control + '.git', '.svn', '.hg', '.gitignore', '.gitattributes', '.gitmodules', + + # Images and media + '*.svg', '*.png', '*.jpg', '*.jpeg', '*.gif', '*.ico', '*.pdf', + '*.mov', '*.mp4', '*.mp3', '*.wav', + + # Virtual environments + 'venv', '.venv', 'env', '.env', 'virtualenv', + + # IDEs and editors + '.idea', '.vscode', '.vs', '*.swp', '*.swo', '*.swn', + '.settings', '.project', '.classpath', '*.sublime-*', + + # Temporary and cache files + '*.log', '*.bak', '*.swp', '*.tmp', '*.temp', + '.cache', '.sass-cache', '.eslintcache', + '.DS_Store', 'Thumbs.db', 'desktop.ini', + + # Build directories and artifacts + 'build', 'dist', 'target', 'out', + '*.egg-info', '*.egg', '*.whl', + '*.so', '*.dylib', '*.dll', '*.class', + + # Documentation + 'site-packages', '.docusaurus', '.next', '.nuxt', + + # Other common patterns + '*.min.js', '*.min.css', # Minified files + '*.map', # Source maps + '.terraform', '*.tfstate*', # Terraform + 'vendor/', # Dependencies in various languages +] + +TMP_BASE_PATH = "../tmp" + +def parse_url(url: str) -> dict: + parsed = { + "user_name": None, + "repo_name": None, + "type": None, + "branch": None, + "commit": None, + "subpath": "/", + "local_path": None, + "url": None, + "slug": None, + "id": None, + } + + url = url.split(" ")[0] + if not url.startswith('https://'): + url = 'https://' + url + + # Extract domain and path + url_parts = url.split('/') + domain = url_parts[2] + path_parts = url_parts[3:] + + if len(path_parts) < 2: + raise ValueError("Invalid repository URL. 
Please provide a valid Git repository URL.") + + parsed["user_name"] = path_parts[0] + parsed["repo_name"] = path_parts[1] + + # Keep original URL format + parsed["url"] = f"https://{domain}/{parsed['user_name']}/{parsed['repo_name']}" + parsed['slug'] = f"{parsed['user_name']}-{parsed['repo_name']}" + parsed["id"] = str(uuid.uuid4()) + parsed["local_path"] = f"{TMP_BASE_PATH}/{parsed['id']}/{parsed['slug']}" + + if len(path_parts) > 3: + parsed["type"] = path_parts[2] + parsed["branch"] = path_parts[3] + if len(parsed['branch']) == 40 and all(c in '0123456789abcdefABCDEF' for c in parsed['branch']): + parsed["commit"] = parsed['branch'] + + parsed["subpath"] = "/" + "/".join(path_parts[4:]) + return parsed + +def normalize_pattern(pattern: str) -> str: + pattern = pattern.strip() + pattern = pattern.lstrip(os.sep) + if pattern.endswith(os.sep): + pattern += "*" + return pattern + +def parse_patterns(pattern: Union[List[str], str]) -> List[str]: + if isinstance(pattern, list): + pattern = ",".join(pattern) + + for p in pattern.split(","): + if not all(c.isalnum() or c in "-_./+*" for c in p.strip()): + raise ValueError(f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed.") + patterns = [normalize_pattern(p) for p in pattern.split(",")] + return patterns + +def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[str]) -> List[str]: + for pattern in include_patterns: + if pattern in ignore_patterns: + ignore_patterns.remove(pattern) + return ignore_patterns + + +def parse_path(path: str) -> dict: + + query = { + "local_path": os.path.abspath(path), + "slug": os.path.basename(os.path.dirname(path)) + "/" + os.path.basename(path), + "subpath": "/", + "id": str(uuid.uuid4()), + "url": None, + } + return query + +def parse_query(source: str, max_file_size: int, from_web: bool, include_patterns: Union[List[str], str] = None, ignore_patterns: Union[List[str], str] = None) -> dict: + if from_web: + query = parse_url(source) + else: + if source.startswith("https://") or "github.com" in source: + query = parse_url(source) + else: + query = parse_path(source) + query['max_file_size'] = max_file_size + + if ignore_patterns and ignore_patterns != "": + ignore_patterns = DEFAULT_IGNORE_PATTERNS + parse_patterns(ignore_patterns) + else: + ignore_patterns = DEFAULT_IGNORE_PATTERNS + + if include_patterns and include_patterns != "": + include_patterns = parse_patterns(include_patterns) + ignore_patterns = override_ignore_patterns(ignore_patterns, include_patterns) + else: + include_patterns = None + + query['ignore_patterns'] = ignore_patterns + query['include_patterns'] = include_patterns + + return query + + + +================================================ +File: /src\gitingest\tests\conftest.py +================================================ +import os +import sys + +# Get the absolute path of the project root directory (one level up from tests) +project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + +# Add both the project root and src directory to PYTHONPATH +sys.path.insert(0, project_root) +sys.path.insert(0, os.path.join(project_root, 'src')) + +================================================ +File: /src\gitingest\tests\test_clone.py +================================================ +import pytest +from clone import clone_repo, check_repo_exists +from unittest.mock import patch, AsyncMock + +@pytest.mark.asyncio +async def 
test_clone_repo_with_commit(): + query = { + 'commit': 'a' * 40, # Simulating a valid commit hash + 'branch': 'main', + 'url': 'https://github.com/user/repo', + 'local_path': '/tmp/repo' + } + + with patch('clone.check_repo_exists', return_value=True) as mock_check: + with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: + mock_process = AsyncMock() + mock_process.communicate.return_value = (b'output', b'error') + mock_exec.return_value = mock_process + + await clone_repo(query) + mock_check.assert_called_once_with(query['url']) + assert mock_exec.call_count == 2 # Clone and checkout calls + +@pytest.mark.asyncio +async def test_clone_repo_without_commit(): + query = { + 'commit': None, + 'branch': 'main', + 'url': 'https://github.com/user/repo', + 'local_path': '/tmp/repo' + } + + with patch('clone.check_repo_exists', return_value=True) as mock_check: + with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: + mock_process = AsyncMock() + mock_process.communicate.return_value = (b'output', b'error') + mock_exec.return_value = mock_process + + await clone_repo(query) + mock_check.assert_called_once_with(query['url']) + assert mock_exec.call_count == 1 # Only clone call + +@pytest.mark.asyncio +async def test_clone_repo_nonexistent_repository(): + query = { + 'commit': None, + 'branch': 'main', + 'url': 'https://github.com/user/nonexistent-repo', + 'local_path': '/tmp/repo' + } + + with patch('gitingest.clone.check_repo_exists', return_value=False) as mock_check: + with pytest.raises(ValueError, match="Repository not found"): + await clone_repo(query) + mock_check.assert_called_once_with(query['url']) + +@pytest.mark.asyncio +async def test_check_repo_exists(): + url = "https://github.com/user/repo" + + with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: + mock_process = AsyncMock() + mock_process.communicate.return_value = (b'HTTP/1.1 200 OK\n', b'') + mock_exec.return_value = mock_process + + # Test existing repository + mock_process.returncode = 0 + assert await check_repo_exists(url) is True + + # Test non-existing repository (404 response) + mock_process.communicate.return_value = (b'HTTP/1.1 404 Not Found\n', b'') + mock_process.returncode = 0 + assert await check_repo_exists(url) is False + + # Test failed request + mock_process.returncode = 1 + assert await check_repo_exists(url) is False + +================================================ +File: /src\gitingest\tests\test_ingest.py +================================================ +import pytest +from src.gitingest.ingest_from_query import ( + scan_directory, + extract_files_content, +) + +# Test fixtures +@pytest.fixture +def sample_query(): + return { + 'user_name': 'test_user', + 'repo_name': 'test_repo', + 'local_path': '/tmp/test_repo', + 'subpath': '/', + 'branch': 'main', + 'commit': None, + 'max_file_size': 1000000, + 'slug': 'test_user/test_repo', + 'ignore_patterns': ['*.pyc', '__pycache__', '.git'], + 'include_patterns': None, + 'pattern_type': 'exclude' + + } + +@pytest.fixture +def temp_directory(tmp_path): + # Creates the following structure: + # test_repo/ + # ├── file1.txt + # ├── file2.py + # └── src/ + # | ├── subfile1.txt + # | └── subfile2.py + # | └── subdir/ + # | └── file_subdir.txt + # | └── file_subdir.py + # └── dir1/ + # | └── file_dir1.txt + # └── dir2/ + # └── file_dir2.txt + + test_dir = tmp_path / "test_repo" + test_dir.mkdir() + + # Root files + (test_dir / "file1.txt").write_text("Hello World") + (test_dir / 
"file2.py").write_text("print('Hello')") + + # src directory and its files + src_dir = test_dir / "src" + src_dir.mkdir() + (src_dir / "subfile1.txt").write_text("Hello from src") + (src_dir / "subfile2.py").write_text("print('Hello from src')") + + # src/subdir and its files + subdir = src_dir / "subdir" + subdir.mkdir() + (subdir / "file_subdir.txt").write_text("Hello from subdir") + (subdir / "file_subdir.py").write_text("print('Hello from subdir')") + + # dir1 and its file + dir1 = test_dir / "dir1" + dir1.mkdir() + (dir1 / "file_dir1.txt").write_text("Hello from dir1") + + # dir2 and its file + dir2 = test_dir / "dir2" + dir2.mkdir() + (dir2 / "file_dir2.txt").write_text("Hello from dir2") + + return test_dir + +def test_scan_directory(temp_directory, sample_query): + result = scan_directory( + str(temp_directory), + query=sample_query + ) + + assert result['type'] == 'directory' + assert result['file_count'] == 8 # All .txt and .py files + assert result['dir_count'] == 4 # src, src/subdir, dir1, dir2 + assert len(result['children']) == 5 # file1.txt, file2.py, src, dir1, dir2 + +def test_extract_files_content(temp_directory, sample_query): + nodes = scan_directory( + str(temp_directory), + query=sample_query + ) + + files = extract_files_content(sample_query, nodes, max_file_size=1000000) + assert len(files) == 8 # All .txt and .py files + + # Check for presence of key files + paths = [f['path'] for f in files] + assert any('file1.txt' in p for p in paths) + assert any('subfile1.txt' in p for p in paths) + assert any('file2.py' in p for p in paths) + assert any('subfile2.py' in p for p in paths) + assert any('file_subdir.txt' in p for p in paths) + assert any('file_dir1.txt' in p for p in paths) + assert any('file_dir2.txt' in p for p in paths) + + + +# TODO: test with include patterns: ['*.txt'] +# TODO: test with wrong include patterns: ['*.qwerty'] + + +#single folder patterns +# TODO: test with include patterns: ['src/*'] +# TODO: test with include patterns: ['/src/*'] +# TODO: test with include patterns: ['/src/'] +# TODO: test with include patterns: ['/src*'] + +#multiple patterns +# TODO: test with multiple include patterns: ['*.txt', '*.py'] +# TODO: test with multiple include patterns: ['/src/*', '*.txt'] +# TODO: test with multiple include patterns: ['/src*', '*.txt'] + + + + + + +================================================ +File: /src\gitingest\tests\test_parse_query.py +================================================ +import pytest +from gitingest.parse_query import parse_query, parse_url, DEFAULT_IGNORE_PATTERNS + + +def test_parse_url_valid(): + test_cases = [ + "https://github.com/user/repo", + "https://gitlab.com/user/repo", + "https://bitbucket.org/user/repo" + ] + for url in test_cases: + result = parse_url(url) + assert result["user_name"] == "user" + assert result["repo_name"] == "repo" + assert result["url"] == url + +def test_parse_url_invalid(): + url = "https://only-domain.com" + with pytest.raises(ValueError, match="Invalid repository URL"): + parse_url(url) + +def test_parse_query_basic(): + test_cases = [ + "https://github.com/user/repo", + "https://gitlab.com/user/repo" + ] + for url in test_cases: + result = parse_query(url, max_file_size=50, from_web=True, ignore_patterns='*.txt') + assert result["user_name"] == "user" + assert result["repo_name"] == "repo" + assert result["url"] == url + assert "*.txt" in result["ignore_patterns"] + +def test_parse_query_include_pattern(): + url = "https://github.com/user/repo" + result = parse_query(url, 
max_file_size=50, from_web=True, include_patterns='*.py') + assert result["include_patterns"] == ["*.py"] + assert result["ignore_patterns"] == DEFAULT_IGNORE_PATTERNS + +def test_parse_query_invalid_pattern(): + url = "https://github.com/user/repo" + with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): + parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py;rm -rf') + +================================================ +File: /src\gitingest\utils.py +================================================ + +## Async Timeout decorator +import asyncio +import functools +from typing import TypeVar, Callable + +T = TypeVar("T") + +class AsyncTimeoutError(Exception): + """Raised when an async operation exceeds its timeout limit.""" + pass + +def async_timeout(seconds: int = 10): + def decorator(func: Callable[..., T]) -> Callable[..., T]: + @functools.wraps(func) + async def wrapper(*args, **kwargs) -> T: + try: + return await asyncio.wait_for(func(*args, **kwargs), timeout=seconds) + except asyncio.TimeoutError: + raise AsyncTimeoutError(f"Clone timed out after {seconds} seconds") + return wrapper + return decorator + +================================================ +File: /src\gitingest\__init__.py +================================================ +from .ingest import ingest +from .parse_query import parse_query +from .clone import clone_repo +from .ingest_from_query import ingest_from_query + +__all__ = ['ingest', 'parse_query', 'clone_repo', 'ingest_from_query'] + +================================================ +File: /src\main.py +================================================ +import os +from dotenv import load_dotenv + +from fastapi import FastAPI, Request +from fastapi.templating import Jinja2Templates +from fastapi.responses import HTMLResponse, FileResponse, Response +from fastapi.staticfiles import StaticFiles +from starlette.middleware.trustedhost import TrustedHostMiddleware +from api_analytics.fastapi import Analytics +from slowapi import _rate_limit_exceeded_handler +from slowapi.errors import RateLimitExceeded + +from server_utils import limiter +from routers import download, dynamic, index + + +load_dotenv() + +app = FastAPI() +app.state.limiter = limiter +app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler) + +app.mount("/static", StaticFiles(directory="static"), name="static") +app.add_middleware(Analytics, api_key=os.getenv('API_ANALYTICS_KEY')) + +# Define the default allowed hosts +default_allowed_hosts = ["gitingest.com", "*.gitingest.com", "localhost", "127.0.0.1"] + +# Fetch allowed hosts from the environment variable or use the default +allowed_hosts = os.getenv("ALLOWED_HOSTS") +if allowed_hosts: + allowed_hosts = allowed_hosts.split(",") +else: + allowed_hosts = default_allowed_hosts + +app.add_middleware(TrustedHostMiddleware, allowed_hosts=allowed_hosts) +templates = Jinja2Templates(directory="templates") + +@app.get("/health") +async def health_check(): + return {"status": "healthy"} + +@app.head("/") +async def head_root(): + """Mirror the headers and status code of the index page""" + return HTMLResponse( + content=None, + headers={ + "content-type": "text/html; charset=utf-8" + } + ) + +@app.get("/api/", response_class=HTMLResponse) +@app.get("/api", response_class=HTMLResponse) +async def api_docs(request: Request): + return templates.TemplateResponse( + "api.jinja", {"request": request} + ) + +@app.get("/robots.txt") +async def robots(): + return FileResponse('static/robots.txt') + 
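
As an aside, the `ALLOWED_HOSTS` fallback above is easy to exercise on its own. Below is a minimal, standalone sketch of the same parsing logic, assuming the comma-separated convention used by the project's `.env` file; the helper name `resolve_allowed_hosts` is hypothetical and not part of the codebase:

```python
import os

# Mirrors the defaults defined in main.py above.
DEFAULT_ALLOWED_HOSTS = ["gitingest.com", "*.gitingest.com", "localhost", "127.0.0.1"]

def resolve_allowed_hosts() -> list[str]:
    raw = os.getenv("ALLOWED_HOSTS")
    # An unset or empty ALLOWED_HOSTS falls back to the defaults; the value
    # is split on commas with no trimming, exactly like the code above.
    return raw.split(",") if raw else DEFAULT_ALLOWED_HOSTS

os.environ["ALLOWED_HOSTS"] = "gitingest.local,localhost"
assert resolve_allowed_hosts() == ["gitingest.local", "localhost"]
```
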
+app.include_router(index) +app.include_router(download) +app.include_router(dynamic) + +================================================ +File: /src\process_query.py +================================================ +from typing import List +from fastapi.templating import Jinja2Templates +from fastapi import Request + +from config import MAX_DISPLAY_SIZE, EXAMPLE_REPOS +from gitingest import ingest_from_query, clone_repo, parse_query +from server_utils import logSliderToSize, Colors + +templates = Jinja2Templates(directory="templates") + +def print_query(query, request, max_file_size, pattern_type, pattern): + print(f"{Colors.WHITE}{query['url']:<20}{Colors.END}", end="") + if int(max_file_size/1024) != 50: + print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") + if pattern_type == "include" and pattern != "": + print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") + elif pattern_type == "exclude" and pattern != "": + print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") + + +def print_error(query, request, e, max_file_size, pattern_type, pattern): + print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") + print_query(query, request, max_file_size, pattern_type, pattern) + print(f" | {Colors.RED}{e}{Colors.END}") + +def print_success(query, request, max_file_size, pattern_type, pattern, summary): + estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] + print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") + print_query(query, request, max_file_size, pattern_type, pattern) + print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") + + + +async def process_query(request: Request, input_text: str, slider_position: int, pattern_type: str = "exclude", pattern: str = "", is_index: bool = False) -> str: + template = "index.jinja" if is_index else "github.jinja" + max_file_size = logSliderToSize(slider_position) + if pattern_type == "include": + include_patterns = pattern + exclude_patterns = None + elif pattern_type == "exclude": + exclude_patterns = pattern + include_patterns = None + try: + query = parse_query(input_text, max_file_size, True, include_patterns, exclude_patterns) + await clone_repo(query) + summary, tree, content = ingest_from_query(query) + with open(f"{query['local_path']}.txt", "w") as f: + f.write(tree + "\n" + content) + + + + except Exception as e: + #hack to print error message when query is not defined + if 'query' in locals() and query is not None and isinstance(query, dict): + print_error(query, request, e, max_file_size, pattern_type, pattern) + else: + print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") + print(f"{Colors.RED}{e}{Colors.END}") + return templates.TemplateResponse( + template, + { + "request": request, + "github_url": input_text, + "error_message": f"Error: {e}", + "examples": EXAMPLE_REPOS if is_index else [], + "default_file_size": slider_position, + "pattern_type": pattern_type, + "pattern": pattern, + } + ) + + if len(content) > MAX_DISPLAY_SIZE: + content = f"(Files content cropped to {int(MAX_DISPLAY_SIZE/1000)}k characters, download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] + print_success(query, request, max_file_size, pattern_type, pattern, summary) + return templates.TemplateResponse( + template, + { + "request": request, + "github_url": input_text, + "result": True, + "summary": summary, + "tree": tree, + "content": content, + "examples": EXAMPLE_REPOS if is_index else 
[], + "ingest_id": query['id'], + "default_file_size": slider_position, + "pattern_type": pattern_type, + "pattern": pattern, + } + ) + + +================================================ +File: /src\routers\download.py +================================================ +from fastapi import HTTPException, APIRouter +from fastapi.responses import Response +from config import TMP_BASE_PATH +import os + +router = APIRouter() + +@router.get("/download/{digest_id}") +async def download_ingest(digest_id: str): + try: + # Find the first .txt file in the directory + directory = f"{TMP_BASE_PATH}/{digest_id}" + txt_files = [f for f in os.listdir(directory) if f.endswith('.txt')] + + if not txt_files: + raise FileNotFoundError("No .txt file found") + + with open(f"{directory}/{txt_files[0]}", "r") as f: + content = f.read() + + return Response( + content=content, + media_type="text/plain", + headers={ + "Content-Disposition": f"attachment; filename={txt_files[0]}" + } + ) + except FileNotFoundError: + raise HTTPException(status_code=404, detail="Digest not found") + +================================================ +File: /src\routers\dynamic.py +================================================ +from fastapi import APIRouter, Request, Form +from fastapi.responses import HTMLResponse +from fastapi.templating import Jinja2Templates + +from process_query import process_query +from server_utils import limiter + +router = APIRouter() +templates = Jinja2Templates(directory="templates") + +@router.get("/{full_path:path}") +async def catch_all(request: Request, full_path: str): + return templates.TemplateResponse( + "github.jinja", + { + "request": request, + "github_url": f"https://github.com/{full_path}", + "loading": True, + "default_file_size": 243 + } + ) + +@router.post("/{full_path:path}", response_class=HTMLResponse) +@limiter.limit("10/minute") +async def process_catch_all( + request: Request, + input_text: str = Form(...), + max_file_size: int = Form(...), + pattern_type: str = Form(...), + pattern: str = Form(...) +): + return await process_query(request, input_text, max_file_size, pattern_type, pattern, is_index=False) + + +================================================ +File: /src\routers\index.py +================================================ +from fastapi import APIRouter, Request, Form +from fastapi.responses import HTMLResponse +from fastapi.templating import Jinja2Templates + +from server_utils import limiter +from process_query import process_query +from config import EXAMPLE_REPOS + + +router = APIRouter() +templates = Jinja2Templates(directory="templates") + + +@router.get("/", response_class=HTMLResponse) +async def home(request: Request): + return templates.TemplateResponse( + "index.jinja", + { + "request": request, + "examples": EXAMPLE_REPOS, + "default_file_size": 243 + } + ) + + +@router.post("/", response_class=HTMLResponse) +@limiter.limit("10/minute") +async def index_post( + request: Request, + input_text: str = Form(...), + max_file_size: int = Form(...), + pattern_type: str = Form(...), + pattern: str = Form(...) 
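+    # Note: all four fields use Form(...), i.e. an Ellipsis default, so they
+    # are required; FastAPI rejects requests missing any of them with a 422.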
+): + return await process_query(request, input_text, max_file_size, pattern_type, pattern, is_index=True) + + + + + + + +================================================ +File: /src\routers\__init__.py +================================================ +from .download import router as download +from .dynamic import router as dynamic +from .index import router as index + +__all__ = ["download", "dynamic", "index"] + +================================================ +File: /src\server_utils.py +================================================ +## Rate Limiter +from slowapi import Limiter +from slowapi.util import get_remote_address +limiter = Limiter(key_func=get_remote_address) + +## Logarithmic slider to file size +import math +def logSliderToSize(position): + """Convert slider position to file size in KB""" + maxp = 500 + minv = math.log(1) + maxv = math.log(102400) + + return round(math.exp(minv + (maxv - minv) * pow(position / maxp, 1.5))) * 1024 + +## Color printing utility +class Colors: + """ANSI color codes""" + BLACK = "\033[0;30m" + RED = "\033[0;31m" + GREEN = "\033[0;32m" + BROWN = "\033[0;33m" + BLUE = "\033[0;34m" + PURPLE = "\033[0;35m" + CYAN = "\033[0;36m" + LIGHT_GRAY = "\033[0;37m" + DARK_GRAY = "\033[1;30m" + LIGHT_RED = "\033[1;31m" + LIGHT_GREEN = "\033[1;32m" + YELLOW = "\033[1;33m" + LIGHT_BLUE = "\033[1;34m" + LIGHT_PURPLE = "\033[1;35m" + LIGHT_CYAN = "\033[1;36m" + WHITE = "\033[1;37m" + BOLD = "\033[1m" + FAINT = "\033[2m" + ITALIC = "\033[3m" + UNDERLINE = "\033[4m" + BLINK = "\033[5m" + NEGATIVE = "\033[7m" + CROSSED = "\033[9m" + END = "\033[0m" + + +================================================ +File: /src\static\js\snow.js +================================================ +// Snow effect initialization +function initSnow() { + const snowCanvas = document.getElementById('snow-canvas'); + const ctx = snowCanvas.getContext('2d'); + + // Configure snow + const snowflakes = []; + const maxSnowflakes = 50; + const spawnInterval = 200; + let currentSnowflakes = 0; + let lastSpawnTime = 0; + + // Resize canvas to window size + function resizeCanvas() { + snowCanvas.width = window.innerWidth; + snowCanvas.height = window.innerHeight; + } + + // Initial setup + resizeCanvas(); + window.addEventListener('resize', resizeCanvas); + + // Snowflake class definition + class Snowflake { + constructor() { + this.reset(); + } + + reset() { + this.x = Math.random() * snowCanvas.width; + this.y = 0; + this.size = Math.random() * 3 + 2; + this.speed = Math.random() * 1 + 0.5; + this.wind = Math.random() * 0.5 - 0.25; + } + + update() { + this.y += this.speed; + this.x += this.wind; + + if (this.y > snowCanvas.height) { + this.reset(); + } + } + + draw() { + ctx.save(); + + ctx.shadowColor = 'rgba(0, 0, 0, 0.3)'; + ctx.shadowBlur = 5; + ctx.shadowOffsetX = 2; + ctx.shadowOffsetY = 2; + + ctx.beginPath(); + ctx.arc(this.x, this.y, this.size, 0, Math.PI * 2); + ctx.fillStyle = 'rgba(255, 255, 255, 1)'; + ctx.fill(); + + ctx.strokeStyle = 'rgba(200, 200, 200, 0.8)'; + ctx.lineWidth = 0.5; + ctx.stroke(); + + ctx.restore(); + } + } + + function animate(currentTime) { + ctx.clearRect(0, 0, snowCanvas.width, snowCanvas.height); + + if (currentSnowflakes < maxSnowflakes && currentTime - lastSpawnTime > spawnInterval) { + snowflakes.push(new Snowflake()); + currentSnowflakes++; + lastSpawnTime = currentTime; + } + + snowflakes.forEach(snowflake => { + snowflake.update(); + snowflake.draw(); + }); + + requestAnimationFrame(animate); + } + + requestAnimationFrame(animate); +} + +// 
Initialize snow when DOM content is loaded +document.addEventListener('DOMContentLoaded', initSnow); + +// Also initialize when the HTMX content is swapped +document.addEventListener('htmx:afterSettle', initSnow); + +================================================ +File: /src\static\js\utils.js +================================================ +// Copy functionality +function copyText(className) { + const textarea = document.querySelector('.' + className); + const button = document.querySelector(`button[onclick="copyText('${className}')"]`); + if (!textarea || !button) return; + + // Copy text + navigator.clipboard.writeText(textarea.value) + .then(() => { + // Store original content + const originalContent = button.innerHTML; + + // Change button content + button.innerHTML = 'Copied!'; + + // Reset after 1 second + setTimeout(() => { + button.innerHTML = originalContent; + }, 1000); + }) + .catch(err => { + // Show error in button + const originalContent = button.innerHTML; + button.innerHTML = 'Failed to copy'; + setTimeout(() => { + button.innerHTML = originalContent; + }, 1000); + }); +} + + +function handleSubmit(event, showLoading = false) { + event.preventDefault(); + const form = event.target || document.getElementById('ingestForm'); + if (!form) return; + + const submitButton = form.querySelector('button[type="submit"]'); + if (!submitButton) return; + + const formData = new FormData(form); + + // Update file size + const slider = document.getElementById('file_size'); + if (slider) { + formData.delete('max_file_size'); + formData.append('max_file_size', slider.value); + } + + // Update pattern type and pattern + const patternType = document.getElementById('pattern_type'); + const pattern = document.getElementById('pattern'); + if (patternType && pattern) { + formData.delete('pattern_type'); + formData.delete('pattern'); + formData.append('pattern_type', patternType.value); + formData.append('pattern', pattern.value); + } + + const originalContent = submitButton.innerHTML; + const currentStars = document.getElementById('github-stars')?.textContent; + + if (showLoading) { + submitButton.disabled = true; + submitButton.innerHTML = ` +
+            Processing...
+ `; + submitButton.classList.add('bg-[#ffb14d]'); + } + + // Submit the form + fetch(form.action, { + method: 'POST', + body: formData + }) + .then(response => response.text()) + .then(html => { + // Store the star count before updating the DOM + const starCount = currentStars; + + + // TEMPORARY SNOW LOGIC // + const parser = new DOMParser(); + const newDoc = parser.parseFromString(html, 'text/html'); + + const existingCanvas = document.getElementById('snow-canvas'); + document.body.innerHTML = newDoc.body.innerHTML; + if (existingCanvas) { + document.body.insertBefore(existingCanvas, document.body.firstChild); + } + // END TEMPORARY SNOW LOGIC // + + // Wait for next tick to ensure DOM is updated + setTimeout(() => { + // Reinitialize slider functionality + initializeSlider(); + + const starsElement = document.getElementById('github-stars'); + if (starsElement && starCount) { + starsElement.textContent = starCount; + } + + // Scroll to results if they exist + const resultsSection = document.querySelector('[data-results]'); + if (resultsSection) { + resultsSection.scrollIntoView({ behavior: 'smooth', block: 'start' }); + } + }, 0); + }) + .catch(error => { + submitButton.disabled = false; + submitButton.innerHTML = originalContent; + }); +} + +function copyFullDigest() { + const directoryStructure = document.querySelector('.directory-structure').value; + const filesContent = document.querySelector('.result-text').value; + const fullDigest = `${directoryStructure}\n\nFiles Content:\n\n${filesContent}`; + const button = document.querySelector('[onclick="copyFullDigest()"]'); + const originalText = button.innerHTML; + + navigator.clipboard.writeText(fullDigest).then(() => { + button.innerHTML = ` + + + + Copied! + `; + + setTimeout(() => { + button.innerHTML = originalText; + }, 2000); + }).catch(err => { + console.error('Failed to copy text: ', err); + }); +} + +// Add the logSliderToSize helper function +function logSliderToSize(position) { + const minp = 0; + const maxp = 500; + const minv = Math.log(1); + const maxv = Math.log(102400); + + const value = Math.exp(minv + (maxv - minv) * Math.pow(position / maxp, 1.5)); + return Math.round(value); +} + +// Move slider initialization to a separate function +function initializeSlider() { + const slider = document.getElementById('file_size'); + const sizeValue = document.getElementById('size_value'); + + if (!slider || !sizeValue) return; + + function updateSlider() { + const value = logSliderToSize(slider.value); + sizeValue.textContent = formatSize(value); + slider.style.backgroundSize = `${(slider.value / slider.max) * 100}% 100%`; + } + + // Update on slider change + slider.addEventListener('input', updateSlider); + + // Initialize slider position + updateSlider(); +} + +// Add helper function for formatting size +function formatSize(sizeInKB) { + if (sizeInKB >= 1024) { + return Math.round(sizeInKB / 1024) + 'mb'; + } + return Math.round(sizeInKB) + 'kb'; +} + +// Initialize slider on page load +document.addEventListener('DOMContentLoaded', initializeSlider); + +// Make sure these are available globally +window.copyText = copyText; + +window.handleSubmit = handleSubmit; +window.initializeSlider = initializeSlider; +window.formatSize = formatSize; + +// Add this new function +function setupGlobalEnterHandler() { + document.addEventListener('keydown', function (event) { + if (event.key === 'Enter' && !event.target.matches('textarea')) { + const form = document.getElementById('ingestForm'); + if (form) { + handleSubmit(new Event('submit'), 
true); + } + } + }); +} + +// Add to the DOMContentLoaded event listener +document.addEventListener('DOMContentLoaded', () => { + initializeSlider(); + setupGlobalEnterHandler(); +}); + + +================================================ +File: /src\static\robots.txt +================================================ +User-agent: * +Allow: / +Allow: /api/ +Allow: /cyclotruc/gitingest/ + + + +================================================ +File: /src\templates\api.jinja +================================================ +{% extends "base.jinja" %} + +{% block title %}Git ingest API{% endblock %} + +{% block content %} +
+[api.jinja markup stripped in extraction; recoverable text:]
+API Documentation
+The API is currently under development.
+We're working on making our API available to the public.
+In the meantime, you can open an issue on github to suggest features.
+{% endblock %} + +================================================ +File: /src\templates\base.jinja +================================================ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + {% block title %}Git ingest{% endblock %} + + + + + + {% block extra_head %}{% endblock %} + + + + + {% include 'components/navbar.jinja' %} + + +
+
+ {% block content %}{% endblock %} +
+
+ + {% include 'components/footer.jinja' %} + + {% block extra_scripts %}{% endblock %} + + + +================================================ +File: /src\templates\components\footer.jinja +================================================ + + +================================================ +File: /src\templates\components\github_form.jinja +================================================ +
+[github_form.jinja form markup stripped in extraction]
+ + {% if show_examples %} + +
+Try these example repositories:
+ {% for example in examples %} + + {% endfor %} +
+
+ {% endif %} +
+
+ +================================================ +File: /src\templates\components\navbar.jinja +================================================ + + +
+[navbar markup stripped in extraction]
+ +================================================ +File: /src\templates\components\result.jinja +================================================ +{% if result %} +
+[result.jinja markup stripped in extraction; recoverable structure:]
+Summary
+{% if ingest_id %}
+[download link markup stripped]
+{% endif %}
+Directory Structure
+Files Content
+{% endif %} + +================================================ +File: /src\templates\github.jinja +================================================ +{% extends "base.jinja" %} + +{% block content %} +{% if error_message %} +
+ {{ error_message }} +
+{% endif %} + +{% with is_index=true, show_examples=false %} + {% include 'components/github_form.jinja' %} +{% endwith %} + +{% if loading %} +
+[loading spinner markup stripped in extraction]
+Loading...
+{% endif %} + +{% include 'components/result.jinja' %} +{% endblock content %} + +{% block extra_scripts %} + +{% endblock extra_scripts %} + +================================================ +File: /src\templates\index.jinja +================================================ +{% extends "base.jinja" %} + +{% block extra_head %} + +{% endblock %} + +{% block content %} +
+[index.jinja hero markup stripped in extraction; recoverable text:]
+Prompt-friendly codebase
+Turn any Git repository into a simple text ingest of its codebase.
+This is useful for feeding a codebase into any LLM.
+You can also replace 'hub' with 'ingest' in any Github URL
+ +{% if error_message %} +
+ {{ error_message }} +
+{% endif %} + +{% with is_index=true, show_examples=true %} + {% include 'components/github_form.jinja' %} +{% endwith %} + +{% include 'components/result.jinja' %} + + + + +{% endblock %} + diff --git a/src/gitingest/__init__.py b/src/gitingest/__init__.py index ed84b214..07417b94 100644 --- a/src/gitingest/__init__.py +++ b/src/gitingest/__init__.py @@ -1,6 +1,6 @@ -from .ingest_from_query import ingest_from_query -from .clone import clone_repo -from .parse_query import parse_query from .ingest import ingest +from .parse_query import parse_query +from .clone import clone_repo +from .ingest_from_query import ingest_from_query -__all__ = ["ingest_from_query", "clone_repo", "parse_query", "ingest"] \ No newline at end of file +__all__ = ['ingest', 'parse_query', 'clone_repo', 'ingest_from_query'] \ No newline at end of file diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 81823e63..0886c638 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -1,10 +1,15 @@ import os import pathlib import click +import sys +from .encoding import setup_encoding -from gitingest.ingest import ingest -from gitingest.ingest_from_query import MAX_FILE_SIZE -from gitingest.parse_query import DEFAULT_IGNORE_PATTERNS +# Setup encoding first +setup_encoding() + +# Define constants +MAX_FILE_SIZE = 51200 # 50KB by default +DEFAULT_IGNORE_PATTERNS = [] def normalize_pattern(pattern: str) -> str: pattern = pattern.strip() @@ -15,21 +20,52 @@ def normalize_pattern(pattern: str) -> str: @click.command() @click.argument('source', type=str, required=True) -@click.option('--output', '-o', default=None, help='Output file path (default: .txt in current directory)') -@click.option('--max-size', '-s', default=MAX_FILE_SIZE, help='Maximum file size to process in bytes') -@click.option('--exclude-pattern', '-e', multiple=True, help='Patterns to exclude') -@click.option('--include-pattern', '-i', multiple=True, help='Patterns to include') +@click.option('--output', '-o', default=None, + help='Output file path (default: .txt in current directory)') +@click.option('--max-size', '-s', default=MAX_FILE_SIZE, + help='Maximum file size to process in bytes') +@click.option('--exclude-pattern', '-e', multiple=True, + help='Patterns to exclude') +@click.option('--include-pattern', '-i', multiple=True, + help='Patterns to include') def main(source, output, max_size, exclude_pattern, include_pattern): """Analyze a directory and create a text dump of its contents.""" try: - # Combine default and custom ignore patterns + from gitingest.ingest import ingest + + # Convert paths to absolute with proper encoding + source = str(pathlib.Path(source).resolve()) + + # Handle patterns exclude_patterns = list(exclude_pattern) include_patterns = list(set(include_pattern)) + # Set default output name if not output: output = "digest.txt" - summary, tree, content = ingest(source, max_size, include_patterns, exclude_patterns, output=output) + output = str(pathlib.Path(output).resolve()) + + # Call ingest with encoding awareness + summary, tree, content = ingest( + source, + max_size, + include_patterns, + exclude_patterns, + output=output + ) + + # Write output with explicit encoding + with open(output, 'w', encoding='utf-8', errors='replace') as f: + if isinstance(summary, bytes): + summary = summary.decode('utf-8', errors='replace') + if isinstance(tree, bytes): + tree = tree.decode('utf-8', errors='replace') + if isinstance(content, bytes): + content = content.decode('utf-8', errors='replace') + + 
f.write(f"{summary}\n\n{tree}\n\n{content}") + # Print messages with encoding handling click.echo(f"Analysis complete! Output written to: {output}") click.echo("\nSummary:") click.echo(summary) @@ -39,4 +75,4 @@ def main(source, output, max_size, exclude_pattern, include_pattern): raise click.Abort() if __name__ == '__main__': - main() \ No newline at end of file + main() \ No newline at end of file diff --git a/src/gitingest/encoding.py b/src/gitingest/encoding.py new file mode 100644 index 00000000..f4e10578 --- /dev/null +++ b/src/gitingest/encoding.py @@ -0,0 +1,17 @@ +import sys +import io +import codecs + +def setup_encoding(): + if sys.stdout.encoding != 'utf-8': + sys.stdout = io.TextIOWrapper( + sys.stdout.buffer, + encoding='utf-8', + errors='replace' + ) + if sys.stderr.encoding != 'utf-8': + sys.stderr = io.TextIOWrapper( + sys.stderr.buffer, + encoding='utf-8', + errors='replace' + ) \ No newline at end of file diff --git a/src/gitingest/ingest.py b/src/gitingest/ingest.py index eac20818..6b3e957a 100644 --- a/src/gitingest/ingest.py +++ b/src/gitingest/ingest.py @@ -2,12 +2,40 @@ import shutil from typing import Union, List from pathlib import Path +import io +import sys -from .ingest_from_query import ingest_from_query -from .clone import clone_repo -from .parse_query import parse_query +# Import other modules from the package +from gitingest.parse_query import parse_query +from gitingest.clone import clone_repo +from gitingest.ingest_from_query import ingest_from_query -def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, include_patterns: Union[List[str], str] = None, exclude_patterns: Union[List[str], str] = None, output: str = None) -> str: +def setup_encoding(): + if sys.stdout.encoding != 'utf-8': + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') + if sys.stderr.encoding != 'utf-8': + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace') + +def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, + include_patterns: Union[List[str], str] = None, + exclude_patterns: Union[List[str], str] = None, + output: str = None) -> tuple[str, str, str]: + """ + Analyze and create a text dump of source contents. 
+ + Args: + source: Path to source directory or git URL + max_file_size: Maximum file size to process in bytes + include_patterns: Patterns to include in analysis + exclude_patterns: Patterns to exclude from analysis + output: Output file path + + Returns: + Tuple of (summary, tree, content) + """ + setup_encoding() + query = None + try: query = parse_query(source, max_file_size, False, include_patterns, exclude_patterns) if query['url']: @@ -16,13 +44,31 @@ def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, include_patterns: summary, tree, content = ingest_from_query(query) if output: - with open(f"{output}", "w") as f: - f.write(tree + "\n" + content) + # Write with explicit UTF-8 encoding + with open(output, "w", encoding='utf-8', errors='replace') as f: + # Ensure all content is properly encoded + tree = tree.encode('utf-8', errors='replace').decode('utf-8') if isinstance(tree, str) else tree + content = content.encode('utf-8', errors='replace').decode('utf-8') if isinstance(content, str) else content + f.write(f"{tree}\n{content}") return summary, tree, content + + except UnicodeEncodeError as e: + # Handle encoding errors specifically + error_msg = f"Encoding error while processing {source}: {str(e)}" + raise RuntimeError(error_msg) + + except Exception as e: + # Handle other errors + error_msg = f"Error while processing {source}: {str(e)}" + raise RuntimeError(error_msg) + finally: # Clean up the temporary directory if it was created - if query['url']: + if query and query.get('url'): # Get parent directory two levels up from local_path (../tmp) cleanup_path = str(Path(query['local_path']).parents[1]) - shutil.rmtree(cleanup_path, ignore_errors=True) \ No newline at end of file + try: + shutil.rmtree(cleanup_path, ignore_errors=True) + except Exception as e: + print(f"Warning: Could not clean up temporary directory: {str(e)}", file=sys.stderr) \ No newline at end of file From ee523cedc74ba97f301df6ff03b85fd55c5ff8b0 Mon Sep 17 00:00:00 2001 From: Dai Hung PHAM Date: Fri, 27 Dec 2024 11:27:30 +0100 Subject: [PATCH 3/6] add digest.txt to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e98f538f..1cad9b5e 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,4 @@ Caddyfile # ignore default output directory tmp/* +digest.txt \ No newline at end of file From 880b6c33e33738fc43ae4d4d02dfbf28c8f43e02 Mon Sep 17 00:00:00 2001 From: Dai Hung PHAM Date: Sat, 28 Dec 2024 17:02:24 +0100 Subject: [PATCH 4/6] delete digest.txt --- digest.txt | 4155 ---------------------------------------------------- 1 file changed, 4155 deletions(-) delete mode 100644 digest.txt diff --git a/digest.txt b/digest.txt deleted file mode 100644 index bbc85aa4..00000000 --- a/digest.txt +++ /dev/null @@ -1,4155 +0,0 @@ -Repository: __temp/gitingest -Files analyzed: 76 - -Estimated tokens: 35.5k - -Directory structure: -└── __temp/gitingest/ - ├── .dockerignore - ├── .env - ├── .git/ - │ ├── config - │ ├── description - │ ├── HEAD - │ ├── hooks/ - │ │ ├── applypatch-msg.sample - │ │ ├── commit-msg.sample - │ │ ├── fsmonitor-watchman.sample - │ │ ├── post-update.sample - │ │ ├── pre-applypatch.sample - │ │ ├── pre-commit.sample - │ │ ├── pre-merge-commit.sample - │ │ ├── pre-push.sample - │ │ ├── pre-rebase.sample - │ │ ├── pre-receive.sample - │ │ ├── prepare-commit-msg.sample - │ │ ├── push-to-checkout.sample - │ │ ├── sendemail-validate.sample - │ │ └── update.sample - │ ├── index - │ ├── info/ - │ │ └── exclude - │ ├── logs/ - │ │ ├── HEAD - │ │ 
- [... remainder of deleted digest.txt omitted: byte-identical to the file added in PATCH 1/6 (blob bbc85aa4) ...]
The hook should
-# exit with non-zero status after issuing an appropriate message to
-# stderr if it wants to stop the merge commit.
-#
-# To enable this hook, rename this file to "pre-merge-commit".
-
-. git-sh-setup
-test -x "$GIT_DIR/hooks/pre-commit" &&
-	exec "$GIT_DIR/hooks/pre-commit"
-:
-
-
-================================================
-File: /.git\hooks\pre-push.sample
-================================================
-#!/bin/sh
-
-# An example hook script to verify what is about to be pushed. Called by "git
-# push" after it has checked the remote status, but before anything has been
-# pushed. If this script exits with a non-zero status nothing will be pushed.
-#
-# This hook is called with the following parameters:
-#
-# $1 -- Name of the remote to which the push is being done
-# $2 -- URL to which the push is being done
-#
-# If pushing without using a named remote those arguments will be equal.
-#
-# Information about the commits which are being pushed is supplied as lines to
-# the standard input in the form:
-#
-#   <local ref> <local oid> <remote ref> <remote oid>
-#
-# This sample shows how to prevent push of commits where the log message starts
-# with "WIP" (work in progress).
-
-remote="$1"
-url="$2"
-
-zero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')
-
-while read local_ref local_oid remote_ref remote_oid
-do
-	if test "$local_oid" = "$zero"
-	then
-		# Handle delete
-		:
-	else
-		if test "$remote_oid" = "$zero"
-		then
-			# New branch, examine all commits
-			range="$local_oid"
-		else
-			# Update to existing branch, examine new commits
-			range="$remote_oid..$local_oid"
-		fi
-
-		# Check for WIP commit
-		commit=$(git rev-list -n 1 --grep '^WIP' "$range")
-		if test -n "$commit"
-		then
-			echo >&2 "Found WIP commit in $local_ref, not pushing"
-			exit 1
-		fi
-	fi
-done
-
-exit 0
-
-
-================================================
-File: /.git\hooks\pre-rebase.sample
-================================================
-#!/bin/sh
-#
-# Copyright (c) 2006, 2008 Junio C Hamano
-#
-# The "pre-rebase" hook is run just before "git rebase" starts doing
-# its job, and can prevent the command from running by exiting with
-# non-zero status.
-#
-# The hook is called with the following parameters:
-#
-# $1 -- the upstream the series was forked from.
-# $2 -- the branch being rebased (or empty when rebasing the current branch).
-#
-# This sample shows how to prevent topic branches that are already
-# merged to 'next' branch from getting rebased, because allowing it
-# would result in rebasing already published history.
-
-publish=next
-basebranch="$1"
-if test "$#" = 2
-then
-	topic="refs/heads/$2"
-else
-	topic=`git symbolic-ref HEAD` ||
-	exit 0 ;# we do not interrupt rebasing detached HEAD
-fi
-
-case "$topic" in
-refs/heads/??/*)
-	;;
-*)
-	exit 0 ;# we do not interrupt others.
-	;;
-esac
-
-# Now we are dealing with a topic branch being rebased
-# on top of master. Is it OK to rebase it?
-
-# Does the topic really exist?
-git show-ref -q "$topic" || {
-	echo >&2 "No such branch $topic"
-	exit 1
-}
-
-# Is topic fully merged to master?
-not_in_master=`git rev-list --pretty=oneline ^master "$topic"`
-if test -z "$not_in_master"
-then
-	echo >&2 "$topic is fully merged to master; better remove it."
-	exit 1 ;# we could allow it, but there is no point.
-fi
-
-# Is topic ever merged to next? If so you should not be rebasing it.
-only_next_1=`git rev-list ^master "^$topic" ${publish} | sort`
-only_next_2=`git rev-list ^master ${publish} | sort`
-if test "$only_next_1" = "$only_next_2"
-then
-	not_in_topic=`git rev-list "^$topic" master`
-	if test -z "$not_in_topic"
-	then
-		echo >&2 "$topic is already up to date with master"
-		exit 1 ;# we could allow it, but there is no point.
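-	# Otherwise master has commits the topic lacks, so rebasing is worthwhile.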
- else - exit 0 - fi -else - not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"` - /usr/bin/perl -e ' - my $topic = $ARGV[0]; - my $msg = "* $topic has commits already merged to public branch:\n"; - my (%not_in_next) = map { - /^([0-9a-f]+) /; - ($1 => 1); - } split(/\n/, $ARGV[1]); - for my $elem (map { - /^([0-9a-f]+) (.*)$/; - [$1 => $2]; - } split(/\n/, $ARGV[2])) { - if (!exists $not_in_next{$elem->[0]}) { - if ($msg) { - print STDERR $msg; - undef $msg; - } - print STDERR " $elem->[1]\n"; - } - } - ' "$topic" "$not_in_next" "$not_in_master" - exit 1 -fi - -<<\DOC_END - -This sample hook safeguards topic branches that have been -published from being rewound. - -The workflow assumed here is: - - * Once a topic branch forks from "master", "master" is never - merged into it again (either directly or indirectly). - - * Once a topic branch is fully cooked and merged into "master", - it is deleted. If you need to build on top of it to correct - earlier mistakes, a new topic branch is created by forking at - the tip of the "master". This is not strictly necessary, but - it makes it easier to keep your history simple. - - * Whenever you need to test or publish your changes to topic - branches, merge them into "next" branch. - -The script, being an example, hardcodes the publish branch name -to be "next", but it is trivial to make it configurable via -$GIT_DIR/config mechanism. - -With this workflow, you would want to know: - -(1) ... if a topic branch has ever been merged to "next". Young - topic branches can have stupid mistakes you would rather - clean up before publishing, and things that have not been - merged into other branches can be easily rebased without - affecting other people. But once it is published, you would - not want to rewind it. - -(2) ... if a topic branch has been fully merged to "master". - Then you can delete it. More importantly, you should not - build on top of it -- other people may already want to - change things related to the topic as patches against your - "master", so if you need further changes, it is better to - fork the topic (perhaps with the same name) afresh from the - tip of "master". - -Let's look at this example: - - o---o---o---o---o---o---o---o---o---o "next" - / / / / - / a---a---b A / / - / / / / - / / c---c---c---c B / - / / / \ / - / / / b---b C \ / - / / / / \ / - ---o---o---o---o---o---o---o---o---o---o---o "master" - - -A, B and C are topic branches. - - * A has one fix since it was merged up to "next". - - * B has finished. It has been fully merged up to "master" and "next", - and is ready to be deleted. - - * C has not merged to "next" at all. - -We would want to allow C to be rebased, refuse A, and encourage -B to be deleted. - -To compute (1): - - git rev-list ^master ^topic next - git rev-list ^master next - - if these match, topic has not merged in next at all. - -To compute (2): - - git rev-list master..topic - - if this is empty, it is fully merged to "master". - -DOC_END - - -================================================ -File: /.git\hooks\pre-receive.sample -================================================ -#!/bin/sh -# -# An example hook script to make use of push options. -# The example simply echoes all push options that start with 'echoback=' -# and rejects all pushes when the "reject" push option is used. -# -# To enable this hook, rename this file to "pre-receive". 
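-#
-# Example invocation (illustrative; the remote and branch names are hypothetical):
-#
-#   git push --push-option=echoback=hello --push-option=reject origin main
-#
-# git exports each value as GIT_PUSH_OPTION_<n> and sets GIT_PUSH_OPTION_COUNT,
-# which the loop below iterates over.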
- -if test -n "$GIT_PUSH_OPTION_COUNT" -then - i=0 - while test "$i" -lt "$GIT_PUSH_OPTION_COUNT" - do - eval "value=\$GIT_PUSH_OPTION_$i" - case "$value" in - echoback=*) - echo "echo from the pre-receive-hook: ${value#*=}" >&2 - ;; - reject) - exit 1 - esac - i=$((i + 1)) - done -fi - - -================================================ -File: /.git\hooks\prepare-commit-msg.sample -================================================ -#!/bin/sh -# -# An example hook script to prepare the commit log message. -# Called by "git commit" with the name of the file that has the -# commit message, followed by the description of the commit -# message's source. The hook's purpose is to edit the commit -# message file. If the hook fails with a non-zero status, -# the commit is aborted. -# -# To enable this hook, rename this file to "prepare-commit-msg". - -# This hook includes three examples. The first one removes the -# "# Please enter the commit message..." help message. -# -# The second includes the output of "git diff --name-status -r" -# into the message, just before the "git status" output. It is -# commented because it doesn't cope with --amend or with squashed -# commits. -# -# The third example adds a Signed-off-by line to the message, that can -# still be edited. This is rarely a good idea. - -COMMIT_MSG_FILE=$1 -COMMIT_SOURCE=$2 -SHA1=$3 - -/usr/bin/perl -i.bak -ne 'print unless(m/^. Please enter the commit message/..m/^#$/)' "$COMMIT_MSG_FILE" - -# case "$COMMIT_SOURCE,$SHA1" in -# ,|template,) -# /usr/bin/perl -i.bak -pe ' -# print "\n" . `git diff --cached --name-status -r` -# if /^#/ && $first++ == 0' "$COMMIT_MSG_FILE" ;; -# *) ;; -# esac - -# SOB=$(git var GIT_COMMITTER_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') -# git interpret-trailers --in-place --trailer "$SOB" "$COMMIT_MSG_FILE" -# if test -z "$COMMIT_SOURCE" -# then -# /usr/bin/perl -i.bak -pe 'print "\n" if !$first_line++' "$COMMIT_MSG_FILE" -# fi - - -================================================ -File: /.git\hooks\push-to-checkout.sample -================================================ -#!/bin/sh - -# An example hook script to update a checked-out tree on a git push. -# -# This hook is invoked by git-receive-pack(1) when it reacts to git -# push and updates reference(s) in its repository, and when the push -# tries to update the branch that is currently checked out and the -# receive.denyCurrentBranch configuration variable is set to -# updateInstead. -# -# By default, such a push is refused if the working tree and the index -# of the remote repository has any difference from the currently -# checked out commit; when both the working tree and the index match -# the current commit, they are updated to match the newly pushed tip -# of the branch. This hook is to be used to override the default -# behaviour; however the code below reimplements the default behaviour -# as a starting point for convenient modification. -# -# The hook receives the commit with which the tip of the current -# branch is going to be updated: -commit=$1 - -# It can exit with a non-zero status to refuse the push (when it does -# so, it must not modify the index or the working tree). -die () { - echo >&2 "$*" - exit 1 -} - -# Or it can make any necessary changes to the working tree and to the -# index to bring them to the desired state when the tip of the current -# branch is updated to the new commit, and exit with a zero status. 
-# -# For example, the hook can simply run git read-tree -u -m HEAD "$1" -# in order to emulate git fetch that is run in the reverse direction -# with git push, as the two-tree form of git read-tree -u -m is -# essentially the same as git switch or git checkout that switches -# branches while keeping the local changes in the working tree that do -# not interfere with the difference between the branches. - -# The below is a more-or-less exact translation to shell of the C code -# for the default behaviour for git's push-to-checkout hook defined in -# the push_to_deploy() function in builtin/receive-pack.c. -# -# Note that the hook will be executed from the repository directory, -# not from the working tree, so if you want to perform operations on -# the working tree, you will have to adapt your code accordingly, e.g. -# by adding "cd .." or using relative paths. - -if ! git update-index -q --ignore-submodules --refresh -then - die "Up-to-date check failed" -fi - -if ! git diff-files --quiet --ignore-submodules -- -then - die "Working directory has unstaged changes" -fi - -# This is a rough translation of: -# -# head_has_history() ? "HEAD" : EMPTY_TREE_SHA1_HEX -if git cat-file -e HEAD 2>/dev/null -then - head=HEAD -else - head=$(git hash-object -t tree --stdin &2 - exit 1 -} - -unset GIT_DIR GIT_WORK_TREE -cd "$worktree" && - -if grep -q "^diff --git " "$1" -then - validate_patch "$1" -else - validate_cover_letter "$1" -fi && - -if test "$GIT_SENDEMAIL_FILE_COUNTER" = "$GIT_SENDEMAIL_FILE_TOTAL" -then - git config --unset-all sendemail.validateWorktree && - trap 'git worktree remove -ff "$worktree"' EXIT && - validate_series -fi - - -================================================ -File: /.git\hooks\update.sample -================================================ -#!/bin/sh -# -# An example hook script to block unannotated tags from entering. -# Called by "git receive-pack" with arguments: refname sha1-old sha1-new -# -# To enable this hook, rename this file to "update". -# -# Config -# ------ -# hooks.allowunannotated -# This boolean sets whether unannotated tags will be allowed into the -# repository. By default they won't be. -# hooks.allowdeletetag -# This boolean sets whether deleting tags will be allowed in the -# repository. By default they won't be. -# hooks.allowmodifytag -# This boolean sets whether a tag may be modified after creation. By default -# it won't be. -# hooks.allowdeletebranch -# This boolean sets whether deleting branches will be allowed in the -# repository. By default they won't be. -# hooks.denycreatebranch -# This boolean sets whether remotely creating branches will be denied -# in the repository. By default this is allowed. -# - -# --- Command line -refname="$1" -oldrev="$2" -newrev="$3" - -# --- Safety check -if [ -z "$GIT_DIR" ]; then - echo "Don't run this script from the command line." 
>&2
-	echo " (if you want, you could supply GIT_DIR then run" >&2
-	echo "  $0 <ref> <oldrev> <newrev>)" >&2
-	exit 1
-fi
-
-if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then
-	echo "usage: $0 <ref> <oldrev> <newrev>" >&2
-	exit 1
-fi
-
-# --- Config
-allowunannotated=$(git config --type=bool hooks.allowunannotated)
-allowdeletebranch=$(git config --type=bool hooks.allowdeletebranch)
-denycreatebranch=$(git config --type=bool hooks.denycreatebranch)
-allowdeletetag=$(git config --type=bool hooks.allowdeletetag)
-allowmodifytag=$(git config --type=bool hooks.allowmodifytag)
-
-# check for no description
-projectdesc=$(sed -e '1q' "$GIT_DIR/description")
-case "$projectdesc" in
-"Unnamed repository"* | "")
-	echo "*** Project description file hasn't been set" >&2
-	exit 1
-	;;
-esac
-
-# --- Check types
-# if $newrev is 0000...0000, it's a commit to delete a ref.
-zero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')
-if [ "$newrev" = "$zero" ]; then
-	newrev_type=delete
-else
-	newrev_type=$(git cat-file -t $newrev)
-fi
-
-case "$refname","$newrev_type" in
-	refs/tags/*,commit)
-		# un-annotated tag
-		short_refname=${refname##refs/tags/}
-		if [ "$allowunannotated" != "true" ]; then
-			echo "*** The un-annotated tag, $short_refname, is not allowed in this repository" >&2
-			echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2
-			exit 1
-		fi
-		;;
-	refs/tags/*,delete)
-		# delete tag
-		if [ "$allowdeletetag" != "true" ]; then
-			echo "*** Deleting a tag is not allowed in this repository" >&2
-			exit 1
-		fi
-		;;
-	refs/tags/*,tag)
-		# annotated tag
-		if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1
-		then
-			echo "*** Tag '$refname' already exists." >&2
-			echo "*** Modifying a tag is not allowed in this repository." >&2
-			exit 1
-		fi
-		;;
-	refs/heads/*,commit)
-		# branch
-		if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then
-			echo "*** Creating a branch is not allowed in this repository" >&2
-			exit 1
-		fi
-		;;
-	refs/heads/*,delete)
-		# delete branch
-		if [ "$allowdeletebranch" != "true" ]; then
-			echo "*** Deleting a branch is not allowed in this repository" >&2
-			exit 1
-		fi
-		;;
-	refs/remotes/*,commit)
-		# tracking branch
-		;;
-	refs/remotes/*,delete)
-		# delete tracking branch
-		if [ "$allowdeletebranch" != "true" ]; then
-			echo "*** Deleting a tracking branch is not allowed in this repository" >&2
-			exit 1
-		fi
-		;;
-	*)
-		# Anything else (is there anything else?)
-		echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2
-		exit 1
-		;;
-esac
-
-# --- Finished
-exit 0
-
-
-================================================
-File: /.git\info\exclude
-================================================
-# git ls-files --others --exclude-from=.git/info/exclude
-# Lines that start with '#' are comments.
-# For a project mostly in C, the following would be a good set of -# exclude patterns (uncomment them if you want to use them): -# *.[oa] -# *~ - - -================================================ -File: /.git\logs\HEAD -================================================ -0000000000000000000000000000000000000000 38f59ba682df7942661d0907e68d38eab52e2d74 Dai Hung PHAM 1735292676 +0100 clone: from https://github.com/cyclotruc/gitingest.git - - -================================================ -File: /.git\logs\refs\heads\main -================================================ -0000000000000000000000000000000000000000 38f59ba682df7942661d0907e68d38eab52e2d74 Dai Hung PHAM 1735292676 +0100 clone: from https://github.com/cyclotruc/gitingest.git - - -================================================ -File: /.git\logs\refs\remotes\origin\HEAD -================================================ -0000000000000000000000000000000000000000 38f59ba682df7942661d0907e68d38eab52e2d74 Dai Hung PHAM 1735292676 +0100 clone: from https://github.com/cyclotruc/gitingest.git - - -================================================ -File: /.git\packed-refs -================================================ -# pack-refs with: peeled fully-peeled sorted -38f59ba682df7942661d0907e68d38eab52e2d74 refs/remotes/origin/main - - -================================================ -File: /.git\refs\heads\main -================================================ -38f59ba682df7942661d0907e68d38eab52e2d74 - - -================================================ -File: /.git\refs\remotes\origin\HEAD -================================================ -ref: refs/remotes/origin/main - - -================================================ -File: /.github\workflows\unitest.yml -================================================ -name: Unit Tests - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - -jobs: - test: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest pytest-asyncio - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - pip install -e . - - - name: Run tests - run: | - pytest - -================================================ -File: /.gitignore -================================================ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -tmp/* - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ -.vscode/settings.json -.DS_Store - -# Project specific -history.txt -cleanup.py -Caddyfile - -# ignore default output directory -tmp/* - - -================================================ -File: /CODE_OF_CONDUCT.md -================================================ -# Contributor Covenant Code of Conduct - -## Our Pledge - -We as members, contributors, and leaders pledge to make participation in our -community a harassment-free experience for everyone, regardless of age, body -size, visible or invisible disability, ethnicity, sex characteristics, gender -identity and expression, level of experience, education, socio-economic status, -nationality, personal appearance, race, religion, or sexual identity -and orientation. 
- -We pledge to act and interact in ways that contribute to an open, welcoming, -diverse, inclusive, and healthy community. - -## Our Standards - -Examples of behavior that contributes to a positive environment for our -community include: - -* Demonstrating empathy and kindness toward other people -* Being respectful of differing opinions, viewpoints, and experiences -* Giving and gracefully accepting constructive feedback -* Accepting responsibility and apologizing to those affected by our mistakes, - and learning from the experience -* Focusing on what is best not just for us as individuals, but for the - overall community - -Examples of unacceptable behavior include: - -* The use of sexualized language or imagery, and sexual attention or - advances of any kind -* Trolling, insulting or derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or email - address, without their explicit permission -* Other conduct which could reasonably be considered inappropriate in a - professional setting - -## Enforcement Responsibilities - -Community leaders are responsible for clarifying and enforcing our standards of -acceptable behavior and will take appropriate and fair corrective action in -response to any behavior that they deem inappropriate, threatening, offensive, -or harmful. - -Community leaders have the right and responsibility to remove, edit, or reject -comments, commits, code, wiki edits, issues, and other contributions that are -not aligned to this Code of Conduct, and will communicate reasons for moderation -decisions when appropriate. - -## Scope - -This Code of Conduct applies within all community spaces, and also applies when -an individual is officially representing the community in public spaces. -Examples of representing our community include using an official e-mail address, -posting via an official social media account, or acting as an appointed -representative at an online or offline event. - -## Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported to the community leaders responsible for enforcement at -romain@coderamp.io. -All complaints will be reviewed and investigated promptly and fairly. - -All community leaders are obligated to respect the privacy and security of the -reporter of any incident. - -## Enforcement Guidelines - -Community leaders will follow these Community Impact Guidelines in determining -the consequences for any action they deem in violation of this Code of Conduct: - -### 1. Correction - -**Community Impact**: Use of inappropriate language or other behavior deemed -unprofessional or unwelcome in the community. - -**Consequence**: A private, written warning from community leaders, providing -clarity around the nature of the violation and an explanation of why the -behavior was inappropriate. A public apology may be requested. - -### 2. Warning - -**Community Impact**: A violation through a single incident or series -of actions. - -**Consequence**: A warning with consequences for continued behavior. No -interaction with the people involved, including unsolicited interaction with -those enforcing the Code of Conduct, for a specified period of time. This -includes avoiding interactions in community spaces as well as external channels -like social media. Violating these terms may lead to a temporary or -permanent ban. - -### 3. 
Temporary Ban - -**Community Impact**: A serious violation of community standards, including -sustained inappropriate behavior. - -**Consequence**: A temporary ban from any sort of interaction or public -communication with the community for a specified period of time. No public or -private interaction with the people involved, including unsolicited interaction -with those enforcing the Code of Conduct, is allowed during this period. -Violating these terms may lead to a permanent ban. - -### 4. Permanent Ban - -**Community Impact**: Demonstrating a pattern of violation of community -standards, including sustained inappropriate behavior, harassment of an -individual, or aggression toward or disparagement of classes of individuals. - -**Consequence**: A permanent ban from any sort of public interaction within -the community. - -## Attribution - -This Code of Conduct is adapted from the [Contributor Covenant][homepage], -version 2.0, available at -https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. - -Community Impact Guidelines were inspired by [Mozilla's code of conduct -enforcement ladder](https://github.com/mozilla/diversity). - -[homepage]: https://www.contributor-covenant.org - -For answers to common questions about this code of conduct, see the FAQ at -https://www.contributor-covenant.org/faq. Translations are available at -https://www.contributor-covenant.org/translations. - - -================================================ -File: /Dockerfile -================================================ -# Build stage -FROM python:3.12-slim AS builder - -WORKDIR /build - -# Copy requirements first to leverage Docker cache -COPY requirements.txt . - -# Install build dependencies and Python packages -RUN apt-get update \ - && apt-get install -y --no-install-recommends gcc python3-dev \ - && pip install --no-cache-dir --upgrade pip \ - && pip install --no-cache-dir --timeout 1000 -r requirements.txt \ - && rm -rf /var/lib/apt/lists/* - -# Runtime stage -FROM python:3.12-slim - -# Set Python environment variables -ENV PYTHONUNBUFFERED=1 -ENV PYTHONDONTWRITEBYTECODE=1 - -# Install git -RUN apt-get update \ - && apt-get install -y --no-install-recommends git \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /app - -# Create a non-root user -RUN useradd -m -u 1000 appuser - -COPY --from=builder /usr/local/lib/python3.12/site-packages/ /usr/local/lib/python3.12/site-packages/ -COPY src/ ./ - -# Change ownership of the application files -RUN chown -R appuser:appuser /app - -# Switch to non-root user -USER appuser - -EXPOSE 8000 - -CMD ["python", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] - - -================================================ -File: /LICENSE -================================================ -MIT License - -Copyright (c) 2024 Romain Courtois - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - - -================================================ -File: /pytest.ini -================================================ -[pytest] -pythonpath = src -testpaths = src/gitingest/tests -asyncio_mode = auto - - -python_files = test_*.py -python_classes = Test* -python_functions = test_* - -================================================ -File: /requirements.txt -================================================ -fastapi[standard] -uvicorn -fastapi-analytics -slowapi -tiktoken -pytest -pytest-asyncio -click>=8.0.0 - - -================================================ -File: /SECURITY.md -================================================ -# Security Policy - -## Reporting a Vulnerability - -If you have discovered a vulnerability inside the project, report it privately at romain@coderamp.io. This way the maintainer can work on a proper fix without disclosing the problem to the public before it has been solved. - - -================================================ -File: /setup.py -================================================ -from setuptools import setup, find_packages - -setup( - name="gitingest", - version="0.1.2", - packages=find_packages(where="src"), - package_dir={"": "src"}, - include_package_data=True, - install_requires=[ - "click>=8.0.0", - "tiktoken", - ], - entry_points={ - "console_scripts": [ - "gitingest=gitingest.cli:main", - ], - }, - python_requires=">=3.6", - author="Romain Courtois", - author_email="romain@coderamp.io", - description="CLI tool to analyze and create text dumps of codebases for LLMs", - long_description=open("README.md").read(), - long_description_content_type="text/markdown", - url="https://github.com/cyclotruc/gitingest", - classifiers=[ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3", - ], -) - -================================================ -File: /src\config.py -================================================ -MAX_DISPLAY_SIZE = 300000 -TMP_BASE_PATH = "../tmp" - -EXAMPLE_REPOS = [ - {"name": "Gitingest", "url": "https://github.com/cyclotruc/gitingest"}, - {"name": "FastAPI", "url": "https://github.com/tiangolo/fastapi"}, - {"name": "Flask", "url": "https://github.com/pallets/flask"}, - {"name": "Tldraw", "url": "https://github.com/tldraw/tldraw"}, - {"name": "ApiAnalytics", "url": "https://github.com/tom-draper/api-analytics"}, -] - - -================================================ -File: /src\gitingest\cli.py -================================================ -import os -import pathlib -import click -import sys -from .encoding import setup_encoding - -# Setup encoding first -setup_encoding() - -# Define constants -MAX_FILE_SIZE = 51200 # 50KB by default -DEFAULT_IGNORE_PATTERNS = [] - -def normalize_pattern(pattern: str) -> str: - pattern = pattern.strip() - pattern = pattern.lstrip(os.sep) - if pattern.endswith(os.sep): - pattern += "*" - return pattern - -@click.command() -@click.argument('source', type=str, required=True) -@click.option('--output', '-o', default=None, - 
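-              # Note: None falls back to "digest.txt" in main() below, so the
-              # digest lands in the current working directory by default.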
help='Output file path (default: .txt in current directory)') -@click.option('--max-size', '-s', default=MAX_FILE_SIZE, - help='Maximum file size to process in bytes') -@click.option('--exclude-pattern', '-e', multiple=True, - help='Patterns to exclude') -@click.option('--include-pattern', '-i', multiple=True, - help='Patterns to include') -def main(source, output, max_size, exclude_pattern, include_pattern): - """Analyze a directory and create a text dump of its contents.""" - try: - from gitingest.ingest import ingest - - # Convert paths to absolute with proper encoding - source = str(pathlib.Path(source).resolve()) - - # Handle patterns - exclude_patterns = list(exclude_pattern) - include_patterns = list(set(include_pattern)) - - # Set default output name - if not output: - output = "digest.txt" - output = str(pathlib.Path(output).resolve()) - - # Call ingest with encoding awareness - summary, tree, content = ingest( - source, - max_size, - include_patterns, - exclude_patterns, - output=output - ) - - # Write output with explicit encoding - with open(output, 'w', encoding='utf-8', errors='replace') as f: - if isinstance(summary, bytes): - summary = summary.decode('utf-8', errors='replace') - if isinstance(tree, bytes): - tree = tree.decode('utf-8', errors='replace') - if isinstance(content, bytes): - content = content.decode('utf-8', errors='replace') - - f.write(f"{summary}\n\n{tree}\n\n{content}") - - # Print messages with encoding handling - click.echo(f"Analysis complete! Output written to: {output}") - click.echo("\nSummary:") - click.echo(summary) - - except Exception as e: - click.echo(f"Error: {str(e)}", err=True) - raise click.Abort() - -if __name__ == '__main__': - main() - -================================================ -File: /src\gitingest\clone.py -================================================ -import asyncio -from typing import Tuple - -from gitingest.utils import async_timeout - -CLONE_TIMEOUT = 20 - -async def check_repo_exists(url: str) -> bool: - proc = await asyncio.create_subprocess_exec( - "curl", - "-I", - url, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - if proc.returncode != 0: - return False - # Check if stdout contains "404" status code - stdout_str = stdout.decode() - return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str - -@async_timeout(CLONE_TIMEOUT) -async def clone_repo(query: dict) -> str: - if not await check_repo_exists(query['url']): - raise ValueError("Repository not found, make sure it is public") - - if query['commit']: - proc = await asyncio.create_subprocess_exec( - "git", - "clone", - "--single-branch", - query['url'], - query['local_path'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - - proc = await asyncio.create_subprocess_exec( - "git", - "-C", - query['local_path'], - "checkout", - query['branch'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - elif query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: - proc = await asyncio.create_subprocess_exec( - "git", - "clone", - "--depth=1", - "--single-branch", - "--branch", - query['branch'], - query['url'], - query['local_path'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - else: - proc = await asyncio.create_subprocess_exec( - "git", - "clone", - "--depth=1", - "--single-branch", - query['url'], - 
query['local_path'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - - stdout, stderr = await proc.communicate() - - return stdout, stderr - -================================================ -File: /src\gitingest\encoding.py -================================================ -import sys -import io -import codecs - -def setup_encoding(): - if sys.stdout.encoding != 'utf-8': - sys.stdout = io.TextIOWrapper( - sys.stdout.buffer, - encoding='utf-8', - errors='replace' - ) - if sys.stderr.encoding != 'utf-8': - sys.stderr = io.TextIOWrapper( - sys.stderr.buffer, - encoding='utf-8', - errors='replace' - ) - -================================================ -File: /src\gitingest\ingest.py -================================================ -import asyncio -import shutil -from typing import Union, List -from pathlib import Path -import io -import sys - -# Import other modules from the package -from gitingest.parse_query import parse_query -from gitingest.clone import clone_repo -from gitingest.ingest_from_query import ingest_from_query - -def setup_encoding(): - if sys.stdout.encoding != 'utf-8': - sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') - if sys.stderr.encoding != 'utf-8': - sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace') - -def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, - include_patterns: Union[List[str], str] = None, - exclude_patterns: Union[List[str], str] = None, - output: str = None) -> tuple[str, str, str]: - """ - Analyze and create a text dump of source contents. - - Args: - source: Path to source directory or git URL - max_file_size: Maximum file size to process in bytes - include_patterns: Patterns to include in analysis - exclude_patterns: Patterns to exclude from analysis - output: Output file path - - Returns: - Tuple of (summary, tree, content) - """ - setup_encoding() - query = None - - try: - query = parse_query(source, max_file_size, False, include_patterns, exclude_patterns) - if query['url']: - asyncio.run(clone_repo(query)) - - summary, tree, content = ingest_from_query(query) - - if output: - # Write with explicit UTF-8 encoding - with open(output, "w", encoding='utf-8', errors='replace') as f: - # Ensure all content is properly encoded - tree = tree.encode('utf-8', errors='replace').decode('utf-8') if isinstance(tree, str) else tree - content = content.encode('utf-8', errors='replace').decode('utf-8') if isinstance(content, str) else content - f.write(f"{tree}\n{content}") - - return summary, tree, content - - except UnicodeEncodeError as e: - # Handle encoding errors specifically - error_msg = f"Encoding error while processing {source}: {str(e)}" - raise RuntimeError(error_msg) - - except Exception as e: - # Handle other errors - error_msg = f"Error while processing {source}: {str(e)}" - raise RuntimeError(error_msg) - - finally: - # Clean up the temporary directory if it was created - if query and query.get('url'): - # Get parent directory two levels up from local_path (../tmp) - cleanup_path = str(Path(query['local_path']).parents[1]) - try: - shutil.rmtree(cleanup_path, ignore_errors=True) - except Exception as e: - print(f"Warning: Could not clean up temporary directory: {str(e)}", file=sys.stderr) - -================================================ -File: /src\gitingest\ingest_from_query.py -================================================ -import os -from fnmatch import fnmatch -from typing import Dict, List, Union -import tiktoken - - 
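-# Safety limits for the directory walk: scan_directory() below stops descending
-# or adding files once any of these thresholds is crossed.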
-MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB -MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal -MAX_FILES = 10000 # Maximum number of files to process -MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500MB - - -def should_include(path: str, base_path: str, include_patterns: List[str]) -> bool: - rel_path = path.replace(base_path, "").lstrip(os.sep) - include = False - for pattern in include_patterns: - if fnmatch(rel_path, pattern): - include = True - return include - -def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> bool: - rel_path = path.replace(base_path, "").lstrip(os.sep) - for pattern in ignore_patterns: - if pattern == '': - continue - if fnmatch(rel_path, pattern): - return True - return False - -def is_safe_symlink(symlink_path: str, base_path: str) -> bool: - """Check if a symlink points to a location within the base directory.""" - try: - target_path = os.path.realpath(symlink_path) - base_path = os.path.realpath(base_path) - return os.path.commonpath([target_path, base_path]) == base_path - except (OSError, ValueError): - # If there's any error resolving the paths, consider it unsafe - return False - -def is_text_file(file_path: str) -> bool: - """Determines if a file is likely a text file based on its content.""" - try: - with open(file_path, 'rb') as file: - chunk = file.read(1024) - return not bool(chunk.translate(None, bytes([7, 8, 9, 10, 12, 13, 27] + list(range(0x20, 0x100))))) - except IOError: - return False - -def read_file_content(file_path: str) -> str: - try: - with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: - return f.read() - except Exception as e: - return f"Error reading file: {str(e)}" - -def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = 0, stats: Dict = None) -> Dict: - """Recursively analyzes a directory and its contents with safety limits.""" - if seen_paths is None: - seen_paths = set() - if stats is None: - stats = {"total_files": 0, "total_size": 0} - - if depth > MAX_DIRECTORY_DEPTH: - print(f"Skipping deep directory: {path} (max depth {MAX_DIRECTORY_DEPTH} reached)") - return None - - if stats["total_files"] >= MAX_FILES: - print(f"Skipping further processing: maximum file limit ({MAX_FILES}) reached") - return None - - if stats["total_size"] >= MAX_TOTAL_SIZE_BYTES: - print(f"Skipping further processing: maximum total size ({MAX_TOTAL_SIZE_BYTES/1024/1024:.1f}MB) reached") - return None - - real_path = os.path.realpath(path) - if real_path in seen_paths: - print(f"Skipping already visited path: {path}") - return None - seen_paths.add(real_path) - - result = { - "name": os.path.basename(path), - "type": "directory", - "size": 0, - "children": [], - "file_count": 0, - "dir_count": 0, - "path": path, - "ignore_content": False - } - - ignore_patterns = query['ignore_patterns'] - base_path = query['local_path'] - include_patterns = query['include_patterns'] - - try: - for item in os.listdir(path): - item_path = os.path.join(path, item) - - if should_exclude(item_path, base_path, ignore_patterns): - continue - - is_file = os.path.isfile(item_path) - if is_file and query['include_patterns']: - if not should_include(item_path, base_path, include_patterns): - result["ignore_content"] = True - continue - - # Handle symlinks - if os.path.islink(item_path): - if not is_safe_symlink(item_path, base_path): - print(f"Skipping symlink that points outside base directory: {item_path}") - continue - real_path = os.path.realpath(item_path) - if real_path in seen_paths: - print(f"Skipping 
already visited symlink target: {item_path}")

[... remainder of the digest.txt deletion hunk elided: it removes, verbatim, the generated dumps of src/gitingest/ingest_from_query.py, src/gitingest/parse_query.py, the src/gitingest/tests suite (conftest.py, test_clone.py, test_ingest.py, test_parse_query.py), src/gitingest/utils.py and __init__.py, src/main.py, src/process_query.py, the src/routers package (download.py, dynamic.py, index.py), src/server_utils.py (rate limiter, ANSI colors, and the logarithmic slider-to-size mapping, seeded at position 243 by the routers), the static assets (snow.js, utils.js, robots.txt), and the Jinja templates, i.e. the same content the digest recorded when PATCH 1/6 introduced it ...]
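One number from the elided dump is worth keeping at hand for the sizing changes below: src/server_utils.py (and its JavaScript twin in static/js/utils.js) maps the UI slider onto file sizes with a logarithmic curve, and the routers seed the slider at position 243. A quick check of that curve, with the function reproduced from the digested server_utils.py:

import math

def logSliderToSize(position):
    # As digested from src/server_utils.py; the docstring there says KB,
    # but the trailing * 1024 makes the return value bytes.
    maxp = 500
    minv = math.log(1)
    maxv = math.log(102400)
    return round(math.exp(minv + (maxv - minv) * pow(position / maxp, 1.5))) * 1024

assert logSliderToSize(243) == 50 * 1024       # the routers' default slider position
assert logSliderToSize(500) == 102400 * 1024   # slider maximum: 100MB

Position 243 therefore decodes to exactly the 51200-byte default that PATCH 6/6 below stops hard-coding in cli.py.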
From c281db3a1776eefe3bbccb43cd0dbf20c0a24fae Mon Sep 17 00:00:00 2001
From: Dai Hung PHAM
Date: Sun, 29 Dec 2024 11:42:23 +0100
Subject: [PATCH 5/6] fix(ignore-patterns): resolve .git and nested path
 exclusions with improved .gitignore parsing

---
 src/gitingest/ingest_from_query.py |  24 ++++-
 src/gitingest/parse_query.py       | 139 +++++++++++++++++++++++++++--
 2 files changed, 153 insertions(+), 10 deletions(-)

diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py
index 4e7d5e78..7bd1bc7b 100644
--- a/src/gitingest/ingest_from_query.py
+++ b/src/gitingest/ingest_from_query.py
@@ -19,14 +19,28 @@ def should_include(path: str, base_path: str, include_patterns: List[str]) -> bo
     return include
 
 def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> bool:
-    rel_path = path.replace(base_path, "").lstrip(os.sep)
+    """
+    Check if a file or directory should be ignored.
+
+    Args:
+        path (str): Path to check.
+        base_path (str): Root base path.
+        ignore_patterns (List[str]): List of patterns to ignore.
+
+    Returns:
+        bool: True if the path should be ignored.
+    """
+    rel_path = os.path.relpath(path, base_path).replace("\\", "/")
     for pattern in ignore_patterns:
-        if pattern == '':
-            continue
-        if fnmatch(rel_path, pattern):
+        if fnmatch(rel_path, pattern) or fnmatch(os.path.basename(path), pattern):
+            return True
+        # Special case for directories ending with /
+        if os.path.isdir(path) and fnmatch(rel_path + '/', pattern):
             return True
     return False
 
+
+
 def is_safe_symlink(symlink_path: str, base_path: str) -> bool:
     """Check if a symlink points to a location within the base directory."""
     try:
@@ -96,8 +110,10 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int =
     try:
         for item in os.listdir(path):
             item_path = os.path.join(path, item)
+            print(f"Checking path: {item_path}")
 
             if should_exclude(item_path, base_path, ignore_patterns):
+                print(f"Excluding path: {item_path}")
                 continue
 
             is_file = os.path.isfile(item_path)
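Before moving on to parse_query.py: the should_exclude() rewrite above is the substance of the fix. Matching the os.path.relpath() output and the basename against each pattern lets a bare entry such as __pycache__ or .git hit at any depth, which the old path.replace(base_path, "") prefix-stripping missed on nested paths. A minimal standalone sketch of the new semantics (matches() below is a stripped-down copy of should_exclude() with the os.path.isdir() trailing-slash branch omitted, so it runs without touching the filesystem):

from fnmatch import fnmatch
from typing import List

def matches(rel_path: str, patterns: List[str]) -> bool:
    # Stripped-down copy of the new should_exclude(): match the relative
    # path or its basename; the os.path.isdir() branch is left out.
    basename = rel_path.rsplit("/", 1)[-1]
    return any(fnmatch(rel_path, p) or fnmatch(basename, p) for p in patterns)

patterns = [".git", "__pycache__", "*.pyc"]

assert matches(".git", patterns)                       # top level
assert matches("src/gitingest/__pycache__", patterns)  # nested dir: basename now hits
assert matches("src/gitingest/cli.pyc", patterns)      # fnmatch '*' crosses '/'
assert not matches("src/gitingest/cli.py", patterns)   # real sources survive

The two print() calls added to scan_directory() look like leftover debugging and will be noisy on large trees; worth gating behind a verbose flag before merge.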
diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py
index 8b8f97a8..572e2571 100644
--- a/src/gitingest/parse_query.py
+++ b/src/gitingest/parse_query.py
@@ -93,40 +93,81 @@ def parse_url(url: str) -> dict:
     parsed["subpath"] = "/" + "/".join(path_parts[4:])
     return parsed
 
+### 📝 **Normalize Pattern**
 def normalize_pattern(pattern: str) -> str:
+    """
+    Normalize a pattern by stripping and formatting.
+
+    Args:
+        pattern (str): The ignore pattern.
+
+    Returns:
+        str: Normalized pattern.
+    """
     pattern = pattern.strip()
     pattern = pattern.lstrip(os.sep)
     if pattern.endswith(os.sep):
         pattern += "*"
     return pattern
 
+### 📝 **Parse Patterns**
 def parse_patterns(pattern: Union[List[str], str]) -> List[str]:
+    """
+    Parse and validate patterns.
+
+    Args:
+        pattern (Union[List[str], str]): Patterns to parse.
+
+    Returns:
+        List[str]: Parsed patterns.
+    """
     if isinstance(pattern, list):
         pattern = ",".join(pattern)
 
     for p in pattern.split(","):
         if not all(c.isalnum() or c in "-_./+*" for c in p.strip()):
-            raise ValueError(f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed.")
-    patterns = [normalize_pattern(p) for p in pattern.split(",")]
-    return patterns
+            raise ValueError(
+                f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed."
+            )
+    return [normalize_pattern(p) for p in pattern.split(",")]
 
+### 📝 **Override Ignore Patterns**
 def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[str]) -> List[str]:
+    """
+    Remove include patterns from ignore patterns.
+
+    Args:
+        ignore_patterns (List[str]): Ignore patterns.
+        include_patterns (List[str]): Include patterns.
+
+    Returns:
+        List[str]: Updated ignore patterns.
+    """
     for pattern in include_patterns:
         if pattern in ignore_patterns:
            ignore_patterns.remove(pattern)
     return ignore_patterns
 
+### 📝 **Parse Path**
 def parse_path(path: str) -> dict:
-
-    query = {
+    """
+    Parse a local file path.
+
+    Args:
+        path (str): File path.
+
+    Returns:
+        dict: Parsed path details.
+    """
+    return {
         "local_path": os.path.abspath(path),
         "slug": os.path.basename(os.path.dirname(path)) + "/" + os.path.basename(path),
         "subpath": "/",
         "id": str(uuid.uuid4()),
         "url": None,
     }
-    return query
+
 
 def parse_query(source: str, max_file_size: int, from_web: bool, include_patterns: Union[List[str], str] = None, ignore_patterns: Union[List[str], str] = None) -> dict:
     if from_web:
@@ -154,3 +195,89 @@ def parse_query(source: str, max_file_size: int, from_web: bool, include_pattern
 
     return query
 
+### 📝 **Parse .gitignore**
+def parse_gitignore(gitignore_path: str) -> List[str]:
+    """
+    Parse .gitignore and return ignore patterns.
+
+    Args:
+        gitignore_path (str): Path to the .gitignore file.
+
+    Returns:
+        List[str]: List of ignore patterns.
+    """
+    ignore_patterns = []
+    if os.path.exists(gitignore_path):
+        with open(gitignore_path, 'r', encoding='utf-8') as file:
+            for line in file:
+                line = line.strip()
+                if line and not line.startswith('#'):
+                    # Ensure directory patterns end with '/'
+                    if os.path.isdir(os.path.join(os.path.dirname(gitignore_path), line)):
+                        line = line.rstrip('/') + '/'
+                    ignore_patterns.append(line)
+    return ignore_patterns
+
+
+### 📝 **Parse Query**
+def parse_query(source: str, max_file_size: int, from_web: bool,
+                include_patterns: Union[List[str], str] = None,
+                ignore_patterns: Union[List[str], str] = None) -> dict:
+    """
+    Parse the query and apply ignore patterns.
+
+    Args:
+        source (str): Source path or URL.
+        max_file_size (int): Maximum file size.
+        from_web (bool): Web source or local.
+        include_patterns (Union[List[str], str]): Include patterns.
+        ignore_patterns (Union[List[str], str]): Ignore patterns.
+
+    Returns:
+        dict: Query object with patterns.
+    """
+    if from_web:
+        query = parse_url(source)
+    else:
+        query = parse_path(source)
+
+    query['max_file_size'] = max_file_size
+
+    # Start with default ignore patterns
+    final_ignore_patterns = DEFAULT_IGNORE_PATTERNS.copy()
+
+    # Load from .gitignore
+    gitignore_path = os.path.join(query['local_path'], '.gitignore')
+    print(f"Looking for .gitignore at: {gitignore_path}")
+
+    if os.path.exists(gitignore_path):
+        gitignore_patterns = parse_gitignore(gitignore_path)
+        final_ignore_patterns.extend(gitignore_patterns)
+        print(f"\n🛡️ Patterns from: {gitignore_path}")
+        for pattern in gitignore_patterns:
+            print(f"  - {pattern}")
+
+    # Add user-defined ignore patterns
+    if ignore_patterns:
+        final_ignore_patterns.extend(parse_patterns(ignore_patterns))
+
+    # Handle include patterns
+    if include_patterns:
+        include_patterns = parse_patterns(include_patterns)
+        final_ignore_patterns = override_ignore_patterns(final_ignore_patterns, include_patterns)
+
+    query['ignore_patterns'] = final_ignore_patterns
+    query['include_patterns'] = include_patterns
+
+    # 🖨️ Print patterns to the console
+    print("\n🛡️ Applied Ignore Patterns:")
+    for pattern in final_ignore_patterns:
+        print(f"  - {pattern}")
+
+    if include_patterns:
+        print("\n✅ Included Patterns:")
+        for pattern in include_patterns:
+            print(f"  - {pattern}")
+    else:
+        print("\n✅ Included Patterns: None")
+
+    return query
\ No newline at end of file
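Still on PATCH 5/6: parse_gitignore() keeps every non-comment .gitignore line as a raw fnmatch pattern, so gitignore-specific syntax ('!' negation, leading-'/' anchoring, '**' globs) passes through without its usual meaning. A self-contained check, with the function body copied from the hunk above and a temporary directory as scaffolding:

import os
import tempfile
from typing import List

def parse_gitignore(gitignore_path: str) -> List[str]:
    # Copied from the hunk above.
    ignore_patterns = []
    if os.path.exists(gitignore_path):
        with open(gitignore_path, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if line and not line.startswith('#'):
                    if os.path.isdir(os.path.join(os.path.dirname(gitignore_path), line)):
                        line = line.rstrip('/') + '/'
                    ignore_patterns.append(line)
    return ignore_patterns

with tempfile.TemporaryDirectory() as tmp:
    os.mkdir(os.path.join(tmp, "build"))
    with open(os.path.join(tmp, ".gitignore"), "w", encoding="utf-8") as f:
        f.write("# artifacts\n*.pyc\nbuild\n!important.pyc\n")
    print(parse_gitignore(os.path.join(tmp, ".gitignore")))
    # -> ['*.pyc', 'build/', '!important.pyc']
    # 'build' gains the trailing slash, but the '!' line survives verbatim:
    # fnmatch treats it as a literal pattern, not a negation.

If full .gitignore semantics matter, a dedicated parser such as pathspec's gitwildmatch would be the safer route. Also worth flagging: the hunk appends a second parse_query() definition instead of editing the first, so the earlier definition becomes dead code (the later def wins at import time); merging the two would avoid confusion.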
+ """ + if from_web: + query = parse_url(source) + else: + query = parse_path(source) + + query['max_file_size'] = max_file_size + + # Start with default ignore patterns + final_ignore_patterns = DEFAULT_IGNORE_PATTERNS.copy() + + # Load from .gitignore + gitignore_path = os.path.join(query['local_path'], '.gitignore') + print(f"find .gitignore on project --> {gitignore_path}") + + if os.path.exists(gitignore_path): + gitignore_patterns = parse_gitignore(gitignore_path) + final_ignore_patterns.extend(gitignore_patterns) + print(f"\n🛡️ Patterns from: {gitignore_path}") + for pattern in gitignore_patterns: + print(f" - {pattern}") + # Add user-defined ignore patterns + if ignore_patterns: + final_ignore_patterns.extend(parse_patterns(ignore_patterns)) + + # Handle include patterns + if include_patterns: + include_patterns = parse_patterns(include_patterns) + final_ignore_patterns = override_ignore_patterns(final_ignore_patterns, include_patterns) + + query['ignore_patterns'] = final_ignore_patterns + query['include_patterns'] = include_patterns + # 🖨️ Print patterns to the console + print("\n🛡️ Applied Ignore Patterns:") + for pattern in final_ignore_patterns: + print(f" - {pattern}") + + if include_patterns: + print("\n✅ Included Patterns:") + for pattern in include_patterns: + print(f" - {pattern}") + else: + print("\n✅ Included Patterns: None") + + return query + return query \ No newline at end of file From 233459bddebba1fd06c4300d5538002be482c00a Mon Sep 17 00:00:00 2001 From: Dai Hung PHAM Date: Sun, 29 Dec 2024 11:52:02 +0100 Subject: [PATCH 6/6] from gitingest.ingest_from_query import MAX_FILE_SIZE --- src/gitingest/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 0886c638..6db5602a 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -2,13 +2,14 @@ import pathlib import click import sys + +from gitingest.ingest_from_query import MAX_FILE_SIZE from .encoding import setup_encoding # Setup encoding first setup_encoding() # Define constants -MAX_FILE_SIZE = 51200 # 50KB by default DEFAULT_IGNORE_PATTERNS = [] def normalize_pattern(pattern: str) -> str: