From d9ac0ea046e68400c2a6bd83cd273f5816860d28 Mon Sep 17 00:00:00 2001
From: Josh Johanning
Date: Thu, 29 Jan 2026 10:58:37 -0600
Subject: [PATCH 1/4] feat: add script to find large files in multiple repositories

---
 git/find-large-files-in-repositories.sh | 82 +++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100755 git/find-large-files-in-repositories.sh

diff --git a/git/find-large-files-in-repositories.sh b/git/find-large-files-in-repositories.sh
new file mode 100755
index 0000000..dc03eff
--- /dev/null
+++ b/git/find-large-files-in-repositories.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# Finds files over a specified size (default: 100MB) across multiple git repositories
+# Reads repository URLs from a text file (one URL per line)
+#
+# Prerequisites:
+# - git must be installed
+# - For macOS: brew install coreutils (for numfmt/gnumfmt)
+#
+# Usage:
+#   ./find-large-files-in-repositories.sh <repos-file> [size-in-mb]
+#
+# Example:
+#   ./find-large-files-in-repositories.sh repos.txt 100
+#
+# repos.txt format (one repository URL per line):
+#   https://github.com/owner/repo1.git
+#   https://github.com/owner/repo2.git
+#   git@github.com:owner/repo3.git
+
+if [ -z "$1" ]; then
+  echo "Usage: $0 <repos-file> [size-in-mb]"
+  echo "  repos-file: Path to a text file containing repository URLs (one per line)"
+  echo "  size-in-mb: Minimum file size in MB to report (default: 100)"
+  exit 1
+fi
+
+REPOS_FILE="$1"
+SIZE_MB="${2:-100}"
+SIZE_BYTES=$((SIZE_MB * 1048576))
+
+if [ ! -f "$REPOS_FILE" ]; then
+  echo "Error: File '$REPOS_FILE' not found"
+  exit 1
+fi
+
+# Create a temporary directory for clones
+TEMP_DIR=$(mktemp -d)
+trap "rm -rf $TEMP_DIR" EXIT
+
+echo "Finding files >= ${SIZE_MB}MB in repositories listed in $REPOS_FILE"
+echo "============================================================"
+echo ""
+
+while IFS= read -r repo_url || [ -n "$repo_url" ]; do
+  # Skip empty lines and comments
+  [[ -z "$repo_url" || "$repo_url" =~ ^# ]] && continue
+
+  repo_name=$(basename "$repo_url" .git)
+  echo "=== Checking: $repo_name ==="
+  echo "  URL: $repo_url"
+
+  clone_path="$TEMP_DIR/$repo_name.git"
+
+  if ! git clone --bare "$repo_url" "$clone_path" 2>/dev/null; then
+    echo "  Error: Failed to clone repository"
+    echo ""
+    continue
+  fi
+
+  cd "$clone_path" || continue
+
+  # Find files over the specified size
+  large_files=$(git rev-list --objects --all 2>/dev/null | \
+    git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' 2>/dev/null | \
+    awk -v size="$SIZE_BYTES" '/^blob/ && $3 >= size {printf "  %.2fMB  %s\n", $3/1048576, $4}' | \
+    sort -rn)
+
+  if [ -n "$large_files" ]; then
+    echo "$large_files"
+  else
+    echo "  No files >= ${SIZE_MB}MB found"
+  fi
+
+  cd - > /dev/null || exit
+  rm -rf "$clone_path"
+
+  echo ""
+done < "$REPOS_FILE"
+
+echo "============================================================"
+echo "Scan complete"

From 723a7bb418200476df709befbfb4c25246ccd525 Mon Sep 17 00:00:00 2001
From: Josh Johanning
Date: Thu, 29 Jan 2026 10:58:54 -0600
Subject: [PATCH 2/4] refactor: rename script that finds large files in a single repository

---
 git/README.md                                         | 12 ++++++++++--
 ...ge-files.sh => find-large-files-in-repository.sh}  |  0
 2 files changed, 10 insertions(+), 2 deletions(-)
 rename git/{find-large-files.sh => find-large-files-in-repository.sh} (100%)

diff --git a/git/README.md b/git/README.md
index 321a930..fb15adb 100644
--- a/git/README.md
+++ b/git/README.md
@@ -32,9 +32,17 @@ Perform a diff (like in `git log -p`), but only for a single file
 
 Extracts the authors of all commits in a repo and the number of commits they made
 
-## find-large-files.sh
+## find-large-files-in-repositories.sh
 
-Find large files in a git repo
+Find files over a specified size (default: 100MB) across multiple git repositories from a list of URLs
+
+```bash
+./find-large-files-in-repositories.sh repos.txt 100
+```
+
+## find-large-files-in-repository.sh
+
+Find large files in a git repository
 
 ## force-push-tag.sh
 
diff --git a/git/find-large-files.sh b/git/find-large-files-in-repository.sh
similarity index 100%
rename from git/find-large-files.sh
rename to git/find-large-files-in-repository.sh

From 7dae673ee00c8fa96fe2e50221b16c79414dbeca Mon Sep 17 00:00:00 2001
From: Josh Johanning
Date: Thu, 29 Jan 2026 11:15:43 -0600
Subject: [PATCH 3/4] fix: improve error handling and validation in find-large-files script

---
 git/find-large-files-in-repositories.sh | 44 +++++++++++++++++--------
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/git/find-large-files-in-repositories.sh b/git/find-large-files-in-repositories.sh
index dc03eff..cc7d602 100755
--- a/git/find-large-files-in-repositories.sh
+++ b/git/find-large-files-in-repositories.sh
@@ -5,7 +5,7 @@
 #
 # Prerequisites:
 # - git must be installed
-# - For macOS: brew install coreutils (for numfmt/gnumfmt)
+# - Standard Unix tools: awk, sort, mktemp, basename
 #
 # Usage:
 #   ./find-large-files-in-repositories.sh <repos-file> [size-in-mb]
@@ -27,6 +27,13 @@ fi
 
 REPOS_FILE="$1"
 SIZE_MB="${2:-100}"
+
+# Validate SIZE_MB is a positive integer
+if ! [[ "$SIZE_MB" =~ ^[1-9][0-9]*$ ]]; then
+  echo "Error: size-in-mb must be a positive integer (received: '$SIZE_MB')"
+  exit 1
+fi
+
 SIZE_BYTES=$((SIZE_MB * 1048576))
 
 if [ ! -f "$REPOS_FILE" ]; then
@@ -36,7 +43,7 @@ fi
 
 # Create a temporary directory for clones
 TEMP_DIR=$(mktemp -d)
-trap "rm -rf $TEMP_DIR" EXIT
+trap 'if [ -n "${TEMP_DIR:-}" ]; then rm -rf -- "$TEMP_DIR"; fi' EXIT
 
 echo "Finding files >= ${SIZE_MB}MB in repositories listed in $REPOS_FILE"
 echo "============================================================"
@@ -52,8 +59,10 @@ while IFS= read -r repo_url || [ -n "$repo_url" ]; do
 
   clone_path="$TEMP_DIR/$repo_name.git"
 
-  if ! git clone --bare "$repo_url" "$clone_path" 2>/dev/null; then
+  if ! clone_output=$(git clone --bare "$repo_url" "$clone_path" 2>&1); then
     echo "  Error: Failed to clone repository"
+    echo "  git clone output:"
+    echo "$clone_output" | sed 's/^/    /'
     echo ""
     continue
   fi
@@ -61,16 +70,25 @@ while IFS= read -r repo_url || [ -n "$repo_url" ]; do
   cd "$clone_path" || continue
 
   # Find files over the specified size
-  large_files=$(git rev-list --objects --all 2>/dev/null | \
-    git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' 2>/dev/null | \
-    awk -v size="$SIZE_BYTES" '/^blob/ && $3 >= size {printf "  %.2fMB  %s\n", $3/1048576, $4}' | \
-    sort -rn)
-
-  if [ -n "$large_files" ]; then
-    echo "$large_files"
-  else
-    echo "  No files >= ${SIZE_MB}MB found"
-  fi
+  # Use tab delimiter to preserve filenames with spaces
+  git rev-list --objects --all 2>/dev/null | \
+    git cat-file --batch-check=$'%(objecttype) %(objectname) %(objectsize)\t%(rest)' 2>/dev/null | \
+    awk -v size="$SIZE_BYTES" -F '\t' '{
+      split($1, meta, " ")
+      if (meta[1] == "blob" && meta[3] >= size) {
+        printf "  %.2fMB  %s\n", meta[3]/1048576, $2
+      }
+    }' | \
+    sort -rn | {
+      found=0
+      while IFS= read -r line; do
+        found=1
+        echo "$line"
+      done
+      if [ "$found" -eq 0 ]; then
+        echo "  No files >= ${SIZE_MB}MB found"
+      fi
+    }
 
   cd - > /dev/null || exit
   rm -rf "$clone_path"

-f "$REPOS_FILE" ]; then @@ -36,7 +43,7 @@ fi # Create a temporary directory for clones TEMP_DIR=$(mktemp -d) -trap "rm -rf $TEMP_DIR" EXIT +trap 'if [ -n "${TEMP_DIR:-}" ]; then rm -rf -- "$TEMP_DIR"; fi' EXIT echo "Finding files >= ${SIZE_MB}MB in repositories listed in $REPOS_FILE" echo "============================================================" @@ -52,8 +59,10 @@ while IFS= read -r repo_url || [ -n "$repo_url" ]; do clone_path="$TEMP_DIR/$repo_name.git" - if ! git clone --bare "$repo_url" "$clone_path" 2>/dev/null; then + if ! clone_output=$(git clone --bare "$repo_url" "$clone_path" 2>&1); then echo " Error: Failed to clone repository" + echo " git clone output:" + echo "$clone_output" | sed 's/^/ /' echo "" continue fi @@ -61,16 +70,25 @@ while IFS= read -r repo_url || [ -n "$repo_url" ]; do cd "$clone_path" || continue # Find files over the specified size - large_files=$(git rev-list --objects --all 2>/dev/null | \ - git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' 2>/dev/null | \ - awk -v size="$SIZE_BYTES" '/^blob/ && $3 >= size {printf " %.2fMB %s\n", $3/1048576, $4}' | \ - sort -rn) - - if [ -n "$large_files" ]; then - echo "$large_files" - else - echo " No files >= ${SIZE_MB}MB found" - fi + # Use tab delimiter to preserve filenames with spaces + git rev-list --objects --all 2>/dev/null | \ + git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' 2>/dev/null | \ + awk -v size="$SIZE_BYTES" -F '\t' '{ + split($1, meta, " ") + if (meta[1] == "blob" && meta[3] >= size) { + printf " %.2fMB %s\n", meta[3]/1048576, $2 + } + }' | \ + sort -rn | { + found=0 + while IFS= read -r line; do + found=1 + echo "$line" + done + if [ "$found" -eq 0 ]; then + echo " No files >= ${SIZE_MB}MB found" + fi + } cd - > /dev/null || exit rm -rf "$clone_path" From af0c92bd7827a48b08b4ade3de12a7f035705e03 Mon Sep 17 00:00:00 2001 From: Josh Johanning Date: Thu, 29 Jan 2026 11:22:23 -0600 Subject: [PATCH 4/4] fix: avoid repository name collisions by using a hash in clone path --- git/find-large-files-in-repositories.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/git/find-large-files-in-repositories.sh b/git/find-large-files-in-repositories.sh index cc7d602..f51f500 100755 --- a/git/find-large-files-in-repositories.sh +++ b/git/find-large-files-in-repositories.sh @@ -57,12 +57,15 @@ while IFS= read -r repo_url || [ -n "$repo_url" ]; do echo "=== Checking: $repo_name ===" echo " URL: $repo_url" - clone_path="$TEMP_DIR/$repo_name.git" + # Use hash of URL to avoid collisions when repos have the same name + repo_hash=$(printf '%s' "$repo_url" | md5 -q 2>/dev/null || printf '%s' "$repo_url" | md5sum | awk '{print $1}') + clone_path="$TEMP_DIR/${repo_name}-${repo_hash}.git" if ! clone_output=$(git clone --bare "$repo_url" "$clone_path" 2>&1); then echo " Error: Failed to clone repository" echo " git clone output:" echo "$clone_output" | sed 's/^/ /' + rm -rf "$clone_path" echo "" continue fi