From d9ac0ea046e68400c2a6bd83cd273f5816860d28 Mon Sep 17 00:00:00 2001
From: Josh Johanning
Date: Thu, 29 Jan 2026 10:58:37 -0600
Subject: [PATCH 1/4] feat: add script to find large files in multiple repositories

---
 git/find-large-files-in-repositories.sh | 82 +++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100755 git/find-large-files-in-repositories.sh

diff --git a/git/find-large-files-in-repositories.sh b/git/find-large-files-in-repositories.sh
new file mode 100755
index 0000000..dc03eff
--- /dev/null
+++ b/git/find-large-files-in-repositories.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# Finds files over a specified size (default: 100MB) across multiple git repositories
+# Reads repository URLs from a text file (one URL per line)
+#
+# Prerequisites:
+# - git must be installed
+# - For macOS: brew install coreutils (for numfmt/gnumfmt)
+#
+# Usage:
+#   ./find-large-files-in-repositories.sh <repos-file> [size-in-mb]
+#
+# Example:
+#   ./find-large-files-in-repositories.sh repos.txt 100
+#
+# repos.txt format (one repository URL per line):
+#   https://github.com/owner/repo1.git
+#   https://github.com/owner/repo2.git
+#   git@github.com:owner/repo3.git
+
+if [ -z "$1" ]; then
+  echo "Usage: $0 <repos-file> [size-in-mb]"
+  echo "  repos-file: Path to a text file containing repository URLs (one per line)"
+  echo "  size-in-mb: Minimum file size in MB to report (default: 100)"
+  exit 1
+fi
+
+REPOS_FILE="$1"
+SIZE_MB="${2:-100}"
+SIZE_BYTES=$((SIZE_MB * 1048576))
+
+if [ ! -f "$REPOS_FILE" ]; then
+  echo "Error: File '$REPOS_FILE' not found"
+  exit 1
+fi
+
+# Create a temporary directory for clones
+TEMP_DIR=$(mktemp -d)
+trap "rm -rf $TEMP_DIR" EXIT
+
+echo "Finding files >= ${SIZE_MB}MB in repositories listed in $REPOS_FILE"
+echo "============================================================"
+echo ""
+
+while IFS= read -r repo_url || [ -n "$repo_url" ]; do
+  # Skip empty lines and comments
+  [[ -z "$repo_url" || "$repo_url" =~ ^# ]] && continue
+
+  repo_name=$(basename "$repo_url" .git)
+  echo "=== Checking: $repo_name ==="
+  echo "  URL: $repo_url"
+
+  clone_path="$TEMP_DIR/$repo_name.git"
+
+  if ! git clone --bare "$repo_url" "$clone_path" 2>/dev/null; then
+    echo "  Error: Failed to clone repository"
+    echo ""
+    continue
+  fi
+
+  cd "$clone_path" || continue
+
+  # Find files over the specified size
+  large_files=$(git rev-list --objects --all 2>/dev/null | \
+    git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' 2>/dev/null | \
+    awk -v size="$SIZE_BYTES" '/^blob/ && $3 >= size {printf "  %.2fMB  %s\n", $3/1048576, $4}' | \
+    sort -rn)
+
+  if [ -n "$large_files" ]; then
+    echo "$large_files"
+  else
+    echo "  No files >= ${SIZE_MB}MB found"
+  fi
+
+  cd - > /dev/null || exit
+  rm -rf "$clone_path"
+
+  echo ""
+done < "$REPOS_FILE"
+
+echo "============================================================"
+echo "Scan complete"

From 723a7bb418200476df709befbfb4c25246ccd525 Mon Sep 17 00:00:00 2001
From: Josh Johanning
Date: Thu, 29 Jan 2026 10:58:54 -0600
Subject: [PATCH 2/4] refactor: rename script that finds large files in a single repository

---
 git/README.md                                         | 12 ++++++++++--
 ...ge-files.sh => find-large-files-in-repository.sh}  |  0
 2 files changed, 10 insertions(+), 2 deletions(-)
 rename git/{find-large-files.sh => find-large-files-in-repository.sh} (100%)

diff --git a/git/README.md b/git/README.md
index 321a930..fb15adb 100644
--- a/git/README.md
+++ b/git/README.md
@@ -32,9 +32,17 @@ Perform a diff (like in `git log -p`), but only for a single file
 
 Extracts the authors of all commits in a repo and the number of commits they made
 
-## find-large-files.sh
+## find-large-files-in-repositories.sh
 
-Find large files in a git repo
+Find files over a specified size (default: 100MB) across multiple git repositories from a list of URLs
+
+```bash
+./find-large-files-in-repositories.sh repos.txt 100
+```
+
+## find-large-files-in-repository.sh
+
+Find large files in a git repository
 
 ## force-push-tag.sh
 
diff --git a/git/find-large-files.sh b/git/find-large-files-in-repository.sh
similarity index 100%
rename from git/find-large-files.sh
rename to git/find-large-files-in-repository.sh

From 7dae673ee00c8fa96fe2e50221b16c79414dbeca Mon Sep 17 00:00:00 2001
From: Josh Johanning
Date: Thu, 29 Jan 2026 11:15:43 -0600
Subject: [PATCH 3/4] fix: improve error handling and validation in find-large-files script

---
 git/find-large-files-in-repositories.sh | 44 +++++++++++++++++--------
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/git/find-large-files-in-repositories.sh b/git/find-large-files-in-repositories.sh
index dc03eff..cc7d602 100755
--- a/git/find-large-files-in-repositories.sh
+++ b/git/find-large-files-in-repositories.sh
@@ -5,7 +5,7 @@
 #
 # Prerequisites:
 # - git must be installed
-# - For macOS: brew install coreutils (for numfmt/gnumfmt)
+# - Standard Unix tools: awk, sort, mktemp, basename
 #
 # Usage:
 #   ./find-large-files-in-repositories.sh <repos-file> [size-in-mb]
@@ -27,6 +27,13 @@ fi
 
 REPOS_FILE="$1"
 SIZE_MB="${2:-100}"
+
+# Validate SIZE_MB is a positive integer
+if ! [[ "$SIZE_MB" =~ ^[1-9][0-9]*$ ]]; then
+  echo "Error: size-in-mb must be a positive integer (received: '$SIZE_MB')"
+  exit 1
+fi
+
 SIZE_BYTES=$((SIZE_MB * 1048576))
 
 if [ ! -f "$REPOS_FILE" ]; then
@@ -36,7 +43,7 @@ fi
 
 # Create a temporary directory for clones
 TEMP_DIR=$(mktemp -d)
-trap "rm -rf $TEMP_DIR" EXIT
+trap 'if [ -n "${TEMP_DIR:-}" ]; then rm -rf -- "$TEMP_DIR"; fi' EXIT
 
 echo "Finding files >= ${SIZE_MB}MB in repositories listed in $REPOS_FILE"
 echo "============================================================"
@@ -52,8 +59,10 @@ while IFS= read -r repo_url || [ -n "$repo_url" ]; do
 
   clone_path="$TEMP_DIR/$repo_name.git"
 
-  if ! git clone --bare "$repo_url" "$clone_path" 2>/dev/null; then
+  if ! clone_output=$(git clone --bare "$repo_url" "$clone_path" 2>&1); then
     echo "  Error: Failed to clone repository"
+    echo "  git clone output:"
+    echo "$clone_output" | sed 's/^/    /'
     echo ""
     continue
   fi
@@ -61,16 +70,25 @@ while IFS= read -r repo_url || [ -n "$repo_url" ]; do
   cd "$clone_path" || continue
 
   # Find files over the specified size
-  large_files=$(git rev-list --objects --all 2>/dev/null | \
-    git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' 2>/dev/null | \
-    awk -v size="$SIZE_BYTES" '/^blob/ && $3 >= size {printf "  %.2fMB  %s\n", $3/1048576, $4}' | \
-    sort -rn)
-
-  if [ -n "$large_files" ]; then
-    echo "$large_files"
-  else
-    echo "  No files >= ${SIZE_MB}MB found"
-  fi
+  # Use tab delimiter to preserve filenames with spaces
+  git rev-list --objects --all 2>/dev/null | \
+    git cat-file --batch-check=$'%(objecttype) %(objectname) %(objectsize)\t%(rest)' 2>/dev/null | \
+    awk -v size="$SIZE_BYTES" -F '\t' '{
+      split($1, meta, " ")
+      if (meta[1] == "blob" && meta[3] >= size) {
+        printf "  %.2fMB  %s\n", meta[3]/1048576, $2
+      }
+    }' | \
+    sort -rn | {
+      found=0
+      while IFS= read -r line; do
+        found=1
+        echo "$line"
+      done
+      if [ "$found" -eq 0 ]; then
+        echo "  No files >= ${SIZE_MB}MB found"
+      fi
+    }
 
   cd - > /dev/null || exit
   rm -rf "$clone_path"

-f "$REPOS_FILE" ]; then @@ -36,7 +43,7 @@ fi # Create a temporary directory for clones TEMP_DIR=$(mktemp -d) -trap "rm -rf $TEMP_DIR" EXIT +trap 'if [ -n "${TEMP_DIR:-}" ]; then rm -rf -- "$TEMP_DIR"; fi' EXIT echo "Finding files >= ${SIZE_MB}MB in repositories listed in $REPOS_FILE" echo "============================================================" @@ -52,8 +59,10 @@ while IFS= read -r repo_url || [ -n "$repo_url" ]; do clone_path="$TEMP_DIR/$repo_name.git" - if ! git clone --bare "$repo_url" "$clone_path" 2>/dev/null; then + if ! clone_output=$(git clone --bare "$repo_url" "$clone_path" 2>&1); then echo " Error: Failed to clone repository" + echo " git clone output:" + echo "$clone_output" | sed 's/^/ /' echo "" continue fi @@ -61,16 +70,25 @@ while IFS= read -r repo_url || [ -n "$repo_url" ]; do cd "$clone_path" || continue # Find files over the specified size - large_files=$(git rev-list --objects --all 2>/dev/null | \ - git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' 2>/dev/null | \ - awk -v size="$SIZE_BYTES" '/^blob/ && $3 >= size {printf " %.2fMB %s\n", $3/1048576, $4}' | \ - sort -rn) - - if [ -n "$large_files" ]; then - echo "$large_files" - else - echo " No files >= ${SIZE_MB}MB found" - fi + # Use tab delimiter to preserve filenames with spaces + git rev-list --objects --all 2>/dev/null | \ + git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' 2>/dev/null | \ + awk -v size="$SIZE_BYTES" -F '\t' '{ + split($1, meta, " ") + if (meta[1] == "blob" && meta[3] >= size) { + printf " %.2fMB %s\n", meta[3]/1048576, $2 + } + }' | \ + sort -rn | { + found=0 + while IFS= read -r line; do + found=1 + echo "$line" + done + if [ "$found" -eq 0 ]; then + echo " No files >= ${SIZE_MB}MB found" + fi + } cd - > /dev/null || exit rm -rf "$clone_path" From af0c92bd7827a48b08b4ade3de12a7f035705e03 Mon Sep 17 00:00:00 2001 From: Josh Johanning Date: Thu, 29 Jan 2026 11:22:23 -0600 Subject: [PATCH 4/4] fix: avoid repository name collisions by using a hash in clone path --- git/find-large-files-in-repositories.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/git/find-large-files-in-repositories.sh b/git/find-large-files-in-repositories.sh index cc7d602..f51f500 100755 --- a/git/find-large-files-in-repositories.sh +++ b/git/find-large-files-in-repositories.sh @@ -57,12 +57,15 @@ while IFS= read -r repo_url || [ -n "$repo_url" ]; do echo "=== Checking: $repo_name ===" echo " URL: $repo_url" - clone_path="$TEMP_DIR/$repo_name.git" + # Use hash of URL to avoid collisions when repos have the same name + repo_hash=$(printf '%s' "$repo_url" | md5 -q 2>/dev/null || printf '%s' "$repo_url" | md5sum | awk '{print $1}') + clone_path="$TEMP_DIR/${repo_name}-${repo_hash}.git" if ! clone_output=$(git clone --bare "$repo_url" "$clone_path" 2>&1); then echo " Error: Failed to clone repository" echo " git clone output:" echo "$clone_output" | sed 's/^/ /' + rm -rf "$clone_path" echo "" continue fi