From 7ff5a69d12e44467f2fa2f012c0747a0542ccfcd Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Wed, 7 Jan 2026 05:27:40 +0100 Subject: [PATCH 01/10] report_automation --- scripts/shared.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/shared.py b/scripts/shared.py index 509801d9..1c9b8b57 100644 --- a/scripts/shared.py +++ b/scripts/shared.py @@ -269,6 +269,9 @@ def setup(current_file): return logger, paths +# def section_order(): + + def update_readme( args, section_title, @@ -299,12 +302,12 @@ def update_readme( readme_path = path_join(paths["data"], args.quarter, "README.md") # Define section markers for each data source - section_start_line = f"\n" - section_end_line = f"\n" + section_start_line = f"\n" + section_end_line = f"\n" # Define entry markers for each plot (optional) and description - entry_start_line = f"\n" - entry_end_line = f"\n" + entry_start_line = f"\n" + entry_end_line = f"\n" if os.path.exists(readme_path): with open(readme_path, "r", encoding="utf-8") as f: From a7645a59b643bf2b3c62574386983496846807e9 Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Wed, 7 Jan 2026 06:42:37 +0100 Subject: [PATCH 02/10] Add function for listing out section order --- .../{gcs_report.py => 01-gcs_report.py} | 2 +- .../{github_report.py => 02-github_report.py} | 2 +- scripts/3-report/03-openverse_report.py | 417 ++++++++++++++++++ ...pedia_report.py => 04-wikipedia_report.py} | 2 +- scripts/3-report/{notes.py => 100-notes.py} | 2 +- scripts/shared.py | 5 +- 6 files changed, 425 insertions(+), 5 deletions(-) rename scripts/3-report/{gcs_report.py => 01-gcs_report.py} (99%) mode change 100755 => 100644 rename scripts/3-report/{github_report.py => 02-github_report.py} (99%) mode change 100755 => 100644 create mode 100644 scripts/3-report/03-openverse_report.py rename scripts/3-report/{wikipedia_report.py => 04-wikipedia_report.py} (99%) mode change 100755 => 100644 rename scripts/3-report/{notes.py => 
100-notes.py} (99%) mode change 100755 => 100644 diff --git a/scripts/3-report/gcs_report.py b/scripts/3-report/01-gcs_report.py old mode 100755 new mode 100644 similarity index 99% rename from scripts/3-report/gcs_report.py rename to scripts/3-report/01-gcs_report.py index 359796a9..17103561 --- a/scripts/3-report/gcs_report.py +++ b/scripts/3-report/01-gcs_report.py @@ -27,7 +27,7 @@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "Google Custom Search (GCS)" +SECTION = "1-gcs_report.py" def parse_arguments(): diff --git a/scripts/3-report/github_report.py b/scripts/3-report/02-github_report.py old mode 100755 new mode 100644 similarity index 99% rename from scripts/3-report/github_report.py rename to scripts/3-report/02-github_report.py index 37979175..34a83b65 --- a/scripts/3-report/github_report.py +++ b/scripts/3-report/02-github_report.py @@ -25,7 +25,7 @@ # Setup LOGGER, PATHS = shared.setup(__file__) QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "GitHub data" +SECTION = "2-github_report.py" def parse_arguments(): diff --git a/scripts/3-report/03-openverse_report.py b/scripts/3-report/03-openverse_report.py new file mode 100644 index 00000000..47d990a5 --- /dev/null +++ b/scripts/3-report/03-openverse_report.py @@ -0,0 +1,417 @@ +#!/usr/bin/env python +""" +This file is dedicated to visualizing and analyzing the data collected +from Openverse. 
+""" +# Standard library +import argparse +import os +import sys +import textwrap +import traceback + +# Third-party +from pygments import highlight +from pygments.formatters import TerminalFormatter +from pygments.lexers import PythonTracebackLexer + +# Add parent directory so shared can be imported +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +# First-party/Local +import plot # noqa: E402 +import shared # noqa: E402 + +# Setup +LOGGER, PATHS = shared.setup(__file__) +QUARTER = os.path.basename(PATHS["data_quarter"]) +SECTION = "3-openverse_report.py" + + +def parse_arguments(): + """ + Parses command-line arguments, returns parsed arguments. + """ + LOGGER.info("Parsing command-line arguments") + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--quarter", + default=QUARTER, + help=f"Data quarter in format YYYYQx (default: {QUARTER})", + ) + parser.add_argument( + "--show-plots", + action="store_true", + help="Show generated plots (default: False)", + ) + parser.add_argument( + "--enable-save", + action="store_true", + help="Enable saving results (default: False)", + ) + parser.add_argument( + "--enable-git", + action="store_true", + help="Enable git actions such as fetch, merge, add, commit, and push" + " (default: False)", + ) + args = parser.parse_args() + if not args.enable_save and args.enable_git: + parser.error("--enable-git requires --enable-save") + if args.quarter != QUARTER: + global PATHS + PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) + args.logger = LOGGER + args.paths = PATHS + return args + + +def openverse_intro(args): + """ + Write Openverse Introduction. 
+ """ + LOGGER.info(openverse_intro.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_1-fetch"], + "openverse_fetch.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "TOOL_IDENTIFIER" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + total = data["MEDIA_COUNT"].sum() + media_counts = data.groupby("MEDIA_TYPE")["MEDIA_COUNT"].sum() + total_media = media_counts.sum() + audio_percentage = ( + f"{(media_counts.get('audio', 0) / total_media) * 100:.2f}" + ) + images_percentage = ( + f"{(media_counts.get('images', 0) / total_media) * 100:.2f}" + ) + unique_sources = data["SOURCE"].nunique() + shared.update_readme( + args, + SECTION, + "Overview", + None, + None, + "The Openverse data, below, uses the `Media_count field`" + " returned by API for search queries of the various legal tools." + "\n" + f" The results indicate that there are {total} count of audio" + " and images that are licensed or put in the" + " public domain using a Creative Commons (CC) legal tool." + " They respectively take a percentage of" + f" {audio_percentage} and {images_percentage}," + " of the total media count returned by the Openverse API." 
+ "\n" + f"There are {unique_sources} count of" + f" data sources under the openverse API.\n" + "\n" + "Thank you Openverse for providing a public API" + " access to its media metadata!", + ) + + +def plot_totals_by_license_type(args): + """ + Create plots showing totals by license type + """ + LOGGER.info(plot_totals_by_license_type.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "openverse_totals_by_license.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "License" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(data_label, ascending=True, inplace=True) + title = "Totals by license type" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join( + PATHS["data_phase"], "openverse_totals_by_license_type.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing Creative Commons (CC) legal tool totals and" + " percentages.", + ) + + +def plot_totals_by_media_type(args): + """ + Create plots showing totals by media type + """ + LOGGER.info(plot_totals_by_media_type.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "openverse_totals_by_media_type.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Media_type" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(name_label, ascending=False, inplace=True) + title = "Totals by media_type" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + 
) + + image_path = shared.path_join( + PATHS["data_phase"], "openverse_totals_by_media_type.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing Creative Commons (CC) legal tool" + " totals by each media type", + ) + + +def plot_totals_by_sources(args): + """ + Create plots showing totals by sources + """ + LOGGER.info(plot_totals_by_sources.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "openverse_totals_by_sources.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Source" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(name_label, ascending=False, inplace=True) + top_10 = data.head(10) + title = "Totals by sources" + plt = plot.combined_plot( + args=args, + data=top_10, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join(PATHS["data_phase"], "openverse_sources.png") + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing Creative Commons (CC) legal tool totals" + " across the top 10 sources returned by openverse API.", + ) + + +def plot_permissive_by_media_type(args): + """ + Create plots showing the count of permissive content by media type + """ + LOGGER.info(plot_permissive_by_media_type.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "openverse_permissive_by_media_type.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + 
name_label = "Media_type" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(name_label, ascending=False, inplace=True) + title = "Permissive content by media type" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join( + PATHS["data_phase"], "openverse_permissive_by_media_type.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing count of permissive content by media type.", + ) + + +def plot_permissive_by_source(args): + """ + Create plots showing count of permissive content by source + """ + LOGGER.info(plot_permissive_by_source.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "openverse_permissive_by_source.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Source" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(name_label, ascending=True, inplace=True) + top_10 = data.head(10) + title = "Permissive by source" + plt = plot.combined_plot( + args=args, + data=top_10, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join( + PATHS["data_phase"], "openverse_permissive_by_source.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing count of permissive content" + " by top 10 sources in openverse.", + ) + + +def 
plot_totals_by_restriction(args): + """ + Create plots showing totals by restriction + """ + LOGGER.info(plot_totals_by_restriction.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "openverse_totals_by_restriction.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Category" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(name_label, ascending=False, inplace=True) + title = "Totals by restriction" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join( + PATHS["data_phase"], "openverse_restriction.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing totals by different levels of rights reserved" + " on openverse media contents." 
+ " This shows the distribution of Public domain," + " Permissive, Copyleft and restricted" + " licenses used in Openverse media contents.", + ) + + +def main(): + args = parse_arguments() + shared.paths_log(LOGGER, PATHS) + shared.git_fetch_and_merge(args, PATHS["repo"]) + openverse_intro(args) + plot_totals_by_license_type(args) + plot_totals_by_media_type(args) + plot_permissive_by_media_type(args) + plot_permissive_by_source(args) + plot_totals_by_restriction(args) + + # Add and commit changes + args = shared.git_add_and_commit( + args, + PATHS["repo"], + PATHS["data_quarter"], + f"Add and commit Openverse reports for {QUARTER}", + ) + shared.git_push_changes(args, PATHS["repo"]) + + +if __name__ == "__main__": + try: + main() + except shared.QuantifyingException as e: + if e.exit_code == 0: + LOGGER.info(e.message) + else: + LOGGER.error(e.message) + sys.exit(e.exit_code) + except SystemExit as e: + if e.code != 0: + LOGGER.error(f"System exit with code: {e.code}") + sys.exit(e.code) + except KeyboardInterrupt: + LOGGER.info("(130) Halted via KeyboardInterrupt.") + sys.exit(130) + except Exception: + traceback_formatted = textwrap.indent( + highlight( + traceback.format_exc(), + PythonTracebackLexer(), + TerminalFormatter(), + ), + " ", + ) + LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}") + sys.exit(1) diff --git a/scripts/3-report/wikipedia_report.py b/scripts/3-report/04-wikipedia_report.py old mode 100755 new mode 100644 similarity index 99% rename from scripts/3-report/wikipedia_report.py rename to scripts/3-report/04-wikipedia_report.py index 83a92fa3..7957c26f --- a/scripts/3-report/wikipedia_report.py +++ b/scripts/3-report/04-wikipedia_report.py @@ -25,7 +25,7 @@ # Setup LOGGER, PATHS = shared.setup(__file__) QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "Wikipedia data" +SECTION = "4-wikipedia_report.py" def parse_arguments(): diff --git a/scripts/3-report/notes.py b/scripts/3-report/100-notes.py old mode 100755 new 
mode 100644 similarity index 99% rename from scripts/3-report/notes.py rename to scripts/3-report/100-notes.py index ccefd058..97cd8a4d --- a/scripts/3-report/notes.py +++ b/scripts/3-report/100-notes.py @@ -25,7 +25,7 @@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "Notes" +SECTION = "100-notes.py" def parse_arguments(): diff --git a/scripts/shared.py b/scripts/shared.py index 1c9b8b57..2b28c854 100644 --- a/scripts/shared.py +++ b/scripts/shared.py @@ -269,7 +269,10 @@ def setup(current_file): return logger, paths -# def section_order(): +def section_order(): + report_dir = os.path.join(os.path.dirname(__file__), ".") + report_files = os.listdir(report_dir) + return report_files def update_readme( From 297dc8162cb300c68a4dcbbdfce307ee7359bd55 Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Wed, 7 Jan 2026 21:56:39 +0100 Subject: [PATCH 03/10] process_script --- scripts/2-process/openverse_process.py | 277 +++++++++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 scripts/2-process/openverse_process.py diff --git a/scripts/2-process/openverse_process.py b/scripts/2-process/openverse_process.py new file mode 100644 index 00000000..e660b7b8 --- /dev/null +++ b/scripts/2-process/openverse_process.py @@ -0,0 +1,277 @@ +#!/usr/bin/env python +""" +This file is dedicated to processing Openverse data +for analysis and comparison between quarters. +""" +# Standard library +import argparse +import csv +import os +import sys +import traceback +from collections import defaultdict + +# Third-party +import pandas as pd + +# Add parent directory so shared can be imported +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +# First-party/Local +import shared # noqa: E402 + +# Setup +LOGGER, PATHS = shared.setup(__file__) + +# Constants +QUARTER = os.path.basename(PATHS["data_quarter"]) + + +def parse_arguments(): + """ + Parse command-line options, returns parsed argument namespace. 
+ """ + LOGGER.info("Parsing command-line options") + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--quarter", + default=QUARTER, + help=f"Data quarter in format YYYYQx (default: {QUARTER})", + ) + parser.add_argument( + "--enable-save", + action="store_true", + help="Enable saving results (default: False)", + ) + parser.add_argument( + "--enable-git", + action="store_true", + help="Enable git actions such as fetch, merge, add, commit, and push" + " (default: False)", + ) + args = parser.parse_args() + if not args.enable_save and args.enable_git: + parser.error("--enable-git requires --enable-save") + if args.quarter != QUARTER: + global PATHS + PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) + args.logger = LOGGER + args.paths = PATHS + return args + + +def check_for_data_file(file_path): + if os.path.exists(file_path): + raise shared.QuantifyingException( + f"Processed data already exists for {QUARTER}", 0 + ) + + +def data_to_csv(args, data, file_path): + if not args.enable_save: + return + os.makedirs(PATHS["data_phase"], exist_ok=True) + # emulate csv.unix_dialect + data.to_csv( + file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n" + ) + + +def process_totals_by_license(args, count_data): + """ + Processing count data: totals by license + """ + LOGGER.info(process_totals_by_license.__doc__.strip()) + data = defaultdict(int) + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + count = int(row.MEDIA_COUNT) + + data[tool] += count + data = pd.DataFrame(data.items(), columns=["License", "Count"]) + data.sort_values("License", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "openverse_totals_by_license.csv" + ) + check_for_data_file(file_path) + data_to_csv(args, data, file_path) + + +def process_totals_by_media_type(args, count_data): + """ + Processing count data: totals by media 
type + """ + + LOGGER.info(process_totals_by_media_type.__doc__.strip()) + data = defaultdict(int) + + for row in count_data.itertuples(index=False): + media_type = str(row.MEDIA_TYPE) + count = int(row.MEDIA_COUNT) + + data[media_type] += count + data = pd.DataFrame(data.items(), columns=["Media_type", "Count"]) + data.sort_values("Media_type", ascending=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "openverse_totals_by_media_type.csv" + ) + check_for_data_file(file_path) + data_to_csv(args, data, file_path) + + +def process_totals_by_source(args, count_data): + """ + Processing count data: totals by source + """ + LOGGER.info(process_totals_by_source.__doc__.strip()) + data = defaultdict(int) + for row in count_data.itertuples(index=False): + source = str(row.SOURCE) + count = int(row.MEDIA_COUNT) + + data[source] += count + data = pd.DataFrame(data.items(), columns=["Source", "Count"]) + data.sort_values("Source", ascending=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "openverse_totals_by_source.csv" + ) + check_for_data_file(file_path) + data_to_csv(args, data, file_path) + + +def process_permissive_by_media_type(args, count_data): + """ + Processing count data: permissive by media type + """ + LOGGER.info(process_permissive_by_media_type.__doc__.strip()) + + data = defaultdict(int) + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + media_type = str(row.MEDIA_TYPE) + count = int(row.MEDIA_COUNT) + + if tool in ["CC0", "CC BY", "CC BY-SA"]: + data[media_type] += count + + data = pd.DataFrame(data.items(), columns=["Media_type", "Count"]) + data.sort_values("Media_type", ascending=True, inplace=True) + + file_path = shared.path_join( + PATHS["data_phase"], "openverse_permissive_by_media_type.csv" + ) + check_for_data_file(file_path) + data_to_csv(args, data, file_path) + + +def process_permissive_by_source(args, count_data): + """ + Processing count data: permissive 
content by source + """ + LOGGER.info(process_permissive_by_source.__doc__.strip()) + data = defaultdict(int) + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + source = str(row.SOURCE) + count = int(row.MEDIA_COUNT) + if tool in ["CC0", "CC BY", "CC BY-SA"]: + data[source] += count + data = pd.DataFrame(data.items(), columns=["Source", "Count"]) + data.sort_values("Source", ascending=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "openverse_permissive_by_source.csv" + ) + check_for_data_file(file_path) + data_to_csv(args, data, file_path) + + +def process_totals_by_restriction(args, count_data): + """ + Processing count data: totals by restriction + """ + # https://creativecommons.org/public-domain/freeworks/ + LOGGER.info(process_totals_by_restriction.__doc__.strip()) + + data = { + "Copyleft": 0, + "Permissive": 0, + "Public domain": 0, + "Restricted": 0, + } + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + count = int(row.MEDIA_COUNT) + + if tool in ["CC0", "PDM"]: + key = "Public domain" + + elif tool in ["CC BY"]: + key = "Permissive" + + elif tool in ["CC BY-SA"]: + key = "Copyleft" + + else: + key = "Restricted" + + data[key] += count + + data = pd.DataFrame(data.items(), columns=["Category", "Count"]) + data.sort_values("Category", ascending=True, inplace=True) + + file_path = shared.path_join( + PATHS["data_phase"], "openverse_totals_by_restriction.csv" + ) + check_for_data_file(file_path) + data_to_csv(args, data, file_path) + + +def main(): + args = parse_arguments() + shared.paths_log(LOGGER, PATHS) + shared.git_fetch_and_merge(args, PATHS["repo"]) + + file_count = shared.path_join(PATHS["data_1-fetch"], "openverse_fetch.csv") + count_data = shared.open_data_file( + LOGGER, + file_count, + usecols=["SOURCE", "MEDIA_TYPE", "TOOL_IDENTIFIER", "MEDIA_COUNT"], + ) + process_totals_by_license(args, count_data) + process_totals_by_media_type(args, count_data) 
+ process_totals_by_source(args, count_data) + process_permissive_by_media_type(args, count_data) + process_permissive_by_source(args, count_data) + process_totals_by_restriction(args, count_data) + # Push changes + args = shared.git_add_and_commit( + args, + PATHS["repo"], + PATHS["data_quarter"], + f"Add and commit new GitHub data for {QUARTER}", + ) + shared.git_push_changes(args, PATHS["repo"]) + + +if __name__ == "__main__": + try: + main() + except shared.QuantifyingException as e: + if e.exit_code == 0: + LOGGER.info(e.message) + else: + LOGGER.error(e.message) + sys.exit(e.code) + except SystemExit as e: + LOGGER.error(f"System exit with code: {e.code}") + sys.exit(e.code) + except KeyboardInterrupt: + LOGGER.info("(130) Halted via KeyboardInterrupt.") + sys.exit(130) + except Exception: + LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") + sys.exit(1) From 9f8ffd8cca5efac10e91df7f0a453f9556c57543 Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Wed, 7 Jan 2026 22:00:25 +0100 Subject: [PATCH 04/10] made review changes --- scripts/3-report/{01-gcs_report.py => gcs_report.py} | 3 ++- scripts/3-report/{02-github_report.py => github_report.py} | 3 ++- .../3-report/{03-openverse_report.py => openverse_report.py} | 3 ++- .../3-report/{04-wikipedia_report.py => wikipedia_report.py} | 3 ++- scripts/3-report/{100-notes.py => zzz-notes.py} | 3 ++- 5 files changed, 10 insertions(+), 5 deletions(-) rename scripts/3-report/{01-gcs_report.py => gcs_report.py} (99%) rename scripts/3-report/{02-github_report.py => github_report.py} (99%) rename scripts/3-report/{03-openverse_report.py => openverse_report.py} (99%) rename scripts/3-report/{04-wikipedia_report.py => wikipedia_report.py} (99%) rename scripts/3-report/{100-notes.py => zzz-notes.py} (98%) diff --git a/scripts/3-report/01-gcs_report.py b/scripts/3-report/gcs_report.py similarity index 99% rename from scripts/3-report/01-gcs_report.py rename to scripts/3-report/gcs_report.py index 
17103561..eb2a4581 100644 --- a/scripts/3-report/01-gcs_report.py +++ b/scripts/3-report/gcs_report.py @@ -9,6 +9,7 @@ import sys import textwrap import traceback +from pathlib import Path # Third-party from pygments import highlight @@ -27,7 +28,7 @@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "1-gcs_report.py" +SECTION = Path(__file__).name def parse_arguments(): diff --git a/scripts/3-report/02-github_report.py b/scripts/3-report/github_report.py similarity index 99% rename from scripts/3-report/02-github_report.py rename to scripts/3-report/github_report.py index 34a83b65..b2a18890 100644 --- a/scripts/3-report/02-github_report.py +++ b/scripts/3-report/github_report.py @@ -9,6 +9,7 @@ import sys import textwrap import traceback +from pathlib import Path # Third-party from pygments import highlight @@ -25,7 +26,7 @@ # Setup LOGGER, PATHS = shared.setup(__file__) QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "2-github_report.py" +SECTION = Path(__file__).name def parse_arguments(): diff --git a/scripts/3-report/03-openverse_report.py b/scripts/3-report/openverse_report.py similarity index 99% rename from scripts/3-report/03-openverse_report.py rename to scripts/3-report/openverse_report.py index 47d990a5..2347ae28 100644 --- a/scripts/3-report/03-openverse_report.py +++ b/scripts/3-report/openverse_report.py @@ -9,6 +9,7 @@ import sys import textwrap import traceback +from pathlib import Path # Third-party from pygments import highlight @@ -25,7 +26,7 @@ # Setup LOGGER, PATHS = shared.setup(__file__) QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "3-openverse_report.py" +SECTION = Path(__file__).name def parse_arguments(): diff --git a/scripts/3-report/04-wikipedia_report.py b/scripts/3-report/wikipedia_report.py similarity index 99% rename from scripts/3-report/04-wikipedia_report.py rename to scripts/3-report/wikipedia_report.py index 7957c26f..96f0cf46 100644 --- 
a/scripts/3-report/04-wikipedia_report.py +++ b/scripts/3-report/wikipedia_report.py @@ -9,6 +9,7 @@ import sys import textwrap import traceback +from pathlib import Path # Third-party from pygments import highlight @@ -25,7 +26,7 @@ # Setup LOGGER, PATHS = shared.setup(__file__) QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "4-wikipedia_report.py" +SECTION = Path(__file__).name def parse_arguments(): diff --git a/scripts/3-report/100-notes.py b/scripts/3-report/zzz-notes.py similarity index 98% rename from scripts/3-report/100-notes.py rename to scripts/3-report/zzz-notes.py index 97cd8a4d..d9d4aa93 100644 --- a/scripts/3-report/100-notes.py +++ b/scripts/3-report/zzz-notes.py @@ -8,6 +8,7 @@ import sys import textwrap import traceback +from pathlib import Path # Third-party from pygments import highlight @@ -25,7 +26,7 @@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "100-notes.py" +SECTION = Path(__file__).name def parse_arguments(): From 16cb9c9839fa6589e2eb4cb5e978a1eef9ea0214 Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Thu, 8 Jan 2026 13:43:40 +0100 Subject: [PATCH 05/10] made new changes --- scripts/shared.py | 55 ++++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/scripts/shared.py b/scripts/shared.py index 2b28c854..8a3b69cf 100644 --- a/scripts/shared.py +++ b/scripts/shared.py @@ -270,7 +270,7 @@ def setup(current_file): def section_order(): - report_dir = os.path.join(os.path.dirname(__file__), ".") + report_dir = os.path.join(os.path.dirname(__file__), "3-report") report_files = os.listdir(report_dir) return report_files @@ -283,6 +283,11 @@ def update_readme( image_caption, entry_text=None, ): + logger = args.logger + paths = args.paths + ordered_sections = section_order() + logger.info("ordered_sections:", ordered_sections) + logger.info("section_title:", repr(section_title)) """ Update the README.md file with the generated images and 
descriptions. """ @@ -299,9 +304,6 @@ def update_readme( " caption is provided" ) - logger = args.logger - paths = args.paths - readme_path = path_join(paths["data"], args.quarter, "README.md") # Define section markers for each data source @@ -324,26 +326,39 @@ def update_readme( lines.insert(0, title_line) lines.insert(1, "\n") - # We only need to know the position of the end to append new entries + # Locate the data source section if it is already present if section_start_line in lines: - # Locate the data source section if it is already present section_end_index = lines.index(section_end_line) else: - # Add the data source section if it is absent - lines.extend( - [ - f"{section_start_line}", - "\n", - "\n", - f"## {section_title}\n", - "\n", - "\n", - f"{section_end_line}", - "\n", - ] - ) - section_end_index = lines.index(section_end_line) + insert_index = None + # If not present, we find the position to insert the section + current_postion = ordered_sections.index(section_title) + # Sections that should come before this section + sections_before = ordered_sections[:current_postion] + # we find the last existing section that comes before this section + for prev_section in reversed(sections_before): + prev_end_line = f"\n" + if prev_end_line in lines: + insert_index = lines.index(prev_end_line) + 1 + break + # If none exist, insert at the top (after README title) + if insert_index is None: + insert_index = 2 if len(lines) >= 2 else len(lines) + # Insert the new data source section at correct position + new_section_line = [ + f"{section_start_line}", + "\n", + "\n", + f"## {section_title}\n", + "\n", + "\n", + f"{section_end_line}", + "\n", + ] + # Insert the section at the correct position + lines = lines[:insert_index] + new_section_line + lines[insert_index:] + section_end_index = lines.index(section_end_line) # Locate the entry if it is already present if entry_start_line in lines: entry_start_index = lines.index(entry_start_line) From 
5ae1584477209577133f88b1242d0d097263a44d Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Thu, 8 Jan 2026 18:30:53 +0100 Subject: [PATCH 06/10] Added check_for_data_file function for gcs_process.py --- scripts/2-process/gcs_process.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py index fefbba0f..786d8329 100755 --- a/scripts/2-process/gcs_process.py +++ b/scripts/2-process/gcs_process.py @@ -62,6 +62,13 @@ def parse_arguments(): return args +def check_for_data_file(file_path): + if os.path.exists(file_path): + raise shared.QuantifyingException( + f"Processed data already exists for {QUARTER}", 0 + ) + + def data_to_csv(args, data, file_path): if not args.enable_save: return @@ -111,6 +118,7 @@ def process_product_totals(args, count_data): data.items(), columns=["CC legal tool product", "Count"] ) file_path = shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv") + check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -192,7 +200,8 @@ def process_latest_prior_retired_totals(args, count_data): file_path = shared.path_join( PATHS["data_phase"], f"gcs_status_{key}_totals.csv" ) - data_to_csv(args, dataframe, file_path) + check_for_data_file(file_path) + data_to_csv(args, dataframe, file_path) def process_totals_by_free_cultural(args, count_data): @@ -225,6 +234,7 @@ def process_totals_by_free_cultural(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "gcs_totals_by_free_cultural.csv" ) + check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -259,6 +269,7 @@ def process_totals_by_restrictions(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "gcs_totals_by_restrictions.csv" ) + check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -280,6 +291,7 @@ def process_totals_by_language(args, data): file_path = shared.path_join( PATHS["data_phase"], "gcs_totals_by_language.csv" 
) + check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -301,6 +313,7 @@ def process_totals_by_country(args, data): file_path = shared.path_join( PATHS["data_phase"], "gcs_totals_by_country.csv" ) + check_for_data_file(file_path) data_to_csv(args, data, file_path) From 80b647501933bbd089993bb822b64c7b00dafaae Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Fri, 9 Jan 2026 13:53:32 +0100 Subject: [PATCH 07/10] Remove openverse process and report --- scripts/2-process/openverse_process.py | 277 ---------------- scripts/3-report/openverse_report.py | 418 ------------------------- 2 files changed, 695 deletions(-) delete mode 100644 scripts/2-process/openverse_process.py delete mode 100644 scripts/3-report/openverse_report.py diff --git a/scripts/2-process/openverse_process.py b/scripts/2-process/openverse_process.py deleted file mode 100644 index e660b7b8..00000000 --- a/scripts/2-process/openverse_process.py +++ /dev/null @@ -1,277 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to processing Openverse data -for analysis and comparison between quarters. -""" -# Standard library -import argparse -import csv -import os -import sys -import traceback -from collections import defaultdict - -# Third-party -import pandas as pd - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# Constants -QUARTER = os.path.basename(PATHS["data_quarter"]) - - -def parse_arguments(): - """ - Parse command-line options, returns parsed argument namespace. 
- """ - LOGGER.info("Parsing command-line options") - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--quarter", - default=QUARTER, - help=f"Data quarter in format YYYYQx (default: {QUARTER})", - ) - parser.add_argument( - "--enable-save", - action="store_true", - help="Enable saving results (default: False)", - ) - parser.add_argument( - "--enable-git", - action="store_true", - help="Enable git actions such as fetch, merge, add, commit, and push" - " (default: False)", - ) - args = parser.parse_args() - if not args.enable_save and args.enable_git: - parser.error("--enable-git requires --enable-save") - if args.quarter != QUARTER: - global PATHS - PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) - args.logger = LOGGER - args.paths = PATHS - return args - - -def check_for_data_file(file_path): - if os.path.exists(file_path): - raise shared.QuantifyingException( - f"Processed data already exists for {QUARTER}", 0 - ) - - -def data_to_csv(args, data, file_path): - if not args.enable_save: - return - os.makedirs(PATHS["data_phase"], exist_ok=True) - # emulate csv.unix_dialect - data.to_csv( - file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n" - ) - - -def process_totals_by_license(args, count_data): - """ - Processing count data: totals by license - """ - LOGGER.info(process_totals_by_license.__doc__.strip()) - data = defaultdict(int) - - for row in count_data.itertuples(index=False): - tool = str(row.TOOL_IDENTIFIER) - count = int(row.MEDIA_COUNT) - - data[tool] += count - data = pd.DataFrame(data.items(), columns=["License", "Count"]) - data.sort_values("License", ascending=True, inplace=True) - data.reset_index(drop=True, inplace=True) - file_path = shared.path_join( - PATHS["data_phase"], "openverse_totals_by_license.csv" - ) - check_for_data_file(file_path) - data_to_csv(args, data, file_path) - - -def process_totals_by_media_type(args, count_data): - """ - Processing count data: totals by media 
type - """ - - LOGGER.info(process_totals_by_media_type.__doc__.strip()) - data = defaultdict(int) - - for row in count_data.itertuples(index=False): - media_type = str(row.MEDIA_TYPE) - count = int(row.MEDIA_COUNT) - - data[media_type] += count - data = pd.DataFrame(data.items(), columns=["Media_type", "Count"]) - data.sort_values("Media_type", ascending=True, inplace=True) - file_path = shared.path_join( - PATHS["data_phase"], "openverse_totals_by_media_type.csv" - ) - check_for_data_file(file_path) - data_to_csv(args, data, file_path) - - -def process_totals_by_source(args, count_data): - """ - Processing count data: totals by source - """ - LOGGER.info(process_totals_by_source.__doc__.strip()) - data = defaultdict(int) - for row in count_data.itertuples(index=False): - source = str(row.SOURCE) - count = int(row.MEDIA_COUNT) - - data[source] += count - data = pd.DataFrame(data.items(), columns=["Source", "Count"]) - data.sort_values("Source", ascending=True, inplace=True) - file_path = shared.path_join( - PATHS["data_phase"], "openverse_totals_by_source.csv" - ) - check_for_data_file(file_path) - data_to_csv(args, data, file_path) - - -def process_permissive_by_media_type(args, count_data): - """ - Processing count data: permissive by media type - """ - LOGGER.info(process_permissive_by_media_type.__doc__.strip()) - - data = defaultdict(int) - - for row in count_data.itertuples(index=False): - tool = str(row.TOOL_IDENTIFIER) - media_type = str(row.MEDIA_TYPE) - count = int(row.MEDIA_COUNT) - - if tool in ["CC0", "CC BY", "CC BY-SA"]: - data[media_type] += count - - data = pd.DataFrame(data.items(), columns=["Media_type", "Count"]) - data.sort_values("Media_type", ascending=True, inplace=True) - - file_path = shared.path_join( - PATHS["data_phase"], "openverse_permissive_by_media_type.csv" - ) - check_for_data_file(file_path) - data_to_csv(args, data, file_path) - - -def process_permissive_by_source(args, count_data): - """ - Processing count data: permissive 
content by source - """ - LOGGER.info(process_permissive_by_source.__doc__.strip()) - data = defaultdict(int) - for row in count_data.itertuples(index=False): - tool = str(row.TOOL_IDENTIFIER) - source = str(row.SOURCE) - count = int(row.MEDIA_COUNT) - if tool in ["CC0", "CC BY", "CC BY-SA"]: - data[source] += count - data = pd.DataFrame(data.items(), columns=["Source", "Count"]) - data.sort_values("Source", ascending=True, inplace=True) - file_path = shared.path_join( - PATHS["data_phase"], "openverse_permissive_by_source.csv" - ) - check_for_data_file(file_path) - data_to_csv(args, data, file_path) - - -def process_totals_by_restriction(args, count_data): - """ - Processing count data: totals by restriction - """ - # https://creativecommons.org/public-domain/freeworks/ - LOGGER.info(process_totals_by_restriction.__doc__.strip()) - - data = { - "Copyleft": 0, - "Permissive": 0, - "Public domain": 0, - "Restricted": 0, - } - - for row in count_data.itertuples(index=False): - tool = str(row.TOOL_IDENTIFIER) - count = int(row.MEDIA_COUNT) - - if tool in ["CC0", "PDM"]: - key = "Public domain" - - elif tool in ["CC BY"]: - key = "Permissive" - - elif tool in ["CC BY-SA"]: - key = "Copyleft" - - else: - key = "Restricted" - - data[key] += count - - data = pd.DataFrame(data.items(), columns=["Category", "Count"]) - data.sort_values("Category", ascending=True, inplace=True) - - file_path = shared.path_join( - PATHS["data_phase"], "openverse_totals_by_restriction.csv" - ) - check_for_data_file(file_path) - data_to_csv(args, data, file_path) - - -def main(): - args = parse_arguments() - shared.paths_log(LOGGER, PATHS) - shared.git_fetch_and_merge(args, PATHS["repo"]) - - file_count = shared.path_join(PATHS["data_1-fetch"], "openverse_fetch.csv") - count_data = shared.open_data_file( - LOGGER, - file_count, - usecols=["SOURCE", "MEDIA_TYPE", "TOOL_IDENTIFIER", "MEDIA_COUNT"], - ) - process_totals_by_license(args, count_data) - process_totals_by_media_type(args, count_data) 
- process_totals_by_source(args, count_data) - process_permissive_by_media_type(args, count_data) - process_permissive_by_source(args, count_data) - process_totals_by_restriction(args, count_data) - # Push changes - args = shared.git_add_and_commit( - args, - PATHS["repo"], - PATHS["data_quarter"], - f"Add and commit new GitHub data for {QUARTER}", - ) - shared.git_push_changes(args, PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/3-report/openverse_report.py b/scripts/3-report/openverse_report.py deleted file mode 100644 index 2347ae28..00000000 --- a/scripts/3-report/openverse_report.py +++ /dev/null @@ -1,418 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to visualizing and analyzing the data collected -from Openverse. -""" -# Standard library -import argparse -import os -import sys -import textwrap -import traceback -from pathlib import Path - -# Third-party -from pygments import highlight -from pygments.formatters import TerminalFormatter -from pygments.lexers import PythonTracebackLexer - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import plot # noqa: E402 -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) -QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = Path(__file__).name - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. 
- """ - LOGGER.info("Parsing command-line arguments") - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--quarter", - default=QUARTER, - help=f"Data quarter in format YYYYQx (default: {QUARTER})", - ) - parser.add_argument( - "--show-plots", - action="store_true", - help="Show generated plots (default: False)", - ) - parser.add_argument( - "--enable-save", - action="store_true", - help="Enable saving results (default: False)", - ) - parser.add_argument( - "--enable-git", - action="store_true", - help="Enable git actions such as fetch, merge, add, commit, and push" - " (default: False)", - ) - args = parser.parse_args() - if not args.enable_save and args.enable_git: - parser.error("--enable-git requires --enable-save") - if args.quarter != QUARTER: - global PATHS - PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) - args.logger = LOGGER - args.paths = PATHS - return args - - -def openverse_intro(args): - """ - Write Openverse Introduction. - """ - LOGGER.info(openverse_intro.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_1-fetch"], - "openverse_fetch.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "TOOL_IDENTIFIER" - data = shared.open_data_file(LOGGER, file_path, index_col=name_label) - total = data["MEDIA_COUNT"].sum() - media_counts = data.groupby("MEDIA_TYPE")["MEDIA_COUNT"].sum() - total_media = media_counts.sum() - audio_percentage = ( - f"{(media_counts.get('audio', 0) / total_media) * 100:.2f}" - ) - images_percentage = ( - f"{(media_counts.get('images', 0) / total_media) * 100:.2f}" - ) - unique_sources = data["SOURCE"].nunique() - shared.update_readme( - args, - SECTION, - "Overview", - None, - None, - "The Openverse data, below, uses the `Media_count field`" - " returned by API for search queries of the various legal tools." 
- "\n" - f" The results indicate that there are {total} count of audio" - " and images that are licensed or put in the" - " public domain using a Creative Commons (CC) legal tool." - " They respectively take a percentage of" - f" {audio_percentage} and {images_percentage}," - " of the total media count returned by the Openverse API." - "\n" - f"There are {unique_sources} count of" - f" data sources under the openverse API.\n" - "\n" - "Thank you Openverse for providing a public API" - " access to its media metadata!", - ) - - -def plot_totals_by_license_type(args): - """ - Create plots showing totals by license type - """ - LOGGER.info(plot_totals_by_license_type.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "openverse_totals_by_license.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "License" - data_label = "Count" - data = shared.open_data_file(LOGGER, file_path, index_col=name_label) - data.sort_values(data_label, ascending=True, inplace=True) - title = "Totals by license type" - plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label=data_label, - ) - - image_path = shared.path_join( - PATHS["data_phase"], "openverse_totals_by_license_type.png" - ) - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing Creative Commons (CC) legal tool totals and" - " percentages.", - ) - - -def plot_totals_by_media_type(args): - """ - Create plots showing totals by media type - """ - LOGGER.info(plot_totals_by_media_type.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "openverse_totals_by_media_type.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - 
name_label = "Media_type" - data_label = "Count" - data = shared.open_data_file(LOGGER, file_path, index_col=name_label) - data.sort_values(name_label, ascending=False, inplace=True) - title = "Totals by media_type" - plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label=data_label, - ) - - image_path = shared.path_join( - PATHS["data_phase"], "openverse_totals_by_media_type.png" - ) - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing Creative Commons (CC) legal tool" - " totals by each media type", - ) - - -def plot_totals_by_sources(args): - """ - Create plots showing totals by sources - """ - LOGGER.info(plot_totals_by_sources.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "openverse_totals_by_sources.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "Source" - data_label = "Count" - data = shared.open_data_file(LOGGER, file_path, index_col=name_label) - data.sort_values(name_label, ascending=False, inplace=True) - top_10 = data.head(10) - title = "Totals by sources" - plt = plot.combined_plot( - args=args, - data=top_10, - title=title, - name_label=name_label, - data_label=data_label, - ) - - image_path = shared.path_join(PATHS["data_phase"], "openverse_sources.png") - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing Creative Commons (CC) legal tool totals" - " across the top 10 sources returned by openverse API.", - ) - - -def 
plot_permissive_by_media_type(args): - """ - Create plots showing the count of permissive content by media type - """ - LOGGER.info(plot_permissive_by_media_type.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "openverse_permissive_by_media_type.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "Media_type" - data_label = "Count" - data = shared.open_data_file(LOGGER, file_path, index_col=name_label) - data.sort_values(name_label, ascending=False, inplace=True) - title = "Permissive content by media type" - plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label=data_label, - ) - - image_path = shared.path_join( - PATHS["data_phase"], "openverse_permissive_by_media_type.png" - ) - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing count of permissive content by media type.", - ) - - -def plot_permissive_by_source(args): - """ - Create plots showing count of permissive content by source - """ - LOGGER.info(plot_permissive_by_source.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "openverse_permissive_by_source.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "Source" - data_label = "Count" - data = shared.open_data_file(LOGGER, file_path, index_col=name_label) - data.sort_values(name_label, ascending=True, inplace=True) - top_10 = data.head(10) - title = "Permissive by source" - plt = plot.combined_plot( - args=args, - data=top_10, - title=title, - name_label=name_label, - data_label=data_label, - ) - - image_path = shared.path_join( - PATHS["data_phase"], "openverse_permissive_by_source.png" - ) - LOGGER.info(f"image file: 
{image_path.replace(PATHS['repo'], '.')}") - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing count of permissive content" - " by top 10 sources in openverse.", - ) - - -def plot_totals_by_restriction(args): - """ - Create plots showing totals by restriction - """ - LOGGER.info(plot_totals_by_restriction.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "openverse_totals_by_restriction.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "Category" - data_label = "Count" - data = shared.open_data_file(LOGGER, file_path, index_col=name_label) - data.sort_values(name_label, ascending=False, inplace=True) - title = "Totals by restriction" - plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label=data_label, - ) - - image_path = shared.path_join( - PATHS["data_phase"], "openverse_restriction.png" - ) - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing totals by different levels of rights reserved" - " on openverse media contents." 
- " This shows the distribution of Public domain," - " Permissive, Copyleft and restricted" - " licenses used in Openverse media contents.", - ) - - -def main(): - args = parse_arguments() - shared.paths_log(LOGGER, PATHS) - shared.git_fetch_and_merge(args, PATHS["repo"]) - openverse_intro(args) - plot_totals_by_license_type(args) - plot_totals_by_media_type(args) - plot_permissive_by_media_type(args) - plot_permissive_by_source(args) - plot_totals_by_restriction(args) - - # Add and commit changes - args = shared.git_add_and_commit( - args, - PATHS["repo"], - PATHS["data_quarter"], - f"Add and commit Openverse reports for {QUARTER}", - ) - shared.git_push_changes(args, PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - if e.code != 0: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - traceback_formatted = textwrap.indent( - highlight( - traceback.format_exc(), - PythonTracebackLexer(), - TerminalFormatter(), - ), - " ", - ) - LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}") - sys.exit(1) From 7eba804e508f7972cd42ba4d9ab08a2dab6aed3e Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Fri, 9 Jan 2026 15:25:05 +0100 Subject: [PATCH 08/10] Made review changes --- scripts/2-process/gcs_process.py | 22 ++++++++++++++-------- scripts/2-process/github_process.py | 10 ++++++---- scripts/2-process/wikipedia_process.py | 17 +++++++++++++---- scripts/shared.py | 11 ++++++----- 4 files changed, 39 insertions(+), 21 deletions(-) diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py index 786d8329..0ee57016 100755 --- a/scripts/2-process/gcs_process.py +++ b/scripts/2-process/gcs_process.py @@ -27,6 +27,17 
@@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) +FILE_PATHS = [ + shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv"), + shared.path_join(PATHS["data_phase"], "gcs_status_combined_totals.csv"), + shared.path_join(PATHS["data_phase"], "gcs_status_latest_totals.csv"), + shared.path_join(PATHS["data_phase"], "gcs_status_prior_totals.csv"), + shared.path_join(PATHS["data_phase"], "gcs_status_retired_totals.csv"), + shared.path_join(PATHS["data_phase"], "gcs_totals_by_country.csv"), + shared.path_join(PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"), + shared.path_join(PATHS["data_phase"], "gcs_totals_by_language.csv"), + shared.path_join(PATHS["data_phase"], "gcs_totals_by_restrictions.csv"), +] def parse_arguments(): @@ -62,7 +73,7 @@ def parse_arguments(): return args -def check_for_data_file(file_path): +def check_for_data_files(file_path): if os.path.exists(file_path): raise shared.QuantifyingException( f"Processed data already exists for {QUARTER}", 0 @@ -118,7 +129,6 @@ def process_product_totals(args, count_data): data.items(), columns=["CC legal tool product", "Count"] ) file_path = shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv") - check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -200,8 +210,7 @@ def process_latest_prior_retired_totals(args, count_data): file_path = shared.path_join( PATHS["data_phase"], f"gcs_status_{key}_totals.csv" ) - check_for_data_file(file_path) - data_to_csv(args, dataframe, file_path) + data_to_csv(args, dataframe, file_path) def process_totals_by_free_cultural(args, count_data): @@ -234,7 +243,6 @@ def process_totals_by_free_cultural(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "gcs_totals_by_free_cultural.csv" ) - 
check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -291,7 +298,6 @@ def process_totals_by_language(args, data): file_path = shared.path_join( PATHS["data_phase"], "gcs_totals_by_language.csv" ) - check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -313,7 +319,6 @@ def process_totals_by_country(args, data): file_path = shared.path_join( PATHS["data_phase"], "gcs_totals_by_country.csv" ) - check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -321,6 +326,7 @@ def main(): args = parse_arguments() shared.paths_log(LOGGER, PATHS) shared.git_fetch_and_merge(args, PATHS["repo"]) + check_for_data_files(FILE_PATHS) # Count data file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv") diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py index 27945613..192b864b 100755 --- a/scripts/2-process/github_process.py +++ b/scripts/2-process/github_process.py @@ -24,6 +24,10 @@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) +FILE_PATHS = [ + shared.path_join(PATHS["data_phase"], "github_totals_by_license.csv"), + shared.path_join(PATHS["data_phase"], "github_totals_by_restriction.csv"), +] def parse_arguments(): @@ -59,7 +63,7 @@ def parse_arguments(): return args -def check_for_data_file(file_path): +def check_for_data_files(file_path): if os.path.exists(file_path): raise shared.QuantifyingException( f"Processed data already exists for {QUARTER}", 0 @@ -98,7 +102,6 @@ def process_totals_by_license(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "github_totals_by_license.csv" ) - check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -133,7 +136,6 @@ def process_totals_by_restriction(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "github_totals_by_restriction.csv" ) - check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -141,7 +143,7 @@ def main(): args = parse_arguments() shared.paths_log(LOGGER, 
PATHS) shared.git_fetch_and_merge(args, PATHS["repo"]) - + check_for_data_files(FILE_PATHS) file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv") count_data = shared.open_data_file( LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"] diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py index 7712b26a..e488d25e 100755 --- a/scripts/2-process/wikipedia_process.py +++ b/scripts/2-process/wikipedia_process.py @@ -28,6 +28,17 @@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) +FILE_PATHS = [ + shared.path_join( + PATHS["data_phase"], "wikipedia_highest_language_usage.csv" + ), + shared.path_join( + PATHS["data_phase"], "wikipedia_least_language_usage.csv" + ), + shared.path_join( + PATHS["data_phase"], "wikipedia_language_representation.csv" + ), +] def parse_arguments(): @@ -63,7 +74,7 @@ def parse_arguments(): return args -def check_for_data_file(file_path): +def check_for_data_files(file_path): if os.path.exists(file_path): raise shared.QuantifyingException( f"Processed data already exists for {QUARTER}", 0 @@ -98,7 +109,6 @@ def process_highest_language_usage(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "wikipedia_highest_language_usage.csv" ) - check_for_data_file(file_path) data_to_csv(args, top_10, file_path) @@ -122,7 +132,6 @@ def process_least_language_usage(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "wikipedia_least_language_usage.csv" ) - check_for_data_file(file_path) data_to_csv(args, bottom_10, file_path) @@ -149,7 +158,6 @@ def process_language_representation(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "wikipedia_language_representation.csv" ) - check_for_data_file(file_path) data_to_csv(args, language_counts, file_path) @@ -157,6 +165,7 @@ def main(): args = parse_arguments() shared.paths_log(LOGGER, PATHS) shared.git_fetch_and_merge(args, PATHS["repo"]) + check_for_data_files(FILE_PATHS) 
file_count = shared.path_join( PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv" ) diff --git a/scripts/shared.py b/scripts/shared.py index 8a3b69cf..db2a3ca7 100644 --- a/scripts/shared.py +++ b/scripts/shared.py @@ -283,14 +283,15 @@ def update_readme( image_caption, entry_text=None, ): + """ + Update the README.md file with the generated images and descriptions. + """ logger = args.logger paths = args.paths ordered_sections = section_order() logger.info("ordered_sections:", ordered_sections) logger.info("section_title:", repr(section_title)) - """ - Update the README.md file with the generated images and descriptions. - """ + if not args.enable_save: return if image_path and not image_caption: @@ -336,8 +337,8 @@ def update_readme( # Sections that should come before this section sections_before = ordered_sections[:current_postion] # we find the last existing section that comes before this section - for prev_section in reversed(sections_before): - prev_end_line = f"\n" + for prev_section_title in reversed(sections_before): + prev_end_line = f"\n" if prev_end_line in lines: insert_index = lines.index(prev_end_line) + 1 break From 241634a90648c68150c9ad8c0d5fbcae8887cfc4 Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Fri, 9 Jan 2026 16:37:38 +0100 Subject: [PATCH 09/10] Made review changes --- scripts/2-process/gcs_process.py | 11 ++++++----- scripts/2-process/github_process.py | 11 ++++++----- scripts/2-process/wikipedia_process.py | 11 ++++++----- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py index 0ee57016..15501692 100755 --- a/scripts/2-process/gcs_process.py +++ b/scripts/2-process/gcs_process.py @@ -73,11 +73,12 @@ def parse_arguments(): return args -def check_for_data_files(file_path): - if os.path.exists(file_path): - raise shared.QuantifyingException( - f"Processed data already exists for {QUARTER}", 0 - ) +def check_for_data_files(file_paths): + for 
path in file_paths: + if os.path.exists(path): + raise shared.QuantifyingException( + f"Processed data already exists for {QUARTER}", 0 + ) def data_to_csv(args, data, file_path): diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py index 192b864b..a43971f7 100755 --- a/scripts/2-process/github_process.py +++ b/scripts/2-process/github_process.py @@ -63,11 +63,12 @@ def parse_arguments(): return args -def check_for_data_files(file_path): - if os.path.exists(file_path): - raise shared.QuantifyingException( - f"Processed data already exists for {QUARTER}", 0 - ) +def check_for_data_files(file_paths): + for path in file_paths: + if os.path.exists(path): + raise shared.QuantifyingException( + f"Processed data already exists for {QUARTER}", 0 + ) def data_to_csv(args, data, file_path): diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py index e488d25e..e0bbf85b 100755 --- a/scripts/2-process/wikipedia_process.py +++ b/scripts/2-process/wikipedia_process.py @@ -74,11 +74,12 @@ def parse_arguments(): return args -def check_for_data_files(file_path): - if os.path.exists(file_path): - raise shared.QuantifyingException( - f"Processed data already exists for {QUARTER}", 0 - ) +def check_for_data_files(file_paths): + for path in file_paths: + if os.path.exists(path): + raise shared.QuantifyingException( + f"Processed data already exists for {QUARTER}", 0 + ) def data_to_csv(args, data, file_path): From 8f4f0792b7de6d7cd1e63ab5e70f4b0f42afe199 Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Sat, 10 Jan 2026 16:56:53 +0100 Subject: [PATCH 10/10] Added the parser argument --force for regenerating processed files --- scripts/2-process/gcs_process.py | 14 +++++++++----- scripts/2-process/github_process.py | 12 +++++++++--- scripts/2-process/wikipedia_process.py | 12 +++++++++--- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/scripts/2-process/gcs_process.py 
b/scripts/2-process/gcs_process.py index 15501692..ab174a45 100755 --- a/scripts/2-process/gcs_process.py +++ b/scripts/2-process/gcs_process.py @@ -59,8 +59,12 @@ def parse_arguments(): parser.add_argument( "--enable-git", action="store_true", - help="Enable git actions such as fetch, merge, add, commit, and push" - " (default: False)", + help="Enable git actions such as fetch, merge, add, commit, and push", + ) + parser.add_argument( + "--force", + action="store_true", + help="Regenerate data even if processed files already exist", ) args = parser.parse_args() if not args.enable_save and args.enable_git: @@ -73,9 +77,9 @@ def parse_arguments(): return args -def check_for_data_files(file_paths): +def check_for_data_files(args, file_paths): for path in file_paths: - if os.path.exists(path): + if os.path.exists(path) and not args.force: raise shared.QuantifyingException( f"Processed data already exists for {QUARTER}", 0 ) @@ -327,7 +331,7 @@ def main(): args = parse_arguments() shared.paths_log(LOGGER, PATHS) shared.git_fetch_and_merge(args, PATHS["repo"]) - check_for_data_files(FILE_PATHS) + check_for_data_files(args, FILE_PATHS) # Count data file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv") diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py index a43971f7..91110d66 100755 --- a/scripts/2-process/github_process.py +++ b/scripts/2-process/github_process.py @@ -52,6 +52,12 @@ def parse_arguments(): help="Enable git actions such as fetch, merge, add, commit, and push" " (default: False)", ) + parser.add_argument( + "--force", + action="store_true", + help="Regenerate data even if processed files already exist", + ) + args = parser.parse_args() if not args.enable_save and args.enable_git: parser.error("--enable-git requires --enable-save") @@ -63,9 +69,9 @@ def parse_arguments(): return args -def check_for_data_files(file_paths): +def check_for_data_files(args, file_paths): for path in file_paths: - if 
os.path.exists(path): + if os.path.exists(path) and not args.force: raise shared.QuantifyingException( f"Processed data already exists for {QUARTER}", 0 ) @@ -144,7 +150,7 @@ def main(): args = parse_arguments() shared.paths_log(LOGGER, PATHS) shared.git_fetch_and_merge(args, PATHS["repo"]) - check_for_data_files(FILE_PATHS) + check_for_data_files(args, FILE_PATHS) file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv") count_data = shared.open_data_file( LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"] diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py index e0bbf85b..b7c6b7a4 100755 --- a/scripts/2-process/wikipedia_process.py +++ b/scripts/2-process/wikipedia_process.py @@ -63,6 +63,12 @@ def parse_arguments(): help="Enable git actions such as fetch, merge, add, commit, and push" " (default: False)", ) + parser.add_argument( + "--force", + action="store_true", + help="Regenerate data even if processed files already exist", + ) + args = parser.parse_args() if not args.enable_save and args.enable_git: parser.error("--enable-git requires --enable-save") @@ -74,9 +80,9 @@ def parse_arguments(): return args -def check_for_data_files(file_paths): +def check_for_data_files(args, file_paths): for path in file_paths: - if os.path.exists(path): + if os.path.exists(path) and not args.force: raise shared.QuantifyingException( f"Processed data already exists for {QUARTER}", 0 ) @@ -166,7 +172,7 @@ def main(): args = parse_arguments() shared.paths_log(LOGGER, PATHS) shared.git_fetch_and_merge(args, PATHS["repo"]) - check_for_data_files(FILE_PATHS) + check_for_data_files(args, FILE_PATHS) file_count = shared.path_join( PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv" )