From 7ff5a69d12e44467f2fa2f012c0747a0542ccfcd Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Wed, 7 Jan 2026 05:27:40 +0100 Subject: [PATCH 01/10] report_automation --- scripts/shared.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/shared.py b/scripts/shared.py index 509801d9..1c9b8b57 100644 --- a/scripts/shared.py +++ b/scripts/shared.py @@ -269,6 +269,9 @@ def setup(current_file): return logger, paths +# def section_order(): + + def update_readme( args, section_title, @@ -299,12 +302,12 @@ def update_readme( readme_path = path_join(paths["data"], args.quarter, "README.md") # Define section markers for each data source - section_start_line = f"\n" - section_end_line = f"\n" + section_start_line = f"\n" + section_end_line = f"\n" # Define entry markers for each plot (optional) and description - entry_start_line = f"\n" - entry_end_line = f"\n" + entry_start_line = f"\n" + entry_end_line = f"\n" if os.path.exists(readme_path): with open(readme_path, "r", encoding="utf-8") as f: From a7645a59b643bf2b3c62574386983496846807e9 Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Wed, 7 Jan 2026 06:42:37 +0100 Subject: [PATCH 02/10] Add function for listing out section order --- .../{gcs_report.py => 01-gcs_report.py} | 2 +- .../{github_report.py => 02-github_report.py} | 2 +- scripts/3-report/03-openverse_report.py | 417 ++++++++++++++++++ ...pedia_report.py => 04-wikipedia_report.py} | 2 +- scripts/3-report/{notes.py => 100-notes.py} | 2 +- scripts/shared.py | 5 +- 6 files changed, 425 insertions(+), 5 deletions(-) rename scripts/3-report/{gcs_report.py => 01-gcs_report.py} (99%) mode change 100755 => 100644 rename scripts/3-report/{github_report.py => 02-github_report.py} (99%) mode change 100755 => 100644 create mode 100644 scripts/3-report/03-openverse_report.py rename scripts/3-report/{wikipedia_report.py => 04-wikipedia_report.py} (99%) mode change 100755 => 100644 rename scripts/3-report/{notes.py => 
100-notes.py} (99%) mode change 100755 => 100644 diff --git a/scripts/3-report/gcs_report.py b/scripts/3-report/01-gcs_report.py old mode 100755 new mode 100644 similarity index 99% rename from scripts/3-report/gcs_report.py rename to scripts/3-report/01-gcs_report.py index 359796a9..17103561 --- a/scripts/3-report/gcs_report.py +++ b/scripts/3-report/01-gcs_report.py @@ -27,7 +27,7 @@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "Google Custom Search (GCS)" +SECTION = "1-gcs_report.py" def parse_arguments(): diff --git a/scripts/3-report/github_report.py b/scripts/3-report/02-github_report.py old mode 100755 new mode 100644 similarity index 99% rename from scripts/3-report/github_report.py rename to scripts/3-report/02-github_report.py index 37979175..34a83b65 --- a/scripts/3-report/github_report.py +++ b/scripts/3-report/02-github_report.py @@ -25,7 +25,7 @@ # Setup LOGGER, PATHS = shared.setup(__file__) QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "GitHub data" +SECTION = "2-github_report.py" def parse_arguments(): diff --git a/scripts/3-report/03-openverse_report.py b/scripts/3-report/03-openverse_report.py new file mode 100644 index 00000000..47d990a5 --- /dev/null +++ b/scripts/3-report/03-openverse_report.py @@ -0,0 +1,417 @@ +#!/usr/bin/env python +""" +This file is dedicated to visualizing and analyzing the data collected +from Openverse. 
+""" +# Standard library +import argparse +import os +import sys +import textwrap +import traceback + +# Third-party +from pygments import highlight +from pygments.formatters import TerminalFormatter +from pygments.lexers import PythonTracebackLexer + +# Add parent directory so shared can be imported +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +# First-party/Local +import plot # noqa: E402 +import shared # noqa: E402 + +# Setup +LOGGER, PATHS = shared.setup(__file__) +QUARTER = os.path.basename(PATHS["data_quarter"]) +SECTION = "3-openverse_report.py" + + +def parse_arguments(): + """ + Parses command-line arguments, returns parsed arguments. + """ + LOGGER.info("Parsing command-line arguments") + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--quarter", + default=QUARTER, + help=f"Data quarter in format YYYYQx (default: {QUARTER})", + ) + parser.add_argument( + "--show-plots", + action="store_true", + help="Show generated plots (default: False)", + ) + parser.add_argument( + "--enable-save", + action="store_true", + help="Enable saving results (default: False)", + ) + parser.add_argument( + "--enable-git", + action="store_true", + help="Enable git actions such as fetch, merge, add, commit, and push" + " (default: False)", + ) + args = parser.parse_args() + if not args.enable_save and args.enable_git: + parser.error("--enable-git requires --enable-save") + if args.quarter != QUARTER: + global PATHS + PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) + args.logger = LOGGER + args.paths = PATHS + return args + + +def openverse_intro(args): + """ + Write Openverse Introduction. 
+ """ + LOGGER.info(openverse_intro.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_1-fetch"], + "openverse_fetch.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "TOOL_IDENTIFIER" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + total = data["MEDIA_COUNT"].sum() + media_counts = data.groupby("MEDIA_TYPE")["MEDIA_COUNT"].sum() + total_media = media_counts.sum() + audio_percentage = ( + f"{(media_counts.get('audio', 0) / total_media) * 100:.2f}" + ) + images_percentage = ( + f"{(media_counts.get('images', 0) / total_media) * 100:.2f}" + ) + unique_sources = data["SOURCE"].nunique() + shared.update_readme( + args, + SECTION, + "Overview", + None, + None, + "The Openverse data, below, uses the `Media_count field`" + " returned by API for search queries of the various legal tools." + "\n" + f" The results indicate that there are {total} count of audio" + " and images that are licensed or put in the" + " public domain using a Creative Commons (CC) legal tool." + " They respectively take a percentage of" + f" {audio_percentage} and {images_percentage}," + " of the total media count returned by the Openverse API." 
+ "\n" + f"There are {unique_sources} count of" + f" data sources under the openverse API.\n" + "\n" + "Thank you Openverse for providing a public API" + " access to its media metadata!", + ) + + +def plot_totals_by_license_type(args): + """ + Create plots showing totals by license type + """ + LOGGER.info(plot_totals_by_license_type.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "openverse_totals_by_license.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "License" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(data_label, ascending=True, inplace=True) + title = "Totals by license type" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join( + PATHS["data_phase"], "openverse_totals_by_license_type.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing Creative Commons (CC) legal tool totals and" + " percentages.", + ) + + +def plot_totals_by_media_type(args): + """ + Create plots showing totals by media type + """ + LOGGER.info(plot_totals_by_media_type.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "openverse_totals_by_media_type.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Media_type" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(name_label, ascending=False, inplace=True) + title = "Totals by media_type" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + 
) + + image_path = shared.path_join( + PATHS["data_phase"], "openverse_totals_by_media_type.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing Creative Commons (CC) legal tool" + " totals by each media type", + ) + + +def plot_totals_by_sources(args): + """ + Create plots showing totals by sources + """ + LOGGER.info(plot_totals_by_sources.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "openverse_totals_by_sources.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Source" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(name_label, ascending=False, inplace=True) + top_10 = data.head(10) + title = "Totals by sources" + plt = plot.combined_plot( + args=args, + data=top_10, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join(PATHS["data_phase"], "openverse_sources.png") + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing Creative Commons (CC) legal tool totals" + " across the top 10 sources returned by openverse API.", + ) + + +def plot_permissive_by_media_type(args): + """ + Create plots showing the count of permissive content by media type + """ + LOGGER.info(plot_permissive_by_media_type.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "openverse_permissive_by_media_type.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + 
name_label = "Media_type" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(name_label, ascending=False, inplace=True) + title = "Permissive content by media type" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join( + PATHS["data_phase"], "openverse_permissive_by_media_type.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing count of permissive content by media type.", + ) + + +def plot_permissive_by_source(args): + """ + Create plots showing count of permissive content by source + """ + LOGGER.info(plot_permissive_by_source.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "openverse_permissive_by_source.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Source" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(name_label, ascending=True, inplace=True) + top_10 = data.head(10) + title = "Permissive by source" + plt = plot.combined_plot( + args=args, + data=top_10, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join( + PATHS["data_phase"], "openverse_permissive_by_source.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing count of permissive content" + " by top 10 sources in openverse.", + ) + + +def 
plot_totals_by_restriction(args): + """ + Create plots showing totals by restriction + """ + LOGGER.info(plot_totals_by_restriction.__doc__.strip()) + file_path = shared.path_join( + PATHS["data_2-process"], + "openverse_totals_by_restriction.csv", + ) + LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + name_label = "Category" + data_label = "Count" + data = shared.open_data_file(LOGGER, file_path, index_col=name_label) + data.sort_values(name_label, ascending=False, inplace=True) + title = "Totals by restriction" + plt = plot.combined_plot( + args=args, + data=data, + title=title, + name_label=name_label, + data_label=data_label, + ) + + image_path = shared.path_join( + PATHS["data_phase"], "openverse_restriction.png" + ) + LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") + if args.enable_save: + # Create the directory if it does not exist + os.makedirs(PATHS["data_phase"], exist_ok=True) + plt.savefig(image_path) + + shared.update_readme( + args, + SECTION, + title, + image_path, + "Plots showing totals by different levels of rights reserved" + " on openverse media contents." 
+ " This shows the distribution of Public domain," + " Permissive, Copyleft and restricted" + " licenses used in Openverse media contents.", + ) + + +def main(): + args = parse_arguments() + shared.paths_log(LOGGER, PATHS) + shared.git_fetch_and_merge(args, PATHS["repo"]) + openverse_intro(args) + plot_totals_by_license_type(args) + plot_totals_by_media_type(args) + plot_permissive_by_media_type(args) + plot_permissive_by_source(args) + plot_totals_by_restriction(args) + + # Add and commit changes + args = shared.git_add_and_commit( + args, + PATHS["repo"], + PATHS["data_quarter"], + f"Add and commit Openverse reports for {QUARTER}", + ) + shared.git_push_changes(args, PATHS["repo"]) + + +if __name__ == "__main__": + try: + main() + except shared.QuantifyingException as e: + if e.exit_code == 0: + LOGGER.info(e.message) + else: + LOGGER.error(e.message) + sys.exit(e.exit_code) + except SystemExit as e: + if e.code != 0: + LOGGER.error(f"System exit with code: {e.code}") + sys.exit(e.code) + except KeyboardInterrupt: + LOGGER.info("(130) Halted via KeyboardInterrupt.") + sys.exit(130) + except Exception: + traceback_formatted = textwrap.indent( + highlight( + traceback.format_exc(), + PythonTracebackLexer(), + TerminalFormatter(), + ), + " ", + ) + LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}") + sys.exit(1) diff --git a/scripts/3-report/wikipedia_report.py b/scripts/3-report/04-wikipedia_report.py old mode 100755 new mode 100644 similarity index 99% rename from scripts/3-report/wikipedia_report.py rename to scripts/3-report/04-wikipedia_report.py index 83a92fa3..7957c26f --- a/scripts/3-report/wikipedia_report.py +++ b/scripts/3-report/04-wikipedia_report.py @@ -25,7 +25,7 @@ # Setup LOGGER, PATHS = shared.setup(__file__) QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "Wikipedia data" +SECTION = "4-wikipedia_report.py" def parse_arguments(): diff --git a/scripts/3-report/notes.py b/scripts/3-report/100-notes.py old mode 100755 new 
mode 100644 similarity index 99% rename from scripts/3-report/notes.py rename to scripts/3-report/100-notes.py index ccefd058..97cd8a4d --- a/scripts/3-report/notes.py +++ b/scripts/3-report/100-notes.py @@ -25,7 +25,7 @@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "Notes" +SECTION = "100-notes.py" def parse_arguments(): diff --git a/scripts/shared.py b/scripts/shared.py index 1c9b8b57..2b28c854 100644 --- a/scripts/shared.py +++ b/scripts/shared.py @@ -269,7 +269,10 @@ def setup(current_file): return logger, paths -# def section_order(): +def section_order(): + report_dir = os.path.join(os.path.dirname(__file__), ".") + report_files = os.listdir(report_dir) + return report_files def update_readme( From 297dc8162cb300c68a4dcbbdfce307ee7359bd55 Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Wed, 7 Jan 2026 21:56:39 +0100 Subject: [PATCH 03/10] process_script --- scripts/2-process/openverse_process.py | 277 +++++++++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 scripts/2-process/openverse_process.py diff --git a/scripts/2-process/openverse_process.py b/scripts/2-process/openverse_process.py new file mode 100644 index 00000000..e660b7b8 --- /dev/null +++ b/scripts/2-process/openverse_process.py @@ -0,0 +1,277 @@ +#!/usr/bin/env python +""" +This file is dedicated to processing Openverse data +for analysis and comparison between quarters. +""" +# Standard library +import argparse +import csv +import os +import sys +import traceback +from collections import defaultdict + +# Third-party +import pandas as pd + +# Add parent directory so shared can be imported +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) + +# First-party/Local +import shared # noqa: E402 + +# Setup +LOGGER, PATHS = shared.setup(__file__) + +# Constants +QUARTER = os.path.basename(PATHS["data_quarter"]) + + +def parse_arguments(): + """ + Parse command-line options, returns parsed argument namespace. 
+ """ + LOGGER.info("Parsing command-line options") + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--quarter", + default=QUARTER, + help=f"Data quarter in format YYYYQx (default: {QUARTER})", + ) + parser.add_argument( + "--enable-save", + action="store_true", + help="Enable saving results (default: False)", + ) + parser.add_argument( + "--enable-git", + action="store_true", + help="Enable git actions such as fetch, merge, add, commit, and push" + " (default: False)", + ) + args = parser.parse_args() + if not args.enable_save and args.enable_git: + parser.error("--enable-git requires --enable-save") + if args.quarter != QUARTER: + global PATHS + PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) + args.logger = LOGGER + args.paths = PATHS + return args + + +def check_for_data_file(file_path): + if os.path.exists(file_path): + raise shared.QuantifyingException( + f"Processed data already exists for {QUARTER}", 0 + ) + + +def data_to_csv(args, data, file_path): + if not args.enable_save: + return + os.makedirs(PATHS["data_phase"], exist_ok=True) + # emulate csv.unix_dialect + data.to_csv( + file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n" + ) + + +def process_totals_by_license(args, count_data): + """ + Processing count data: totals by license + """ + LOGGER.info(process_totals_by_license.__doc__.strip()) + data = defaultdict(int) + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + count = int(row.MEDIA_COUNT) + + data[tool] += count + data = pd.DataFrame(data.items(), columns=["License", "Count"]) + data.sort_values("License", ascending=True, inplace=True) + data.reset_index(drop=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "openverse_totals_by_license.csv" + ) + check_for_data_file(file_path) + data_to_csv(args, data, file_path) + + +def process_totals_by_media_type(args, count_data): + """ + Processing count data: totals by media 
type + """ + + LOGGER.info(process_totals_by_media_type.__doc__.strip()) + data = defaultdict(int) + + for row in count_data.itertuples(index=False): + media_type = str(row.MEDIA_TYPE) + count = int(row.MEDIA_COUNT) + + data[media_type] += count + data = pd.DataFrame(data.items(), columns=["Media_type", "Count"]) + data.sort_values("Media_type", ascending=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "openverse_totals_by_media_type.csv" + ) + check_for_data_file(file_path) + data_to_csv(args, data, file_path) + + +def process_totals_by_source(args, count_data): + """ + Processing count data: totals by source + """ + LOGGER.info(process_totals_by_source.__doc__.strip()) + data = defaultdict(int) + for row in count_data.itertuples(index=False): + source = str(row.SOURCE) + count = int(row.MEDIA_COUNT) + + data[source] += count + data = pd.DataFrame(data.items(), columns=["Source", "Count"]) + data.sort_values("Source", ascending=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "openverse_totals_by_source.csv" + ) + check_for_data_file(file_path) + data_to_csv(args, data, file_path) + + +def process_permissive_by_media_type(args, count_data): + """ + Processing count data: permissive by media type + """ + LOGGER.info(process_permissive_by_media_type.__doc__.strip()) + + data = defaultdict(int) + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + media_type = str(row.MEDIA_TYPE) + count = int(row.MEDIA_COUNT) + + if tool in ["CC0", "CC BY", "CC BY-SA"]: + data[media_type] += count + + data = pd.DataFrame(data.items(), columns=["Media_type", "Count"]) + data.sort_values("Media_type", ascending=True, inplace=True) + + file_path = shared.path_join( + PATHS["data_phase"], "openverse_permissive_by_media_type.csv" + ) + check_for_data_file(file_path) + data_to_csv(args, data, file_path) + + +def process_permissive_by_source(args, count_data): + """ + Processing count data: permissive 
content by source + """ + LOGGER.info(process_permissive_by_source.__doc__.strip()) + data = defaultdict(int) + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + source = str(row.SOURCE) + count = int(row.MEDIA_COUNT) + if tool in ["CC0", "CC BY", "CC BY-SA"]: + data[source] += count + data = pd.DataFrame(data.items(), columns=["Source", "Count"]) + data.sort_values("Source", ascending=True, inplace=True) + file_path = shared.path_join( + PATHS["data_phase"], "openverse_permissive_by_source.csv" + ) + check_for_data_file(file_path) + data_to_csv(args, data, file_path) + + +def process_totals_by_restriction(args, count_data): + """ + Processing count data: totals by restriction + """ + # https://creativecommons.org/public-domain/freeworks/ + LOGGER.info(process_totals_by_restriction.__doc__.strip()) + + data = { + "Copyleft": 0, + "Permissive": 0, + "Public domain": 0, + "Restricted": 0, + } + + for row in count_data.itertuples(index=False): + tool = str(row.TOOL_IDENTIFIER) + count = int(row.MEDIA_COUNT) + + if tool in ["CC0", "PDM"]: + key = "Public domain" + + elif tool in ["CC BY"]: + key = "Permissive" + + elif tool in ["CC BY-SA"]: + key = "Copyleft" + + else: + key = "Restricted" + + data[key] += count + + data = pd.DataFrame(data.items(), columns=["Category", "Count"]) + data.sort_values("Category", ascending=True, inplace=True) + + file_path = shared.path_join( + PATHS["data_phase"], "openverse_totals_by_restriction.csv" + ) + check_for_data_file(file_path) + data_to_csv(args, data, file_path) + + +def main(): + args = parse_arguments() + shared.paths_log(LOGGER, PATHS) + shared.git_fetch_and_merge(args, PATHS["repo"]) + + file_count = shared.path_join(PATHS["data_1-fetch"], "openverse_fetch.csv") + count_data = shared.open_data_file( + LOGGER, + file_count, + usecols=["SOURCE", "MEDIA_TYPE", "TOOL_IDENTIFIER", "MEDIA_COUNT"], + ) + process_totals_by_license(args, count_data) + process_totals_by_media_type(args, count_data) 
+ process_totals_by_source(args, count_data) + process_permissive_by_media_type(args, count_data) + process_permissive_by_source(args, count_data) + process_totals_by_restriction(args, count_data) + # Push changes + args = shared.git_add_and_commit( + args, + PATHS["repo"], + PATHS["data_quarter"], + f"Add and commit new GitHub data for {QUARTER}", + ) + shared.git_push_changes(args, PATHS["repo"]) + + +if __name__ == "__main__": + try: + main() + except shared.QuantifyingException as e: + if e.exit_code == 0: + LOGGER.info(e.message) + else: + LOGGER.error(e.message) + sys.exit(e.code) + except SystemExit as e: + LOGGER.error(f"System exit with code: {e.code}") + sys.exit(e.code) + except KeyboardInterrupt: + LOGGER.info("(130) Halted via KeyboardInterrupt.") + sys.exit(130) + except Exception: + LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") + sys.exit(1) From 9f8ffd8cca5efac10e91df7f0a453f9556c57543 Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Wed, 7 Jan 2026 22:00:25 +0100 Subject: [PATCH 04/10] made review changes --- scripts/3-report/{01-gcs_report.py => gcs_report.py} | 3 ++- scripts/3-report/{02-github_report.py => github_report.py} | 3 ++- .../3-report/{03-openverse_report.py => openverse_report.py} | 3 ++- .../3-report/{04-wikipedia_report.py => wikipedia_report.py} | 3 ++- scripts/3-report/{100-notes.py => zzz-notes.py} | 3 ++- 5 files changed, 10 insertions(+), 5 deletions(-) rename scripts/3-report/{01-gcs_report.py => gcs_report.py} (99%) rename scripts/3-report/{02-github_report.py => github_report.py} (99%) rename scripts/3-report/{03-openverse_report.py => openverse_report.py} (99%) rename scripts/3-report/{04-wikipedia_report.py => wikipedia_report.py} (99%) rename scripts/3-report/{100-notes.py => zzz-notes.py} (98%) diff --git a/scripts/3-report/01-gcs_report.py b/scripts/3-report/gcs_report.py similarity index 99% rename from scripts/3-report/01-gcs_report.py rename to scripts/3-report/gcs_report.py index 
17103561..eb2a4581 100644 --- a/scripts/3-report/01-gcs_report.py +++ b/scripts/3-report/gcs_report.py @@ -9,6 +9,7 @@ import sys import textwrap import traceback +from pathlib import Path # Third-party from pygments import highlight @@ -27,7 +28,7 @@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "1-gcs_report.py" +SECTION = Path(__file__).name def parse_arguments(): diff --git a/scripts/3-report/02-github_report.py b/scripts/3-report/github_report.py similarity index 99% rename from scripts/3-report/02-github_report.py rename to scripts/3-report/github_report.py index 34a83b65..b2a18890 100644 --- a/scripts/3-report/02-github_report.py +++ b/scripts/3-report/github_report.py @@ -9,6 +9,7 @@ import sys import textwrap import traceback +from pathlib import Path # Third-party from pygments import highlight @@ -25,7 +26,7 @@ # Setup LOGGER, PATHS = shared.setup(__file__) QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "2-github_report.py" +SECTION = Path(__file__).name def parse_arguments(): diff --git a/scripts/3-report/03-openverse_report.py b/scripts/3-report/openverse_report.py similarity index 99% rename from scripts/3-report/03-openverse_report.py rename to scripts/3-report/openverse_report.py index 47d990a5..2347ae28 100644 --- a/scripts/3-report/03-openverse_report.py +++ b/scripts/3-report/openverse_report.py @@ -9,6 +9,7 @@ import sys import textwrap import traceback +from pathlib import Path # Third-party from pygments import highlight @@ -25,7 +26,7 @@ # Setup LOGGER, PATHS = shared.setup(__file__) QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "3-openverse_report.py" +SECTION = Path(__file__).name def parse_arguments(): diff --git a/scripts/3-report/04-wikipedia_report.py b/scripts/3-report/wikipedia_report.py similarity index 99% rename from scripts/3-report/04-wikipedia_report.py rename to scripts/3-report/wikipedia_report.py index 7957c26f..96f0cf46 100644 --- 
a/scripts/3-report/04-wikipedia_report.py +++ b/scripts/3-report/wikipedia_report.py @@ -9,6 +9,7 @@ import sys import textwrap import traceback +from pathlib import Path # Third-party from pygments import highlight @@ -25,7 +26,7 @@ # Setup LOGGER, PATHS = shared.setup(__file__) QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "4-wikipedia_report.py" +SECTION = Path(__file__).name def parse_arguments(): diff --git a/scripts/3-report/100-notes.py b/scripts/3-report/zzz-notes.py similarity index 98% rename from scripts/3-report/100-notes.py rename to scripts/3-report/zzz-notes.py index 97cd8a4d..d9d4aa93 100644 --- a/scripts/3-report/100-notes.py +++ b/scripts/3-report/zzz-notes.py @@ -8,6 +8,7 @@ import sys import textwrap import traceback +from pathlib import Path # Third-party from pygments import highlight @@ -25,7 +26,7 @@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = "100-notes.py" +SECTION = Path(__file__).name def parse_arguments(): From 16cb9c9839fa6589e2eb4cb5e978a1eef9ea0214 Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Thu, 8 Jan 2026 13:43:40 +0100 Subject: [PATCH 05/10] made new changes --- scripts/shared.py | 55 ++++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/scripts/shared.py b/scripts/shared.py index 2b28c854..8a3b69cf 100644 --- a/scripts/shared.py +++ b/scripts/shared.py @@ -270,7 +270,7 @@ def setup(current_file): def section_order(): - report_dir = os.path.join(os.path.dirname(__file__), ".") + report_dir = os.path.join(os.path.dirname(__file__), "3-report") report_files = os.listdir(report_dir) return report_files @@ -283,6 +283,11 @@ def update_readme( image_caption, entry_text=None, ): + logger = args.logger + paths = args.paths + ordered_sections = section_order() + logger.info("ordered_sections:", ordered_sections) + logger.info("section_title:", repr(section_title)) """ Update the README.md file with the generated images and 
descriptions. """ @@ -299,9 +304,6 @@ def update_readme( " caption is provided" ) - logger = args.logger - paths = args.paths - readme_path = path_join(paths["data"], args.quarter, "README.md") # Define section markers for each data source @@ -324,26 +326,39 @@ def update_readme( lines.insert(0, title_line) lines.insert(1, "\n") - # We only need to know the position of the end to append new entries + # Locate the data source section if it is already present if section_start_line in lines: - # Locate the data source section if it is already present section_end_index = lines.index(section_end_line) else: - # Add the data source section if it is absent - lines.extend( - [ - f"{section_start_line}", - "\n", - "\n", - f"## {section_title}\n", - "\n", - "\n", - f"{section_end_line}", - "\n", - ] - ) - section_end_index = lines.index(section_end_line) + insert_index = None + # If not present, we find the position to insert the section + current_postion = ordered_sections.index(section_title) + # Sections that should come before this section + sections_before = ordered_sections[:current_postion] + # we find the last existing section that comes before this section + for prev_section in reversed(sections_before): + prev_end_line = f"\n" + if prev_end_line in lines: + insert_index = lines.index(prev_end_line) + 1 + break + # If none exist, insert at the top (after README title) + if insert_index is None: + insert_index = 2 if len(lines) >= 2 else len(lines) + # Insert the new data source section at correct position + new_section_line = [ + f"{section_start_line}", + "\n", + "\n", + f"## {section_title}\n", + "\n", + "\n", + f"{section_end_line}", + "\n", + ] + # Insert the section at the correct position + lines = lines[:insert_index] + new_section_line + lines[insert_index:] + section_end_index = lines.index(section_end_line) # Locate the entry if it is already present if entry_start_line in lines: entry_start_index = lines.index(entry_start_line) From 
5ae1584477209577133f88b1242d0d097263a44d Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Thu, 8 Jan 2026 18:30:53 +0100 Subject: [PATCH 06/10] Added check_for_data_file function for gcs_process.py --- scripts/2-process/gcs_process.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py index fefbba0f..786d8329 100755 --- a/scripts/2-process/gcs_process.py +++ b/scripts/2-process/gcs_process.py @@ -62,6 +62,13 @@ def parse_arguments(): return args +def check_for_data_file(file_path): + if os.path.exists(file_path): + raise shared.QuantifyingException( + f"Processed data already exists for {QUARTER}", 0 + ) + + def data_to_csv(args, data, file_path): if not args.enable_save: return @@ -111,6 +118,7 @@ def process_product_totals(args, count_data): data.items(), columns=["CC legal tool product", "Count"] ) file_path = shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv") + check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -192,7 +200,8 @@ def process_latest_prior_retired_totals(args, count_data): file_path = shared.path_join( PATHS["data_phase"], f"gcs_status_{key}_totals.csv" ) - data_to_csv(args, dataframe, file_path) + check_for_data_file(file_path) + data_to_csv(args, dataframe, file_path) def process_totals_by_free_cultural(args, count_data): @@ -225,6 +234,7 @@ def process_totals_by_free_cultural(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "gcs_totals_by_free_cultural.csv" ) + check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -259,6 +269,7 @@ def process_totals_by_restrictions(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "gcs_totals_by_restrictions.csv" ) + check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -280,6 +291,7 @@ def process_totals_by_language(args, data): file_path = shared.path_join( PATHS["data_phase"], "gcs_totals_by_language.csv" 
) + check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -301,6 +313,7 @@ def process_totals_by_country(args, data): file_path = shared.path_join( PATHS["data_phase"], "gcs_totals_by_country.csv" ) + check_for_data_file(file_path) data_to_csv(args, data, file_path) From 80b647501933bbd089993bb822b64c7b00dafaae Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Fri, 9 Jan 2026 13:53:32 +0100 Subject: [PATCH 07/10] Remove openverse process and report --- scripts/2-process/openverse_process.py | 277 ---------------- scripts/3-report/openverse_report.py | 418 ------------------------- 2 files changed, 695 deletions(-) delete mode 100644 scripts/2-process/openverse_process.py delete mode 100644 scripts/3-report/openverse_report.py diff --git a/scripts/2-process/openverse_process.py b/scripts/2-process/openverse_process.py deleted file mode 100644 index e660b7b8..00000000 --- a/scripts/2-process/openverse_process.py +++ /dev/null @@ -1,277 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to processing Openverse data -for analysis and comparison between quarters. -""" -# Standard library -import argparse -import csv -import os -import sys -import traceback -from collections import defaultdict - -# Third-party -import pandas as pd - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) - -# Constants -QUARTER = os.path.basename(PATHS["data_quarter"]) - - -def parse_arguments(): - """ - Parse command-line options, returns parsed argument namespace. 
- """ - LOGGER.info("Parsing command-line options") - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--quarter", - default=QUARTER, - help=f"Data quarter in format YYYYQx (default: {QUARTER})", - ) - parser.add_argument( - "--enable-save", - action="store_true", - help="Enable saving results (default: False)", - ) - parser.add_argument( - "--enable-git", - action="store_true", - help="Enable git actions such as fetch, merge, add, commit, and push" - " (default: False)", - ) - args = parser.parse_args() - if not args.enable_save and args.enable_git: - parser.error("--enable-git requires --enable-save") - if args.quarter != QUARTER: - global PATHS - PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) - args.logger = LOGGER - args.paths = PATHS - return args - - -def check_for_data_file(file_path): - if os.path.exists(file_path): - raise shared.QuantifyingException( - f"Processed data already exists for {QUARTER}", 0 - ) - - -def data_to_csv(args, data, file_path): - if not args.enable_save: - return - os.makedirs(PATHS["data_phase"], exist_ok=True) - # emulate csv.unix_dialect - data.to_csv( - file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n" - ) - - -def process_totals_by_license(args, count_data): - """ - Processing count data: totals by license - """ - LOGGER.info(process_totals_by_license.__doc__.strip()) - data = defaultdict(int) - - for row in count_data.itertuples(index=False): - tool = str(row.TOOL_IDENTIFIER) - count = int(row.MEDIA_COUNT) - - data[tool] += count - data = pd.DataFrame(data.items(), columns=["License", "Count"]) - data.sort_values("License", ascending=True, inplace=True) - data.reset_index(drop=True, inplace=True) - file_path = shared.path_join( - PATHS["data_phase"], "openverse_totals_by_license.csv" - ) - check_for_data_file(file_path) - data_to_csv(args, data, file_path) - - -def process_totals_by_media_type(args, count_data): - """ - Processing count data: totals by media 
type - """ - - LOGGER.info(process_totals_by_media_type.__doc__.strip()) - data = defaultdict(int) - - for row in count_data.itertuples(index=False): - media_type = str(row.MEDIA_TYPE) - count = int(row.MEDIA_COUNT) - - data[media_type] += count - data = pd.DataFrame(data.items(), columns=["Media_type", "Count"]) - data.sort_values("Media_type", ascending=True, inplace=True) - file_path = shared.path_join( - PATHS["data_phase"], "openverse_totals_by_media_type.csv" - ) - check_for_data_file(file_path) - data_to_csv(args, data, file_path) - - -def process_totals_by_source(args, count_data): - """ - Processing count data: totals by source - """ - LOGGER.info(process_totals_by_source.__doc__.strip()) - data = defaultdict(int) - for row in count_data.itertuples(index=False): - source = str(row.SOURCE) - count = int(row.MEDIA_COUNT) - - data[source] += count - data = pd.DataFrame(data.items(), columns=["Source", "Count"]) - data.sort_values("Source", ascending=True, inplace=True) - file_path = shared.path_join( - PATHS["data_phase"], "openverse_totals_by_source.csv" - ) - check_for_data_file(file_path) - data_to_csv(args, data, file_path) - - -def process_permissive_by_media_type(args, count_data): - """ - Processing count data: permissive by media type - """ - LOGGER.info(process_permissive_by_media_type.__doc__.strip()) - - data = defaultdict(int) - - for row in count_data.itertuples(index=False): - tool = str(row.TOOL_IDENTIFIER) - media_type = str(row.MEDIA_TYPE) - count = int(row.MEDIA_COUNT) - - if tool in ["CC0", "CC BY", "CC BY-SA"]: - data[media_type] += count - - data = pd.DataFrame(data.items(), columns=["Media_type", "Count"]) - data.sort_values("Media_type", ascending=True, inplace=True) - - file_path = shared.path_join( - PATHS["data_phase"], "openverse_permissive_by_media_type.csv" - ) - check_for_data_file(file_path) - data_to_csv(args, data, file_path) - - -def process_permissive_by_source(args, count_data): - """ - Processing count data: permissive 
content by source - """ - LOGGER.info(process_permissive_by_source.__doc__.strip()) - data = defaultdict(int) - for row in count_data.itertuples(index=False): - tool = str(row.TOOL_IDENTIFIER) - source = str(row.SOURCE) - count = int(row.MEDIA_COUNT) - if tool in ["CC0", "CC BY", "CC BY-SA"]: - data[source] += count - data = pd.DataFrame(data.items(), columns=["Source", "Count"]) - data.sort_values("Source", ascending=True, inplace=True) - file_path = shared.path_join( - PATHS["data_phase"], "openverse_permissive_by_source.csv" - ) - check_for_data_file(file_path) - data_to_csv(args, data, file_path) - - -def process_totals_by_restriction(args, count_data): - """ - Processing count data: totals by restriction - """ - # https://creativecommons.org/public-domain/freeworks/ - LOGGER.info(process_totals_by_restriction.__doc__.strip()) - - data = { - "Copyleft": 0, - "Permissive": 0, - "Public domain": 0, - "Restricted": 0, - } - - for row in count_data.itertuples(index=False): - tool = str(row.TOOL_IDENTIFIER) - count = int(row.MEDIA_COUNT) - - if tool in ["CC0", "PDM"]: - key = "Public domain" - - elif tool in ["CC BY"]: - key = "Permissive" - - elif tool in ["CC BY-SA"]: - key = "Copyleft" - - else: - key = "Restricted" - - data[key] += count - - data = pd.DataFrame(data.items(), columns=["Category", "Count"]) - data.sort_values("Category", ascending=True, inplace=True) - - file_path = shared.path_join( - PATHS["data_phase"], "openverse_totals_by_restriction.csv" - ) - check_for_data_file(file_path) - data_to_csv(args, data, file_path) - - -def main(): - args = parse_arguments() - shared.paths_log(LOGGER, PATHS) - shared.git_fetch_and_merge(args, PATHS["repo"]) - - file_count = shared.path_join(PATHS["data_1-fetch"], "openverse_fetch.csv") - count_data = shared.open_data_file( - LOGGER, - file_count, - usecols=["SOURCE", "MEDIA_TYPE", "TOOL_IDENTIFIER", "MEDIA_COUNT"], - ) - process_totals_by_license(args, count_data) - process_totals_by_media_type(args, count_data) 
- process_totals_by_source(args, count_data) - process_permissive_by_media_type(args, count_data) - process_permissive_by_source(args, count_data) - process_totals_by_restriction(args, count_data) - # Push changes - args = shared.git_add_and_commit( - args, - PATHS["repo"], - PATHS["data_quarter"], - f"Add and commit new GitHub data for {QUARTER}", - ) - shared.git_push_changes(args, PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.code) - except SystemExit as e: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - LOGGER.exception(f"(1) Unhandled exception: {traceback.format_exc()}") - sys.exit(1) diff --git a/scripts/3-report/openverse_report.py b/scripts/3-report/openverse_report.py deleted file mode 100644 index 2347ae28..00000000 --- a/scripts/3-report/openverse_report.py +++ /dev/null @@ -1,418 +0,0 @@ -#!/usr/bin/env python -""" -This file is dedicated to visualizing and analyzing the data collected -from Openverse. -""" -# Standard library -import argparse -import os -import sys -import textwrap -import traceback -from pathlib import Path - -# Third-party -from pygments import highlight -from pygments.formatters import TerminalFormatter -from pygments.lexers import PythonTracebackLexer - -# Add parent directory so shared can be imported -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) - -# First-party/Local -import plot # noqa: E402 -import shared # noqa: E402 - -# Setup -LOGGER, PATHS = shared.setup(__file__) -QUARTER = os.path.basename(PATHS["data_quarter"]) -SECTION = Path(__file__).name - - -def parse_arguments(): - """ - Parses command-line arguments, returns parsed arguments. 
- """ - LOGGER.info("Parsing command-line arguments") - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--quarter", - default=QUARTER, - help=f"Data quarter in format YYYYQx (default: {QUARTER})", - ) - parser.add_argument( - "--show-plots", - action="store_true", - help="Show generated plots (default: False)", - ) - parser.add_argument( - "--enable-save", - action="store_true", - help="Enable saving results (default: False)", - ) - parser.add_argument( - "--enable-git", - action="store_true", - help="Enable git actions such as fetch, merge, add, commit, and push" - " (default: False)", - ) - args = parser.parse_args() - if not args.enable_save and args.enable_git: - parser.error("--enable-git requires --enable-save") - if args.quarter != QUARTER: - global PATHS - PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter) - args.logger = LOGGER - args.paths = PATHS - return args - - -def openverse_intro(args): - """ - Write Openverse Introduction. - """ - LOGGER.info(openverse_intro.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_1-fetch"], - "openverse_fetch.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "TOOL_IDENTIFIER" - data = shared.open_data_file(LOGGER, file_path, index_col=name_label) - total = data["MEDIA_COUNT"].sum() - media_counts = data.groupby("MEDIA_TYPE")["MEDIA_COUNT"].sum() - total_media = media_counts.sum() - audio_percentage = ( - f"{(media_counts.get('audio', 0) / total_media) * 100:.2f}" - ) - images_percentage = ( - f"{(media_counts.get('images', 0) / total_media) * 100:.2f}" - ) - unique_sources = data["SOURCE"].nunique() - shared.update_readme( - args, - SECTION, - "Overview", - None, - None, - "The Openverse data, below, uses the `Media_count field`" - " returned by API for search queries of the various legal tools." 
- "\n" - f" The results indicate that there are {total} count of audio" - " and images that are licensed or put in the" - " public domain using a Creative Commons (CC) legal tool." - " They respectively take a percentage of" - f" {audio_percentage} and {images_percentage}," - " of the total media count returned by the Openverse API." - "\n" - f"There are {unique_sources} count of" - f" data sources under the openverse API.\n" - "\n" - "Thank you Openverse for providing a public API" - " access to its media metadata!", - ) - - -def plot_totals_by_license_type(args): - """ - Create plots showing totals by license type - """ - LOGGER.info(plot_totals_by_license_type.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "openverse_totals_by_license.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "License" - data_label = "Count" - data = shared.open_data_file(LOGGER, file_path, index_col=name_label) - data.sort_values(data_label, ascending=True, inplace=True) - title = "Totals by license type" - plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label=data_label, - ) - - image_path = shared.path_join( - PATHS["data_phase"], "openverse_totals_by_license_type.png" - ) - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing Creative Commons (CC) legal tool totals and" - " percentages.", - ) - - -def plot_totals_by_media_type(args): - """ - Create plots showing totals by media type - """ - LOGGER.info(plot_totals_by_media_type.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "openverse_totals_by_media_type.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - 
name_label = "Media_type" - data_label = "Count" - data = shared.open_data_file(LOGGER, file_path, index_col=name_label) - data.sort_values(name_label, ascending=False, inplace=True) - title = "Totals by media_type" - plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label=data_label, - ) - - image_path = shared.path_join( - PATHS["data_phase"], "openverse_totals_by_media_type.png" - ) - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing Creative Commons (CC) legal tool" - " totals by each media type", - ) - - -def plot_totals_by_sources(args): - """ - Create plots showing totals by sources - """ - LOGGER.info(plot_totals_by_sources.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "openverse_totals_by_sources.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "Source" - data_label = "Count" - data = shared.open_data_file(LOGGER, file_path, index_col=name_label) - data.sort_values(name_label, ascending=False, inplace=True) - top_10 = data.head(10) - title = "Totals by sources" - plt = plot.combined_plot( - args=args, - data=top_10, - title=title, - name_label=name_label, - data_label=data_label, - ) - - image_path = shared.path_join(PATHS["data_phase"], "openverse_sources.png") - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing Creative Commons (CC) legal tool totals" - " across the top 10 sources returned by openverse API.", - ) - - -def 
plot_permissive_by_media_type(args): - """ - Create plots showing the count of permissive content by media type - """ - LOGGER.info(plot_permissive_by_media_type.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "openverse_permissive_by_media_type.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "Media_type" - data_label = "Count" - data = shared.open_data_file(LOGGER, file_path, index_col=name_label) - data.sort_values(name_label, ascending=False, inplace=True) - title = "Permissive content by media type" - plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label=data_label, - ) - - image_path = shared.path_join( - PATHS["data_phase"], "openverse_permissive_by_media_type.png" - ) - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing count of permissive content by media type.", - ) - - -def plot_permissive_by_source(args): - """ - Create plots showing count of permissive content by source - """ - LOGGER.info(plot_permissive_by_source.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "openverse_permissive_by_source.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "Source" - data_label = "Count" - data = shared.open_data_file(LOGGER, file_path, index_col=name_label) - data.sort_values(name_label, ascending=True, inplace=True) - top_10 = data.head(10) - title = "Permissive by source" - plt = plot.combined_plot( - args=args, - data=top_10, - title=title, - name_label=name_label, - data_label=data_label, - ) - - image_path = shared.path_join( - PATHS["data_phase"], "openverse_permissive_by_source.png" - ) - LOGGER.info(f"image file: 
{image_path.replace(PATHS['repo'], '.')}") - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing count of permissive content" - " by top 10 sources in openverse.", - ) - - -def plot_totals_by_restriction(args): - """ - Create plots showing totals by restriction - """ - LOGGER.info(plot_totals_by_restriction.__doc__.strip()) - file_path = shared.path_join( - PATHS["data_2-process"], - "openverse_totals_by_restriction.csv", - ) - LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") - name_label = "Category" - data_label = "Count" - data = shared.open_data_file(LOGGER, file_path, index_col=name_label) - data.sort_values(name_label, ascending=False, inplace=True) - title = "Totals by restriction" - plt = plot.combined_plot( - args=args, - data=data, - title=title, - name_label=name_label, - data_label=data_label, - ) - - image_path = shared.path_join( - PATHS["data_phase"], "openverse_restriction.png" - ) - LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}") - if args.enable_save: - # Create the directory if it does not exist - os.makedirs(PATHS["data_phase"], exist_ok=True) - plt.savefig(image_path) - - shared.update_readme( - args, - SECTION, - title, - image_path, - "Plots showing totals by different levels of rights reserved" - " on openverse media contents." 
- " This shows the distribution of Public domain," - " Permissive, Copyleft and restricted" - " licenses used in Openverse media contents.", - ) - - -def main(): - args = parse_arguments() - shared.paths_log(LOGGER, PATHS) - shared.git_fetch_and_merge(args, PATHS["repo"]) - openverse_intro(args) - plot_totals_by_license_type(args) - plot_totals_by_media_type(args) - plot_permissive_by_media_type(args) - plot_permissive_by_source(args) - plot_totals_by_restriction(args) - - # Add and commit changes - args = shared.git_add_and_commit( - args, - PATHS["repo"], - PATHS["data_quarter"], - f"Add and commit Openverse reports for {QUARTER}", - ) - shared.git_push_changes(args, PATHS["repo"]) - - -if __name__ == "__main__": - try: - main() - except shared.QuantifyingException as e: - if e.exit_code == 0: - LOGGER.info(e.message) - else: - LOGGER.error(e.message) - sys.exit(e.exit_code) - except SystemExit as e: - if e.code != 0: - LOGGER.error(f"System exit with code: {e.code}") - sys.exit(e.code) - except KeyboardInterrupt: - LOGGER.info("(130) Halted via KeyboardInterrupt.") - sys.exit(130) - except Exception: - traceback_formatted = textwrap.indent( - highlight( - traceback.format_exc(), - PythonTracebackLexer(), - TerminalFormatter(), - ), - " ", - ) - LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}") - sys.exit(1) From 7eba804e508f7972cd42ba4d9ab08a2dab6aed3e Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Fri, 9 Jan 2026 15:25:05 +0100 Subject: [PATCH 08/10] Made review changes --- scripts/2-process/gcs_process.py | 22 ++++++++++++++-------- scripts/2-process/github_process.py | 10 ++++++---- scripts/2-process/wikipedia_process.py | 17 +++++++++++++---- scripts/shared.py | 11 ++++++----- 4 files changed, 39 insertions(+), 21 deletions(-) diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py index 786d8329..0ee57016 100755 --- a/scripts/2-process/gcs_process.py +++ b/scripts/2-process/gcs_process.py @@ -27,6 +27,17 
@@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) +FILE_PATHS = [ + shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv"), + shared.path_join(PATHS["data_phase"], "gcs_status_combined_totals.csv"), + shared.path_join(PATHS["data_phase"], "gcs_status_latest_totals.csv"), + shared.path_join(PATHS["data_phase"], "gcs_status_prior_totals.csv"), + shared.path_join(PATHS["data_phase"], "gcs_status_retired_totals.csv"), + shared.path_join(PATHS["data_phase"], "gcs_totals_by_country.csv"), + shared.path_join(PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"), + shared.path_join(PATHS["data_phase"], "gcs_totals_by_language.csv"), + shared.path_join(PATHS["data_phase"], "gcs_totals_by_restrictions.csv"), +] def parse_arguments(): @@ -62,7 +73,7 @@ def parse_arguments(): return args -def check_for_data_file(file_path): +def check_for_data_files(file_path): if os.path.exists(file_path): raise shared.QuantifyingException( f"Processed data already exists for {QUARTER}", 0 @@ -118,7 +129,6 @@ def process_product_totals(args, count_data): data.items(), columns=["CC legal tool product", "Count"] ) file_path = shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv") - check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -200,8 +210,7 @@ def process_latest_prior_retired_totals(args, count_data): file_path = shared.path_join( PATHS["data_phase"], f"gcs_status_{key}_totals.csv" ) - check_for_data_file(file_path) - data_to_csv(args, dataframe, file_path) + data_to_csv(args, dataframe, file_path) def process_totals_by_free_cultural(args, count_data): @@ -234,7 +243,6 @@ def process_totals_by_free_cultural(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "gcs_totals_by_free_cultural.csv" ) - 
check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -291,7 +298,6 @@ def process_totals_by_language(args, data): file_path = shared.path_join( PATHS["data_phase"], "gcs_totals_by_language.csv" ) - check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -313,7 +319,6 @@ def process_totals_by_country(args, data): file_path = shared.path_join( PATHS["data_phase"], "gcs_totals_by_country.csv" ) - check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -321,6 +326,7 @@ def main(): args = parse_arguments() shared.paths_log(LOGGER, PATHS) shared.git_fetch_and_merge(args, PATHS["repo"]) + check_for_data_files(FILE_PATHS) # Count data file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv") diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py index 27945613..192b864b 100755 --- a/scripts/2-process/github_process.py +++ b/scripts/2-process/github_process.py @@ -24,6 +24,10 @@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) +FILE_PATHS = [ + shared.path_join(PATHS["data_phase"], "github_totals_by_license.csv"), + shared.path_join(PATHS["data_phase"], "github_totals_by_restriction.csv"), +] def parse_arguments(): @@ -59,7 +63,7 @@ def parse_arguments(): return args -def check_for_data_file(file_path): +def check_for_data_files(file_path): if os.path.exists(file_path): raise shared.QuantifyingException( f"Processed data already exists for {QUARTER}", 0 @@ -98,7 +102,6 @@ def process_totals_by_license(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "github_totals_by_license.csv" ) - check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -133,7 +136,6 @@ def process_totals_by_restriction(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "github_totals_by_restriction.csv" ) - check_for_data_file(file_path) data_to_csv(args, data, file_path) @@ -141,7 +143,7 @@ def main(): args = parse_arguments() shared.paths_log(LOGGER, 
PATHS) shared.git_fetch_and_merge(args, PATHS["repo"]) - + check_for_data_files(FILE_PATHS) file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv") count_data = shared.open_data_file( LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"] diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py index 7712b26a..e488d25e 100755 --- a/scripts/2-process/wikipedia_process.py +++ b/scripts/2-process/wikipedia_process.py @@ -28,6 +28,17 @@ # Constants QUARTER = os.path.basename(PATHS["data_quarter"]) +FILE_PATHS = [ + shared.path_join( + PATHS["data_phase"], "wikipedia_highest_language_usage.csv" + ), + shared.path_join( + PATHS["data_phase"], "wikipedia_least_language_usage.csv" + ), + shared.path_join( + PATHS["data_phase"], "wikipedia_language_representation.csv" + ), +] def parse_arguments(): @@ -63,7 +74,7 @@ def parse_arguments(): return args -def check_for_data_file(file_path): +def check_for_data_files(file_path): if os.path.exists(file_path): raise shared.QuantifyingException( f"Processed data already exists for {QUARTER}", 0 @@ -98,7 +109,6 @@ def process_highest_language_usage(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "wikipedia_highest_language_usage.csv" ) - check_for_data_file(file_path) data_to_csv(args, top_10, file_path) @@ -122,7 +132,6 @@ def process_least_language_usage(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "wikipedia_least_language_usage.csv" ) - check_for_data_file(file_path) data_to_csv(args, bottom_10, file_path) @@ -149,7 +158,6 @@ def process_language_representation(args, count_data): file_path = shared.path_join( PATHS["data_phase"], "wikipedia_language_representation.csv" ) - check_for_data_file(file_path) data_to_csv(args, language_counts, file_path) @@ -157,6 +165,7 @@ def main(): args = parse_arguments() shared.paths_log(LOGGER, PATHS) shared.git_fetch_and_merge(args, PATHS["repo"]) + check_for_data_files(FILE_PATHS) 
file_count = shared.path_join( PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv" ) diff --git a/scripts/shared.py b/scripts/shared.py index 8a3b69cf..db2a3ca7 100644 --- a/scripts/shared.py +++ b/scripts/shared.py @@ -283,14 +283,15 @@ def update_readme( image_caption, entry_text=None, ): + """ + Update the README.md file with the generated images and descriptions. + """ logger = args.logger paths = args.paths ordered_sections = section_order() logger.info("ordered_sections:", ordered_sections) logger.info("section_title:", repr(section_title)) - """ - Update the README.md file with the generated images and descriptions. - """ + if not args.enable_save: return if image_path and not image_caption: @@ -336,8 +337,8 @@ def update_readme( # Sections that should come before this section sections_before = ordered_sections[:current_postion] # we find the last existing section that comes before this section - for prev_section in reversed(sections_before): - prev_end_line = f"\n" + for prev_section_title in reversed(sections_before): + prev_end_line = f"\n" if prev_end_line in lines: insert_index = lines.index(prev_end_line) + 1 break From 241634a90648c68150c9ad8c0d5fbcae8887cfc4 Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Fri, 9 Jan 2026 16:37:38 +0100 Subject: [PATCH 09/10] Made review changes --- scripts/2-process/gcs_process.py | 11 ++++++----- scripts/2-process/github_process.py | 11 ++++++----- scripts/2-process/wikipedia_process.py | 11 ++++++----- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py index 0ee57016..15501692 100755 --- a/scripts/2-process/gcs_process.py +++ b/scripts/2-process/gcs_process.py @@ -73,11 +73,12 @@ def parse_arguments(): return args -def check_for_data_files(file_path): - if os.path.exists(file_path): - raise shared.QuantifyingException( - f"Processed data already exists for {QUARTER}", 0 - ) +def check_for_data_files(file_paths): + for 
path in file_paths: + if os.path.exists(path): + raise shared.QuantifyingException( + f"Processed data already exists for {QUARTER}", 0 + ) def data_to_csv(args, data, file_path): diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py index 192b864b..a43971f7 100755 --- a/scripts/2-process/github_process.py +++ b/scripts/2-process/github_process.py @@ -63,11 +63,12 @@ def parse_arguments(): return args -def check_for_data_files(file_path): - if os.path.exists(file_path): - raise shared.QuantifyingException( - f"Processed data already exists for {QUARTER}", 0 - ) +def check_for_data_files(file_paths): + for path in file_paths: + if os.path.exists(path): + raise shared.QuantifyingException( + f"Processed data already exists for {QUARTER}", 0 + ) def data_to_csv(args, data, file_path): diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py index e488d25e..e0bbf85b 100755 --- a/scripts/2-process/wikipedia_process.py +++ b/scripts/2-process/wikipedia_process.py @@ -74,11 +74,12 @@ def parse_arguments(): return args -def check_for_data_files(file_path): - if os.path.exists(file_path): - raise shared.QuantifyingException( - f"Processed data already exists for {QUARTER}", 0 - ) +def check_for_data_files(file_paths): + for path in file_paths: + if os.path.exists(path): + raise shared.QuantifyingException( + f"Processed data already exists for {QUARTER}", 0 + ) def data_to_csv(args, data, file_path): From 8f4f0792b7de6d7cd1e63ab5e70f4b0f42afe199 Mon Sep 17 00:00:00 2001 From: Oreoluwa Oluwasina Date: Sat, 10 Jan 2026 16:56:53 +0100 Subject: [PATCH 10/10] Added the parser argument --force for regenerating processed files --- scripts/2-process/gcs_process.py | 14 +++++++++----- scripts/2-process/github_process.py | 12 +++++++++--- scripts/2-process/wikipedia_process.py | 12 +++++++++--- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/scripts/2-process/gcs_process.py 
b/scripts/2-process/gcs_process.py index 15501692..ab174a45 100755 --- a/scripts/2-process/gcs_process.py +++ b/scripts/2-process/gcs_process.py @@ -59,8 +59,12 @@ def parse_arguments(): parser.add_argument( "--enable-git", action="store_true", - help="Enable git actions such as fetch, merge, add, commit, and push" - " (default: False)", + help="Enable git actions such as fetch, merge, add, commit, and push", + ) + parser.add_argument( + "--force", + action="store_true", + help="Regenerate data even if processed files already exist", ) args = parser.parse_args() if not args.enable_save and args.enable_git: @@ -73,9 +77,9 @@ def parse_arguments(): return args -def check_for_data_files(file_paths): +def check_for_data_files(args, file_paths): for path in file_paths: - if os.path.exists(path): + if os.path.exists(path) and not args.force: raise shared.QuantifyingException( f"Processed data already exists for {QUARTER}", 0 ) @@ -327,7 +331,7 @@ def main(): args = parse_arguments() shared.paths_log(LOGGER, PATHS) shared.git_fetch_and_merge(args, PATHS["repo"]) - check_for_data_files(FILE_PATHS) + check_for_data_files(args, FILE_PATHS) # Count data file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv") diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py index a43971f7..91110d66 100755 --- a/scripts/2-process/github_process.py +++ b/scripts/2-process/github_process.py @@ -52,6 +52,12 @@ def parse_arguments(): help="Enable git actions such as fetch, merge, add, commit, and push" " (default: False)", ) + parser.add_argument( + "--force", + action="store_true", + help="Regenerate data even if processed files already exist", + ) + args = parser.parse_args() if not args.enable_save and args.enable_git: parser.error("--enable-git requires --enable-save") @@ -63,9 +69,9 @@ def parse_arguments(): return args -def check_for_data_files(file_paths): +def check_for_data_files(args, file_paths): for path in file_paths: - if 
os.path.exists(path): + if os.path.exists(path) and not args.force: raise shared.QuantifyingException( f"Processed data already exists for {QUARTER}", 0 ) @@ -144,7 +150,7 @@ def main(): args = parse_arguments() shared.paths_log(LOGGER, PATHS) shared.git_fetch_and_merge(args, PATHS["repo"]) - check_for_data_files(FILE_PATHS) + check_for_data_files(args, FILE_PATHS) file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv") count_data = shared.open_data_file( LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"] diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py index e0bbf85b..b7c6b7a4 100755 --- a/scripts/2-process/wikipedia_process.py +++ b/scripts/2-process/wikipedia_process.py @@ -63,6 +63,12 @@ def parse_arguments(): help="Enable git actions such as fetch, merge, add, commit, and push" " (default: False)", ) + parser.add_argument( + "--force", + action="store_true", + help="Regenerate data even if processed files already exist", + ) + args = parser.parse_args() if not args.enable_save and args.enable_git: parser.error("--enable-git requires --enable-save") @@ -74,9 +80,9 @@ def parse_arguments(): return args -def check_for_data_files(file_paths): +def check_for_data_files(args, file_paths): for path in file_paths: - if os.path.exists(path): + if os.path.exists(path) and not args.force: raise shared.QuantifyingException( f"Processed data already exists for {QUARTER}", 0 ) @@ -166,7 +172,7 @@ def main(): args = parse_arguments() shared.paths_log(LOGGER, PATHS) shared.git_fetch_and_merge(args, PATHS["repo"]) - check_for_data_files(FILE_PATHS) + check_for_data_files(args, FILE_PATHS) file_count = shared.path_join( PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv" )