diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py
index fefbba0f..ab174a45 100755
--- a/scripts/2-process/gcs_process.py
+++ b/scripts/2-process/gcs_process.py
@@ -27,6 +27,17 @@
 
 # Constants
 QUARTER = os.path.basename(PATHS["data_quarter"])
+FILE_PATHS = [
+    shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_status_combined_totals.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_status_lastest_totals.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_status_prior_totals.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_status_retired_totals.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_totals_by_country.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_totals_by_language.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_totals_by_restrictions.csv"),
+]
 
 
 def parse_arguments():
@@ -48,8 +59,12 @@ def parse_arguments():
     parser.add_argument(
         "--enable-git",
         action="store_true",
-        help="Enable git actions such as fetch, merge, add, commit, and push"
-        " (default: False)",
+        help="Enable git actions such as fetch, merge, add, commit, and push",
+    )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Regenerate data even if processed files already exist",
     )
     args = parser.parse_args()
     if not args.enable_save and args.enable_git:
@@ -62,6 +77,14 @@
     return args
 
 
+def check_for_data_files(args, file_paths):
+    for path in file_paths:
+        if os.path.exists(path) and not args.force:
+            raise shared.QuantifyingException(
+                f"Processed data already exists for {QUARTER}", 0
+            )
+
+
 def data_to_csv(args, data, file_path):
     if not args.enable_save:
         return
@@ -308,6 +331,7 @@ def main():
     args = parse_arguments()
     shared.paths_log(LOGGER, PATHS)
     shared.git_fetch_and_merge(args, PATHS["repo"])
+    check_for_data_files(args, FILE_PATHS)
 
     # Count data
     file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py
index 27945613..91110d66 100755
--- a/scripts/2-process/github_process.py
+++ b/scripts/2-process/github_process.py
@@ -24,6 +24,10 @@
 
 # Constants
 QUARTER = os.path.basename(PATHS["data_quarter"])
+FILE_PATHS = [
+    shared.path_join(PATHS["data_phase"], "github_totals_by_license.csv"),
+    shared.path_join(PATHS["data_phase"], "github_totals_by_restriction.csv"),
+]
 
 
 def parse_arguments():
@@ -48,6 +52,12 @@
         help="Enable git actions such as fetch, merge, add, commit, and push"
         " (default: False)",
     )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Regenerate data even if processed files already exist",
+    )
+
     args = parser.parse_args()
     if not args.enable_save and args.enable_git:
         parser.error("--enable-git requires --enable-save")
@@ -59,11 +69,12 @@
     return args
 
 
-def check_for_data_file(file_path):
-    if os.path.exists(file_path):
-        raise shared.QuantifyingException(
-            f"Processed data already exists for {QUARTER}", 0
-        )
+def check_for_data_files(args, file_paths):
+    for path in file_paths:
+        if os.path.exists(path) and not args.force:
+            raise shared.QuantifyingException(
+                f"Processed data already exists for {QUARTER}", 0
+            )
 
 
 def data_to_csv(args, data, file_path):
@@ -98,7 +109,6 @@ def process_totals_by_license(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "github_totals_by_license.csv"
     )
-    check_for_data_file(file_path)
     data_to_csv(args, data, file_path)
 
 
@@ -133,7 +143,6 @@ def process_totals_by_restriction(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "github_totals_by_restriction.csv"
     )
-    check_for_data_file(file_path)
     data_to_csv(args, data, file_path)
 
 
@@ -141,7 +150,7 @@ def main():
     args = parse_arguments()
     shared.paths_log(LOGGER, PATHS)
     shared.git_fetch_and_merge(args, PATHS["repo"])
-
+    check_for_data_files(args, FILE_PATHS)
     file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
     count_data = shared.open_data_file(
         LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py
index 7712b26a..b7c6b7a4 100755
--- a/scripts/2-process/wikipedia_process.py
+++ b/scripts/2-process/wikipedia_process.py
@@ -28,6 +28,17 @@
 
 # Constants
 QUARTER = os.path.basename(PATHS["data_quarter"])
+FILE_PATHS = [
+    shared.path_join(
+        PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
+    ),
+    shared.path_join(
+        PATHS["data_phase"], "wikipedia_least_language_usage.csv"
+    ),
+    shared.path_join(
+        PATHS["data_phase"], "wikipedia_language_representation.csv"
+    ),
+]
 
 
 def parse_arguments():
@@ -52,6 +63,12 @@
         help="Enable git actions such as fetch, merge, add, commit, and push"
         " (default: False)",
     )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Regenerate data even if processed files already exist",
+    )
+
     args = parser.parse_args()
     if not args.enable_save and args.enable_git:
         parser.error("--enable-git requires --enable-save")
@@ -63,11 +80,12 @@
     return args
 
 
-def check_for_data_file(file_path):
-    if os.path.exists(file_path):
-        raise shared.QuantifyingException(
-            f"Processed data already exists for {QUARTER}", 0
-        )
+def check_for_data_files(args, file_paths):
+    for path in file_paths:
+        if os.path.exists(path) and not args.force:
+            raise shared.QuantifyingException(
+                f"Processed data already exists for {QUARTER}", 0
+            )
 
 
 def data_to_csv(args, data, file_path):
@@ -98,7 +116,6 @@ def process_highest_language_usage(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
     )
-    check_for_data_file(file_path)
     data_to_csv(args, top_10, file_path)
 
 
@@ -122,7 +139,6 @@ def process_least_language_usage(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_least_language_usage.csv"
     )
-    check_for_data_file(file_path)
     data_to_csv(args, bottom_10, file_path)
 
 
@@ -149,7 +165,6 @@ def process_language_representation(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_language_representation.csv"
     )
-    check_for_data_file(file_path)
     data_to_csv(args, language_counts, file_path)
 
 
@@ -157,6 +172,7 @@ def main():
     args = parse_arguments()
     shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])
+    check_for_data_files(args, FILE_PATHS)
     file_count = shared.path_join(
         PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
     )
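
All three process scripts now replace the per-output check_for_data_file() with a single up-front check_for_data_files() call in main(), gated by the new --force flag. The following is a minimal standalone sketch of that behaviour for reviewers, not part of the patch: the exception class is only a stand-in for shared.QuantifyingException and the path is hypothetical.

# Illustrative sketch only -- not part of this patch.
import argparse
import os


class QuantifyingException(Exception):  # stand-in for shared.QuantifyingException
    def __init__(self, message, exit_code=0):
        super().__init__(message)
        self.exit_code = exit_code


def check_for_data_files(args, file_paths):
    # Abort on the first processed file that already exists, unless --force is set.
    for path in file_paths:
        if os.path.exists(path) and not args.force:
            raise QuantifyingException(f"Processed data already exists: {path}", 0)


parser = argparse.ArgumentParser()
parser.add_argument("--force", action="store_true")

check_for_data_files(parser.parse_args([]), ["example_totals.csv"])  # raises if the file exists
check_for_data_files(parser.parse_args(["--force"]), ["example_totals.csv"])  # never raises
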
diff --git a/scripts/3-report/gcs_report.py b/scripts/3-report/gcs_report.py
old mode 100755
new mode 100644
index 359796a9..eb2a4581
--- a/scripts/3-report/gcs_report.py
+++ b/scripts/3-report/gcs_report.py
@@ -9,6 +9,7 @@
 import sys
 import textwrap
 import traceback
+from pathlib import Path
 
 # Third-party
 from pygments import highlight
@@ -27,7 +28,7 @@
 
 # Constants
 QUARTER = os.path.basename(PATHS["data_quarter"])
-SECTION = "Google Custom Search (GCS)"
+SECTION = Path(__file__).name
 
 
 def parse_arguments():
diff --git a/scripts/3-report/github_report.py b/scripts/3-report/github_report.py
old mode 100755
new mode 100644
index 37979175..b2a18890
--- a/scripts/3-report/github_report.py
+++ b/scripts/3-report/github_report.py
@@ -9,6 +9,7 @@
 import sys
 import textwrap
 import traceback
+from pathlib import Path
 
 # Third-party
 from pygments import highlight
@@ -25,7 +26,7 @@
 # Setup
 LOGGER, PATHS = shared.setup(__file__)
 QUARTER = os.path.basename(PATHS["data_quarter"])
-SECTION = "GitHub data"
+SECTION = Path(__file__).name
 
 
 def parse_arguments():
diff --git a/scripts/3-report/wikipedia_report.py b/scripts/3-report/wikipedia_report.py
old mode 100755
new mode 100644
index 83a92fa3..96f0cf46
--- a/scripts/3-report/wikipedia_report.py
+++ b/scripts/3-report/wikipedia_report.py
@@ -9,6 +9,7 @@
 import sys
 import textwrap
 import traceback
+from pathlib import Path
 
 # Third-party
 from pygments import highlight
@@ -25,7 +26,7 @@
 # Setup
 LOGGER, PATHS = shared.setup(__file__)
 QUARTER = os.path.basename(PATHS["data_quarter"])
-SECTION = "Wikipedia data"
+SECTION = Path(__file__).name
 
 
 def parse_arguments():
diff --git a/scripts/3-report/notes.py b/scripts/3-report/zzz-notes.py
old mode 100755
new mode 100644
similarity index 98%
rename from scripts/3-report/notes.py
rename to scripts/3-report/zzz-notes.py
index ccefd058..d9d4aa93
--- a/scripts/3-report/notes.py
+++ b/scripts/3-report/zzz-notes.py
@@ -8,6 +8,7 @@
 import sys
 import textwrap
 import traceback
+from pathlib import Path
 
 # Third-party
 from pygments import highlight
@@ -25,7 +26,7 @@
 
 # Constants
 QUARTER = os.path.basename(PATHS["data_quarter"])
-SECTION = "Notes"
+SECTION = Path(__file__).name
 
 
 def parse_arguments():
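
With SECTION set to Path(__file__).name, each report now registers under its script's filename, and the notes.py to zzz-notes.py rename only keeps the notes section last if those filenames are compared in sorted order (os.listdir() alone returns entries in arbitrary order, hence the sorted() in section_order() below). A rough illustration, using filenames from this patch:

# Rough illustration only -- not part of this patch.
from pathlib import Path

# What SECTION evaluates to inside a report script:
print(Path("scripts/3-report/gcs_report.py").name)  # gcs_report.py

# Why the "zzz-" prefix pushes the notes section to the end once sorted:
report_files = ["zzz-notes.py", "github_report.py", "gcs_report.py", "wikipedia_report.py"]
print(sorted(report_files))
# ['gcs_report.py', 'github_report.py', 'wikipedia_report.py', 'zzz-notes.py']
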
""" + logger = args.logger + paths = args.paths + ordered_sections = section_order() + logger.info("ordered_sections:", ordered_sections) + logger.info("section_title:", repr(section_title)) + if not args.enable_save: return if image_path and not image_caption: @@ -293,18 +305,15 @@ def update_readme( " caption is provided" ) - logger = args.logger - paths = args.paths - readme_path = path_join(paths["data"], args.quarter, "README.md") # Define section markers for each data source - section_start_line = f"\n" - section_end_line = f"\n" + section_start_line = f"\n" + section_end_line = f"\n" # Define entry markers for each plot (optional) and description - entry_start_line = f"\n" - entry_end_line = f"\n" + entry_start_line = f"\n" + entry_end_line = f"\n" if os.path.exists(readme_path): with open(readme_path, "r", encoding="utf-8") as f: @@ -318,26 +327,39 @@ def update_readme( lines.insert(0, title_line) lines.insert(1, "\n") - # We only need to know the position of the end to append new entries + # Locate the data source section if it is already present if section_start_line in lines: - # Locate the data source section if it is already present section_end_index = lines.index(section_end_line) else: - # Add the data source section if it is absent - lines.extend( - [ - f"{section_start_line}", - "\n", - "\n", - f"## {section_title}\n", - "\n", - "\n", - f"{section_end_line}", - "\n", - ] - ) - section_end_index = lines.index(section_end_line) + insert_index = None + # If not present, we find the position to insert the section + current_postion = ordered_sections.index(section_title) + # Sections that should come before this section + sections_before = ordered_sections[:current_postion] + # we find the last existing section that comes before this section + for prev_section_title in reversed(sections_before): + prev_end_line = f"\n" + if prev_end_line in lines: + insert_index = lines.index(prev_end_line) + 1 + break + # If none exist, insert at the top (after README title) + if insert_index is None: + insert_index = 2 if len(lines) >= 2 else len(lines) + # Insert the new data source section at correct position + new_section_line = [ + f"{section_start_line}", + "\n", + "\n", + f"## {section_title}\n", + "\n", + "\n", + f"{section_end_line}", + "\n", + ] + # Insert the section at the correct position + lines = lines[:insert_index] + new_section_line + lines[insert_index:] + section_end_index = lines.index(section_end_line) # Locate the entry if it is already present if entry_start_line in lines: entry_start_index = lines.index(entry_start_line)