28 changes: 26 additions & 2 deletions scripts/2-process/gcs_process.py
@@ -27,6 +27,17 @@

 # Constants
 QUARTER = os.path.basename(PATHS["data_quarter"])
+FILE_PATHS = [
+    shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_status_combined_totals.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_status_lastest_totals.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_status_prior_totals.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_status_retired_totals.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_totals_by_country.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_totals_by_language.csv"),
+    shared.path_join(PATHS["data_phase"], "gcs_totals_by_restrictions.csv"),
+]
 
 
 def parse_arguments():
@@ -48,8 +59,12 @@ def parse_arguments():
     parser.add_argument(
         "--enable-git",
         action="store_true",
-        help="Enable git actions such as fetch, merge, add, commit, and push"
-        " (default: False)",
+        help="Enable git actions such as fetch, merge, add, commit, and push",
     )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Regenerate data even if processed files already exist",
+    )
     args = parser.parse_args()
     if not args.enable_save and args.enable_git:
@@ -62,6 +77,14 @@ def parse_arguments():
     return args
 
 
+def check_for_data_files(args, file_paths):
+    for path in file_paths:
+        if os.path.exists(path) and not args.force:
+            raise shared.QuantifyingException(
+                f"Processed data already exists for {QUARTER}", 0
+            )
 
 
 def data_to_csv(args, data, file_path):
     if not args.enable_save:
         return
@@ -308,6 +331,7 @@ def main():
     args = parse_arguments()
     shared.paths_log(LOGGER, PATHS)
     shared.git_fetch_and_merge(args, PATHS["repo"])
+    check_for_data_files(args, FILE_PATHS)
 
     # Count data
     file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
25 changes: 17 additions & 8 deletions scripts/2-process/github_process.py
@@ -24,6 +24,10 @@

 # Constants
 QUARTER = os.path.basename(PATHS["data_quarter"])
+FILE_PATHS = [
+    shared.path_join(PATHS["data_phase"], "github_totals_by_license.csv"),
+    shared.path_join(PATHS["data_phase"], "github_totals_by_restriction.csv"),
+]
 
 
 def parse_arguments():
@@ -48,6 +52,12 @@ def parse_arguments():
help="Enable git actions such as fetch, merge, add, commit, and push"
" (default: False)",
)
parser.add_argument(
"--force",
action="store_true",
help="Regenerate data even if processed files already exist",
)

args = parser.parse_args()
if not args.enable_save and args.enable_git:
parser.error("--enable-git requires --enable-save")
@@ -59,11 +69,12 @@ def parse_arguments():
     return args
 
 
-def check_for_data_file(file_path):
-    if os.path.exists(file_path):
-        raise shared.QuantifyingException(
-            f"Processed data already exists for {QUARTER}", 0
-        )
+def check_for_data_files(args, file_paths):
+    for path in file_paths:
+        if os.path.exists(path) and not args.force:
+            raise shared.QuantifyingException(
+                f"Processed data already exists for {QUARTER}", 0
+            )
 
 
 def data_to_csv(args, data, file_path):
@@ -98,7 +109,6 @@ def process_totals_by_license(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "github_totals_by_license.csv"
     )
-    check_for_data_file(file_path)
     data_to_csv(args, data, file_path)


Expand Down Expand Up @@ -133,15 +143,14 @@ def process_totals_by_restriction(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "github_totals_by_restriction.csv"
     )
-    check_for_data_file(file_path)
     data_to_csv(args, data, file_path)
 
 
 def main():
     args = parse_arguments()
     shared.paths_log(LOGGER, PATHS)
     shared.git_fetch_and_merge(args, PATHS["repo"])
-
+    check_for_data_files(args, FILE_PATHS)
     file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
     count_data = shared.open_data_file(
         LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
32 changes: 24 additions & 8 deletions scripts/2-process/wikipedia_process.py
@@ -28,6 +28,17 @@

 # Constants
 QUARTER = os.path.basename(PATHS["data_quarter"])
+FILE_PATHS = [
+    shared.path_join(
+        PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
+    ),
+    shared.path_join(
+        PATHS["data_phase"], "wikipedia_least_language_usage.csv"
+    ),
+    shared.path_join(
+        PATHS["data_phase"], "wikipedia_language_representation.csv"
+    ),
+]
 
 
 def parse_arguments():
@@ -52,6 +63,12 @@ def parse_arguments():
help="Enable git actions such as fetch, merge, add, commit, and push"
" (default: False)",
)
parser.add_argument(
"--force",
action="store_true",
help="Regenerate data even if processed files already exist",
)

args = parser.parse_args()
if not args.enable_save and args.enable_git:
parser.error("--enable-git requires --enable-save")
@@ -63,11 +80,12 @@ def parse_arguments():
     return args
 
 
-def check_for_data_file(file_path):
-    if os.path.exists(file_path):
-        raise shared.QuantifyingException(
-            f"Processed data already exists for {QUARTER}", 0
-        )
+def check_for_data_files(args, file_paths):
+    for path in file_paths:
+        if os.path.exists(path) and not args.force:
+            raise shared.QuantifyingException(
+                f"Processed data already exists for {QUARTER}", 0
+            )

Reviewer comment (Member), on check_for_data_files():

I would exit the function early if args.force (avoids doing filesystem lookups):

    if args.force:
        return

We already do a similar thing in shared.py:

    def git_push_changes(args, repo_path):
        if not args.enable_git:
            return

Reviewer comment (Member):

Also, maybe it's worth moving this function to shared.py, since it's the same in multiple files.
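Taken together, the two suggestions might look something like this minimal sketch of a consolidated helper in shared.py (the quarter parameter and exact placement are assumptions, not code from this PR):

    def check_for_data_files(args, file_paths, quarter):
        # Exit early when --force is set: regeneration is wanted,
        # so there is no need to touch the filesystem at all
        if args.force:
            return
        for path in file_paths:
            if os.path.exists(path):
                raise QuantifyingException(
                    f"Processed data already exists for {quarter}", 0
                )

Each process script could then call it with its own FILE_PATHS and QUARTER constants.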


 def data_to_csv(args, data, file_path):
@@ -98,7 +116,6 @@ def process_highest_language_usage(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
     )
-    check_for_data_file(file_path)
     data_to_csv(args, top_10, file_path)


@@ -122,7 +139,6 @@ def process_least_language_usage(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_least_language_usage.csv"
     )
-    check_for_data_file(file_path)
     data_to_csv(args, bottom_10, file_path)


@@ -149,14 +165,14 @@ def process_language_representation(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_language_representation.csv"
     )
-    check_for_data_file(file_path)
     data_to_csv(args, language_counts, file_path)
 
 
 def main():
     args = parse_arguments()
     shared.paths_log(LOGGER, PATHS)
     shared.git_fetch_and_merge(args, PATHS["repo"])
+    check_for_data_files(args, FILE_PATHS)
     file_count = shared.path_join(
         PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
     )
3 changes: 2 additions & 1 deletion scripts/3-report/gcs_report.py
100755 → 100644
@@ -9,6 +9,7 @@
 import sys
 import textwrap
 import traceback
+from pathlib import Path
 
 # Third-party
 from pygments import highlight
@@ -27,7 +28,7 @@

 # Constants
 QUARTER = os.path.basename(PATHS["data_quarter"])
-SECTION = "Google Custom Search (GCS)"
+SECTION = Path(__file__).name

Reviewer comment (Member):

Oops, I didn't think about the displayed section title.

I think we need both SECTION_FILE and SECTION_TITLE or something similar: one for the README comment markers and one for the heading/display.
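A minimal sketch of that split (the constant names come from this comment; the title string mirrors the one removed by this PR):

    # Hypothetical pair of constants suggested in review, not in this PR
    SECTION_FILE = Path(__file__).name  # stable key for section markers/ordering
    SECTION_TITLE = "Google Custom Search (GCS)"  # human-readable heading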



 def parse_arguments():
3 changes: 2 additions & 1 deletion scripts/3-report/github_report.py
100755 → 100644
@@ -9,6 +9,7 @@
 import sys
 import textwrap
 import traceback
+from pathlib import Path
 
 # Third-party
 from pygments import highlight
@@ -25,7 +26,7 @@
 # Setup
 LOGGER, PATHS = shared.setup(__file__)
 QUARTER = os.path.basename(PATHS["data_quarter"])
-SECTION = "GitHub data"
+SECTION = Path(__file__).name


 def parse_arguments():
3 changes: 2 additions & 1 deletion scripts/3-report/wikipedia_report.py
100755 → 100644
@@ -9,6 +9,7 @@
 import sys
 import textwrap
 import traceback
+from pathlib import Path
 
 # Third-party
 from pygments import highlight
@@ -25,7 +26,7 @@
 # Setup
 LOGGER, PATHS = shared.setup(__file__)
 QUARTER = os.path.basename(PATHS["data_quarter"])
-SECTION = "Wikipedia data"
+SECTION = Path(__file__).name


 def parse_arguments():
3 changes: 2 additions & 1 deletion scripts/3-report/notes.py → scripts/3-report/zzz-notes.py
100755 → 100644
@@ -8,6 +8,7 @@
 import sys
 import textwrap
 import traceback
+from pathlib import Path
 
 # Third-party
 from pygments import highlight
@@ -25,7 +26,7 @@

 # Constants
 QUARTER = os.path.basename(PATHS["data_quarter"])
-SECTION = "Notes"
+SECTION = Path(__file__).name


 def parse_arguments():
68 changes: 45 additions & 23 deletions scripts/shared.py
@@ -269,6 +269,12 @@ def setup(current_file):
     return logger, paths
 
 
+def section_order():
+    report_dir = os.path.join(os.path.dirname(__file__), "3-report")
+    report_files = os.listdir(report_dir)
+    return report_files

Reviewer comment (Member), on lines +272 to +275:

This function doesn't establish section order; please update to use .sort() or sorted().
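A minimal version of that fix, per the reviewer's suggestion (not code from this PR):

    def section_order():
        report_dir = os.path.join(os.path.dirname(__file__), "3-report")
        # sorted() makes the order deterministic and alphabetical, which is
        # also why notes.py is renamed zzz-notes.py elsewhere in this PR
        return sorted(os.listdir(report_dir))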



 def update_readme(
     args,
     section_title,
@@ -280,6 +286,12 @@ def update_readme(
"""
Update the README.md file with the generated images and descriptions.
"""
logger = args.logger
paths = args.paths
ordered_sections = section_order()
logger.info("ordered_sections:", ordered_sections)
logger.info("section_title:", repr(section_title))

Reviewer comment (Member), on lines +292 to +293:

I suspect you converted these from print() statements. As they are now, I get a stack trace. Please convert to f-strings. For example:

    logger.info(f"ordered_sections: {ordered_sections}")

Reviewer comment (Member):

For more information on the arguments accepted by logger.info(), see Logger.debug(): https://docs.python.org/3.11/library/logging.html#logging.Logger.debug
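Background on the stack trace: logger.info() treats extra positional arguments as %-style format values for the message string, so a message without % placeholders fails during record formatting ("not all arguments converted during string formatting"). Besides f-strings, logging's lazy %-style would also work here:

    # The string is only interpolated if the record is actually emitted
    logger.info("ordered_sections: %s", ordered_sections)
    logger.info("section_title: %r", section_title)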


     if not args.enable_save:
         return
     if image_path and not image_caption:
@@ -293,18 +305,15 @@
" caption is provided"
)

logger = args.logger
paths = args.paths

readme_path = path_join(paths["data"], args.quarter, "README.md")

# Define section markers for each data source
section_start_line = f"<!-- {section_title} Start -->\n"
section_end_line = f"<!-- {section_title} End -->\n"
section_start_line = f"<!-- section start {section_title} -->\n"
section_end_line = f"<!-- section end {section_title} -->\n"

# Define entry markers for each plot (optional) and description
entry_start_line = f"<!-- {entry_title} Start -->\n"
entry_end_line = f"<!-- {entry_title} End -->\n"
entry_start_line = f"<!-- entry start {entry_title} -->\n"
entry_end_line = f"<!-- entry end {entry_title} -->\n"

if os.path.exists(readme_path):
with open(readme_path, "r", encoding="utf-8") as f:
@@ -318,26 +327,39 @@
     lines.insert(0, title_line)
     lines.insert(1, "\n")
 
-    # We only need to know the position of the end to append new entries
+    # Locate the data source section if it is already present
     if section_start_line in lines:
-        # Locate the data source section if it is already present
         section_end_index = lines.index(section_end_line)
     else:
-        # Add the data source section if it is absent
-        lines.extend(
-            [
-                f"{section_start_line}",
-                "\n",
-                "\n",
-                f"## {section_title}\n",
-                "\n",
-                "\n",
-                f"{section_end_line}",
-                "\n",
-            ]
-        )
-        section_end_index = lines.index(section_end_line)
+        insert_index = None
+        # If not present, find the position at which to insert the section
+        current_position = ordered_sections.index(section_title)
+        # Sections that should come before this section
+        sections_before = ordered_sections[:current_position]
+        # Find the last existing section that comes before this section
+        for prev_section_title in reversed(sections_before):
+            prev_end_line = f"<!-- section end {prev_section_title} -->\n"
+            if prev_end_line in lines:
+                insert_index = lines.index(prev_end_line) + 1
+                break
+
+        # If none exist, insert at the top (after the README title)
+        if insert_index is None:
+            insert_index = 2 if len(lines) >= 2 else len(lines)
+        # Build the new data source section
+        new_section_line = [
+            f"{section_start_line}",
+            "\n",
+            "\n",
+            f"## {section_title}\n",
+            "\n",
+            "\n",
+            f"{section_end_line}",
+            "\n",
+        ]
+        # Insert the section at the correct position
+        lines = lines[:insert_index] + new_section_line + lines[insert_index:]
+        section_end_index = lines.index(section_end_line)
     # Locate the entry if it is already present
     if entry_start_line in lines:
         entry_start_index = lines.index(entry_start_line)
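Derived from new_section_line above, the README fragment the else branch inserts would look like this (assuming a section title of gcs_report.py, per the SECTION change elsewhere in this PR):

    <!-- section start gcs_report.py -->


    ## gcs_report.py


    <!-- section end gcs_report.py -->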