From ff20930e69d3a5cfc13a4045b3ebad956df6cb0f Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Fri, 25 Jul 2025 17:45:31 -0400 Subject: [PATCH 1/8] wip --- README.md | 5 +- github_config.yml | 28 +++++++++ pymetrics/__main__.py | 37 +++++++++++ pymetrics/gh_downloads.py | 129 ++++++++++++++++++++++++++++++++++++++ pymetrics/github.py | 80 +++++++++++++++++++++++ 5 files changed, 276 insertions(+), 3 deletions(-) create mode 100644 github_config.yml create mode 100644 pymetrics/gh_downloads.py create mode 100644 pymetrics/github.py diff --git a/README.md b/README.md index 6282187..f848eab 100644 --- a/README.md +++ b/README.md @@ -44,9 +44,8 @@ Currently, the download data is collected from the following distributions: - Replace `{package_name}` with the specific package (`sdv`) in the Anaconda channel - For each file returned by the API endpoint, the current number of downloads is saved. Over time, a historical download recording can be built. -### Future Data Sources -In the future, we may expand the source distributions to include: -* [GitHub Releases](https://github.com/): Information about the project downloads from GitHub releases. +* [GitHub Releases](https://github.com/): Information about the project downloads from GitHub release assets. + See this [GitHub API](https://docs.github.com/en/rest/releases/releases?apiVersion=2022-11-28#get-a-release). # Install Install pymetrics using pip (or uv): diff --git a/github_config.yml b/github_config.yml new file mode 100644 index 0000000..47c00e0 --- /dev/null +++ b/github_config.yml @@ -0,0 +1,28 @@ +projects: + sdv-dev: + - sdv-dev/SDV + - sdv-dev/RDT + - sdv-dev/SDMetrics + - sdv-dev/SDGym + - sdv-dev/Copulas + - sdv-dev/CTGAN + - sdv-dev/DeepEcho + gretel: + - gretelai/gretel-python-client + - gretelai/trainer + - gretelai/gretel-synthetics + mostly-ai: + - mostly-ai/mostlyai + - mostly-ai/mostlyai-mock + ydata: + - ydataai/ydata-synthetic + - ydataai/ydata-quality + - ydataai/ydata-fabric-sdk + realtabformer: + - worldbank/REaLTabFormer + synthcity: + - vanderschaarlab/synthcity + smartnoise-sdk: + - opendp/smartnoise-sdk + be_great: + - kathrinse/be_great \ No newline at end of file diff --git a/pymetrics/__main__.py b/pymetrics/__main__.py index 8b925c4..66f6d17 100644 --- a/pymetrics/__main__.py +++ b/pymetrics/__main__.py @@ -10,6 +10,7 @@ import yaml from pymetrics.anaconda import collect_anaconda_downloads +from pymetrics.gh_downloads import collect_github_downloads from pymetrics.main import collect_pypi_downloads from pymetrics.summarize import summarize_downloads @@ -76,6 +77,19 @@ def _collect_anaconda(args): ) +def _collect_github(args): + config = _load_config(args.config_file) + projects = config['projects'] + output_folder = args.output_folder + + collect_github_downloads( + projects=projects, + output_folder=output_folder, + dry_run=args.dry_run, + verbose=args.verbose, + ) + + def _summarize(args): config = _load_config(args.config_file) projects = config['projects'] @@ -243,6 +257,29 @@ def _get_parser(): default=90, help='Max days of data to pull. 
Default to last 90 days.', ) + + # collect Anaconda + collect_github = action.add_parser( + 'collect-github', help='Collect download data from GitHub.', parents=[logging_args] + ) + collect_github.set_defaults(action=_collect_github) + collect_github.add_argument( + '-c', + '--config-file', + type=str, + default='config.yaml', + help='Path to the configuration file.', + ) + collect_github.add_argument( + '-o', + '--output-folder', + type=str, + required=True, + help=( + 'Path to the folder where data will be outputted. It can be a local path or a' + ' Google Drive folder path in the format gdrive://' + ), + ) return parser diff --git a/pymetrics/gh_downloads.py b/pymetrics/gh_downloads.py new file mode 100644 index 0000000..5fefda8 --- /dev/null +++ b/pymetrics/gh_downloads.py @@ -0,0 +1,129 @@ +"""Functions to get GitHub downloads from GitHub.""" + +import logging +import os +from collections import defaultdict +from datetime import datetime +from zoneinfo import ZoneInfo + +import pandas as pd +from tqdm import tqdm + +from pymetrics.github import GithubClient +from pymetrics.output import append_row, create_csv, get_path, load_csv +from pymetrics.time_utils import drop_duplicates_by_date + +LOGGER = logging.getLogger(__name__) +dir_path = os.path.dirname(os.path.realpath(__file__)) +TIME_COLUMN = 'timestamp' + +GITHUB_DOWNLOAD_COUNT_FILENAME = 'github_download_counts.csv' + + +def get_previous_github_downloads(output_folder, dry_run=False): + csv_path = get_path(output_folder, GITHUB_DOWNLOAD_COUNT_FILENAME) + read_csv_kwargs = { + 'parse_dates': [ + TIME_COLUMN, + 'created_at', + ], + 'dtype': { + 'ecosystem_name': pd.CategoricalDtype(), + 'org_repo': pd.CategoricalDtype(), + 'tag_name': pd.CategoricalDtype(), + 'prerelease': pd.BooleanDtype(), + 'download_count': pd.Int64Dtype(), + }, + } + data = load_csv(csv_path, read_csv_kwargs=read_csv_kwargs) + return data + + +def collect_github_downloads( + projects: dict[str, list[str]], output_folder: str, dry_run: bool = False, verbose: bool = False +): + overall_df = get_previous_github_downloads(output_folder=output_folder) + # overall_df = pd.DataFrame( + # columns=[ + # TIME_COLUMN, + # 'created_at', + # 'ecosystem_name', + # 'org_repo', + # 'tag_name', + # 'prerelease', + # 'download_count', + # ] + # ) + + gh_client = GithubClient() + download_counts = defaultdict(int) + + for ecosystem_name, repositories in tqdm(projects.items(), position=2, desc='Overall'): + for org_repo in tqdm(repositories, position=1, desc=f'For Ecosystem: {ecosystem_name}'): + pages_remain = True + page = 1 + per_page = 100 + download_counts[org_repo] = 0 + + github_org = org_repo.split('/')[0] + repo = org_repo.split('/')[1] + + while pages_remain is True: + response = gh_client.get( + github_org, + repo, + endpoint='releases', + query_params={'per_page': per_page, 'page': page}, + ) + release_data = response.json() + link_header = response.headers.get('link') + + if response.status_code == 404: + LOGGER.debug(f'Skipping: {org_repo} because org/repo does not exist') + pages_remain = False + break + + # Get download count + for release_info in tqdm( + release_data, position=0, desc=f'For {repo} releases, page: {page}' + ): + release_id = release_info.get('id') + tag_name = release_info.get('tag_name') + prerelease = release_info.get('prerelease') + created_at = release_info.get('created_at') + endpoint = f'releases/{release_id}' + timestamp = datetime.now(ZoneInfo('UTC')) + + response = gh_client.get(github_org, repo, endpoint=endpoint) + data = response.json() 
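+                    # Each release asset exposes a cumulative 'download_count';
+                    # summing across assets below gives the total downloads for this tag.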
+                    assets = data.get('assets')
+                    tag_row = {
+                        'ecosystem_name': [ecosystem_name],
+                        'org_repo': [org_repo],
+                        'timestamp': [timestamp],
+                        'tag_name': [tag_name],
+                        'prerelease': [prerelease],
+                        'created_at': [created_at],
+                        'download_count': 0,
+                    }
+                    if assets and len(assets) > 0:
+                        for asset in assets:
+                            tag_row['download_count'] += asset.get('download_count', 0)
+
+                    overall_df = append_row(overall_df, tag_row)
+
+            # Check pagination
+            if link_header and 'rel="next"' in link_header:
+                page += 1
+            else:
+                break
+    overall_df = drop_duplicates_by_date(
+        overall_df,
+        time_column=TIME_COLUMN,
+        group_by_columns=['ecosystem_name', 'org_repo', 'tag_name'],
+    )
+    overall_df.to_csv('github_download_counts.csv', index=False)
+
+    if not dry_run:
+        gfolder_path = f'{output_folder}/{GITHUB_DOWNLOAD_COUNT_FILENAME}'
+        create_csv(output_path=gfolder_path, data=overall_df)
diff --git a/pymetrics/github.py b/pymetrics/github.py
new file mode 100644
index 0000000..db8f974
--- /dev/null
+++ b/pymetrics/github.py
@@ -0,0 +1,80 @@
+"""Clients for making requests to GitHub APIs."""
+
+import os
+
+import requests
+
+
+class BaseClient:
+    """Base GitHub client."""
+
+    def __init__(self):
+        token = os.getenv('GH_ACCESS_TOKEN')
+        self.headers = {
+            'Authorization': f'Bearer {token}',
+            'Accept': 'application/vnd.github+json',
+            'X-GitHub-Api-Version': '2022-11-28',
+        }
+
+
+class GithubClient(BaseClient):
+    """Client for GitHub API."""
+
+    def __init__(self):
+        super().__init__()
+        self.base_url = 'https://api.github.com/repos'
+
+    def _construct_url(self, github_org: str, repo: str, resource: str, id: str | None = None):
+        url = f'{self.base_url}/{github_org}/{repo}/{resource}'
+        if id:
+            url += f'/{id}'
+        return url
+
+    def get(
+        self,
+        github_org: str,
+        repo: str,
+        endpoint: str,
+        query_params: dict | None = None,
+        timeout: int | None = None,
+    ):
+        """Get a resource from an endpoint in the GitHub API.
+
+        Args:
+            github_org (str):
+                The name of the GitHub organization to search.
+            repo (str):
+                The name of the repository to search.
+            endpoint (str):
+                The endpoint for the resource. For example, issues/{issue_number}. This means we'd
+                be making a request to https://api.github.com/repos/{github_org}/{repo}/issues/{issue_number}.
+            query_params (dict):
+                A dictionary mapping any query parameters to the desired value. Defaults to None.
+            timeout (int):
+                How long to wait before the request times out. Defaults to None.
+
+        Returns:
+            requests.models.Response
+        """
+        url = self._construct_url(github_org, repo, endpoint)
+        return requests.get(url, headers=self.headers, params=query_params, timeout=timeout)
+
+    def post(self, github_org: str, repo: str, endpoint: str, payload: dict):
+        """Post to an endpoint in the GitHub API.
+
+        Args:
+            github_org (str):
+                The name of the GitHub organization to search.
+            repo (str):
+                The name of the repository to search.
+            endpoint (str):
+                The endpoint for the resource. For example, issues. This means we'd be
+                making a request to https://api.github.com/repos/{github_org}/{repo}/issues.
+            payload (dict):
+                The payload to post.
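+                Sent as the request's JSON body, so it must be JSON-serializable.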
+ + Returns: + requests.models.Response + """ + url = self._construct_url(github_org, repo, endpoint) + return requests.post(url, headers=self.headers, json=payload) From 5aaa62e9535f7a8faa2b6ed7ba2de5aef8cf92ea Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Mon, 28 Jul 2025 11:26:46 -0400 Subject: [PATCH 2/8] add prerelease, postrelease, devrelease --- .github/workflows/daily_collection.yaml | 12 ++++- .github/workflows/daily_summarization.yaml | 7 ++- README.md | 3 ++ pymetrics/__main__.py | 2 +- pymetrics/anaconda.py | 6 ++- pymetrics/gh_downloads.py | 45 ++++++++++-------- pymetrics/metrics.py | 54 +++++++++++++++------- pymetrics/time_utils.py | 6 +++ 8 files changed, 94 insertions(+), 41 deletions(-) diff --git a/.github/workflows/daily_collection.yaml b/.github/workflows/daily_collection.yaml index 418300f..9810d74 100644 --- a/.github/workflows/daily_collection.yaml +++ b/.github/workflows/daily_collection.yaml @@ -27,6 +27,9 @@ jobs: timeout-minutes: 25 steps: - uses: actions/checkout@v4 + with: + repository: sdv-dev/PyMetrics + token: ${{ secrets.GH_TOKEN }} - name: Install uv uses: astral-sh/setup-uv@v6 with: @@ -56,6 +59,13 @@ jobs: env: PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }} ANACONDA_OUTPUT_FOLDER: ${{ secrets.ANACONDA_OUTPUT_FOLDER }} + - name: Collect GitHub Downloads + run: | + uv run pymetrics collect-github \ + --output-folder ${{ secrets.GH_OUTPUT_FOLDER }} + env: + PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }} + GH_OUTPUT_FOLDER: ${{ secrets.GH_OUTPUT_FOLDER }} alert: needs: [collect] runs-on: ubuntu-latest @@ -77,4 +87,4 @@ jobs: -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }} \ -m 'Daily Collection PyMetrics failed :fire: :dumpster-fire: :fire:' env: - SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} + SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/daily_summarization.yaml b/.github/workflows/daily_summarization.yaml index 04350c3..305d856 100644 --- a/.github/workflows/daily_summarization.yaml +++ b/.github/workflows/daily_summarization.yaml @@ -1,4 +1,4 @@ -name: Daily Summarization +name: Daily Summarize on: workflow_dispatch: @@ -17,6 +17,9 @@ jobs: timeout-minutes: 10 steps: - uses: actions/checkout@v4 + with: + repository: sdv-dev/PyMetrics + token: ${{ secrets.GH_TOKEN }} - name: Install uv uses: astral-sh/setup-uv@v6 with: @@ -71,4 +74,4 @@ jobs: -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }} \ -m 'Daily Summarize PyMetrics failed :fire: :dumpster-fire: :fire:' env: - SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} + SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} \ No newline at end of file diff --git a/README.md b/README.md index f848eab..2136f16 100644 --- a/README.md +++ b/README.md @@ -142,6 +142,9 @@ The aggregation metrics spreasheets contain the following tabs: * **By Month and Python Version:** Absolute number of downloads per month and Python version. * **By Month and Country Code:** Absolute number of downloads per month and country. * **By Month and Installer Name:** Absolute number of downloads per month and Installer. +* **By Prerelease**: Absolute and relative number of downloads for pre-release versions (alpha, beta, release candidate, and development versions). +* **By Postrelease**: Absolute and relative number of downloads for post-release versions. +* **By Devrelease**: Absolute and relative number of downloads for development release versions. ## Known Issues 1. 
The conda package download data for Anaconda does not match the download count shown on the website. This is due to missing download data in the conda package download data. See this: https://github.com/anaconda/anaconda-package-data/issues/45
diff --git a/pymetrics/__main__.py b/pymetrics/__main__.py
index 66f6d17..49a8cae 100644
--- a/pymetrics/__main__.py
+++ b/pymetrics/__main__.py
@@ -267,7 +267,7 @@ def _get_parser():
         '-c',
         '--config-file',
         type=str,
-        default='config.yaml',
+        default='github_config.yml',
         help='Path to the configuration file.',
     )
     collect_github.add_argument(
diff --git a/pymetrics/anaconda.py b/pymetrics/anaconda.py
index f25bef2..524e8e3 100644
--- a/pymetrics/anaconda.py
+++ b/pymetrics/anaconda.py
@@ -10,7 +10,7 @@
 from tqdm import tqdm
 
 from pymetrics.output import append_row, create_csv, get_path, load_csv
-from pymetrics.time_utils import drop_duplicates_by_date
+from pymetrics.time_utils import drop_duplicates_by_date, get_current_utc
 
 LOGGER = logging.getLogger(__name__)
 dir_path = os.path.dirname(os.path.realpath(__file__))
@@ -89,7 +89,7 @@ def _get_downloads_from_anaconda_org(packages, channel='conda-forge'):
 
     for pkg_name in packages:
         URL = f'https://api.anaconda.org/package/{channel}/{pkg_name}'
-        timestamp = datetime.now(ZoneInfo('UTC'))
+        timestamp = get_current_utc()
         response = requests.get(URL)
         row_info = {'pkg_name': [pkg_name], TIME_COLUMN: [timestamp], 'total_ndownloads': 0}
         data = response.json()
@@ -158,6 +158,8 @@ def collect_anaconda_downloads(
             `start_date` has not been provided. Defaults to 90 days.
         dry_run (bool):
             If `True`, do not upload the results. Defaults to `False`.
+        verbose (bool):
+            If `True`, log the tail of the collected Anaconda data. Defaults to `False`.
     """
     overall_df, version_downloads = _collect_ananconda_downloads_from_website(
         projects, output_folder=output_folder
diff --git a/pymetrics/gh_downloads.py b/pymetrics/gh_downloads.py
index 5fefda8..205acce 100644
--- a/pymetrics/gh_downloads.py
+++ b/pymetrics/gh_downloads.py
@@ -3,15 +3,13 @@
 import logging
 import os
 from collections import defaultdict
-from datetime import datetime
-from zoneinfo import ZoneInfo
 
 import pandas as pd
 from tqdm import tqdm
 
 from pymetrics.github import GithubClient
 from pymetrics.output import append_row, create_csv, get_path, load_csv
-from pymetrics.time_utils import drop_duplicates_by_date
+from pymetrics.time_utils import drop_duplicates_by_date, get_current_utc
 
 LOGGER = logging.getLogger(__name__)
 dir_path = os.path.dirname(os.path.realpath(__file__))
@@ -21,6 +19,7 @@
 
 
 def get_previous_github_downloads(output_folder, dry_run=False):
+    """Get previous GitHub downloads."""
     csv_path = get_path(output_folder, GITHUB_DOWNLOAD_COUNT_FILENAME)
     read_csv_kwargs = {
         'parse_dates': [
@@ -42,24 +41,29 @@
 def collect_github_downloads(
     projects: dict[str, list[str]], output_folder: str, dry_run: bool = False, verbose: bool = False
 ):
+    """Pull data about the downloads of a GitHub project.
+
+    Args:
+        projects (dict[str, list[str]]):
+            List of projects to analyze. Each key is the name of the ecosystem, and
+            each value is a list of GitHub repositories (including the organization).
+        output_folder (str):
+            Folder in which project downloads will be stored.
+            It can be passed as a local folder or as a Google Drive path in the format
+            `gdrive://{folder_id}`.
+            The folder must contain 'github_download_counts.csv'.
+        dry_run (bool):
+            If `True`, do not upload the results. Defaults to `False`.
+        verbose (bool):
+            If `True`, log the tail of the collected GitHub download data. Defaults to `False`.
+    """
     overall_df = get_previous_github_downloads(output_folder=output_folder)
-    # overall_df = pd.DataFrame(
-    #     columns=[
-    #         TIME_COLUMN,
-    #         'created_at',
-    #         'ecosystem_name',
-    #         'org_repo',
-    #         'tag_name',
-    #         'prerelease',
-    #         'download_count',
-    #     ]
-    # )
 
     gh_client = GithubClient()
     download_counts = defaultdict(int)
 
-    for ecosystem_name, repositories in tqdm(projects.items(), position=2, desc='Overall'):
-        for org_repo in tqdm(repositories, position=1, desc=f'For Ecosystem: {ecosystem_name}'):
+    for ecosystem_name, repositories in projects.items():
+        for org_repo in tqdm(repositories, position=1, desc=f'Ecosystem: {ecosystem_name}'):
             pages_remain = True
             page = 1
             per_page = 100
@@ -85,18 +89,19 @@ def collect_github_downloads(
 
             # Get download count
             for release_info in tqdm(
-                release_data, position=0, desc=f'For {repo} releases, page: {page}'
+                release_data, position=0, desc=f'{repo} releases, page={page}'
             ):
                 release_id = release_info.get('id')
                 tag_name = release_info.get('tag_name')
                 prerelease = release_info.get('prerelease')
                 created_at = release_info.get('created_at')
                 endpoint = f'releases/{release_id}'
-                timestamp = datetime.now(ZoneInfo('UTC'))
+                timestamp = get_current_utc()
 
                 response = gh_client.get(github_org, repo, endpoint=endpoint)
                 data = response.json()
                 assets = data.get('assets')
+
                 tag_row = {
                     'ecosystem_name': [ecosystem_name],
                     'org_repo': [org_repo],
@@ -122,6 +127,10 @@ def collect_github_downloads(
         time_column=TIME_COLUMN,
         group_by_columns=['ecosystem_name', 'org_repo', 'tag_name'],
     )
+    if verbose:
+        LOGGER.info(f'{GITHUB_DOWNLOAD_COUNT_FILENAME} tail')
+        LOGGER.info(overall_df.tail(5).to_string())
+
     overall_df.to_csv('github_download_counts.csv', index=False)
 
     if not dry_run:
diff --git a/pymetrics/metrics.py b/pymetrics/metrics.py
index 154dcef..d2796d0 100644
--- a/pymetrics/metrics.py
+++ b/pymetrics/metrics.py
@@ -80,6 +80,9 @@ def _get_sheet_name(column):
     'OS_type',
     'cpu',
     'ci',
+    'is_prerelease',
+    'is_postrelease',
+    'is_devrelease',
 ]
 SORT_BY_DOWNLOADS = [
     'country_code',
@@ -106,6 +109,30 @@ def _get_sheet_name(column):
 ]
 
+
+def _safe_version_parse(version_str):
+    if pd.isna(version_str):
+        return np.nan
+
+    try:
+        version = Version(str(version_str))
+    except InvalidVersion:
+        cleaned = str(version_str).rstrip('+~')
+        try:
+            version = Version(cleaned)
+        except (InvalidVersion, TypeError):
+            LOGGER.info(f'Unable to parse version: {version_str}')
+            version = np.nan
+
+    return version
+
+
+def _extract_version_attribute(version_str, attribute):
+    version_obj = _safe_version_parse(version_str)
+    if isinstance(version_obj, Version):
+        return getattr(version_obj, attribute)
+    return np.nan
+
+
 def _mangle_columns(downloads):
     downloads = downloads.rename(columns=RENAME_COLUMNS)
     for col in [
@@ -124,24 +151,17 @@ def _mangle_columns(downloads):
     downloads['distro_version'] = downloads['distro_name'] + ' ' + downloads['distro_version']
     downloads['distro_kernel'] = downloads['distro_version'] + ' - ' + downloads['distro_kernel']
 
-    return downloads
-
-
-def _safe_version_parse(version_str):
-    if pd.isna(version_str):
-        return np.nan
-
-    try:
-        version = Version(str(version_str))
-    except InvalidVersion:
-        cleaned = str(version_str).rstrip('+~')
-        try:
-            version = Version(cleaned)
-        except (InvalidVersion, TypeError):
-            LOGGER.info(f'Unable to parse version: {version_str}')
-            version = np.nan
+    downloads['is_prerelease'] = downloads['version'].apply(
_extract_version_attribute, args=('is_prerelease',) + ) + downloads['is_postrelease'] = downloads['version'].apply( + _extract_version_attribute, args=('is_postrelease',) + ) + downloads['is_devrelease'] = downloads['version'].apply( + _extract_version_attribute, args=('is_devrelease',) + ) - return version + return downloads def _version_order_key(version_column): diff --git a/pymetrics/time_utils.py b/pymetrics/time_utils.py index 7e0b1e5..af4d35a 100644 --- a/pymetrics/time_utils.py +++ b/pymetrics/time_utils.py @@ -1,6 +1,7 @@ """Time utility functions.""" from datetime import datetime +from zoneinfo import ZoneInfo import pandas as pd from pandas.api.types import is_datetime64_any_dtype @@ -11,6 +12,11 @@ def get_current_year(tz=None): return datetime.now(tz=tz).year +def get_current_utc(): + """Get the current datetime UTC.""" + return datetime.now(ZoneInfo('UTC')) + + def get_first_datetime_in_year(year, tzinfo=None): """Get the first possible datetime value in a given year.""" min_date = datetime(year, day=1, month=1).date() From a3970fd31b0535baed0c2e1755da7e2cefc92119 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Mon, 28 Jul 2025 11:28:59 -0400 Subject: [PATCH 3/8] fix name --- .github/workflows/daily_summarization.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/daily_summarization.yaml b/.github/workflows/daily_summarization.yaml index 305d856..f777855 100644 --- a/.github/workflows/daily_summarization.yaml +++ b/.github/workflows/daily_summarization.yaml @@ -1,4 +1,4 @@ -name: Daily Summarize +name: Daily Summarization on: workflow_dispatch: @@ -72,6 +72,6 @@ jobs: uv run python -m pymetrics.slack_utils \ -r ${{ github.run_id }} \ -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }} \ - -m 'Daily Summarize PyMetrics failed :fire: :dumpster-fire: :fire:' + -m 'Daily Summarization PyMetrics failed :fire: :dumpster-fire: :fire:' env: SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} \ No newline at end of file From 680c76bd7f23f0f5a44d118e62d192fd6edd1114 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Mon, 28 Jul 2025 11:29:30 -0400 Subject: [PATCH 4/8] fix name --- pymetrics/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymetrics/__main__.py b/pymetrics/__main__.py index 49a8cae..c2419e6 100644 --- a/pymetrics/__main__.py +++ b/pymetrics/__main__.py @@ -258,7 +258,7 @@ def _get_parser(): help='Max days of data to pull. 
Default to last 90 days.', ) - # collect Anaconda + # collect GitHub downloads collect_github = action.add_parser( 'collect-github', help='Collect download data from GitHub.', parents=[logging_args] ) From 1f568a41a61eb539fc75c551eb05e832c065ed4b Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Mon, 28 Jul 2025 11:31:03 -0400 Subject: [PATCH 5/8] cleanup --- pymetrics/anaconda.py | 5 ++--- pymetrics/pypi.py | 5 +++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pymetrics/anaconda.py b/pymetrics/anaconda.py index 524e8e3..db1c7f1 100644 --- a/pymetrics/anaconda.py +++ b/pymetrics/anaconda.py @@ -2,8 +2,7 @@ import logging import os -from datetime import datetime, timedelta -from zoneinfo import ZoneInfo +from datetime import timedelta import pandas as pd import requests @@ -168,7 +167,7 @@ def collect_anaconda_downloads( previous = _get_previous_anaconda_downloads(output_folder, filename=PREVIOUS_ANACONDA_FILENAME) previous = previous.sort_values(TIME_COLUMN) - end_date = datetime.now(tz=ZoneInfo('UTC')).date() + end_date = get_current_utc().date() start_date = end_date - timedelta(days=max_days) LOGGER.info(f'Getting daily anaconda data for start_date>={start_date} to end_date<{end_date}') date_ranges = pd.date_range(start=start_date, end=end_date, freq='D') diff --git a/pymetrics/pypi.py b/pymetrics/pypi.py index 84d74d7..879f545 100644 --- a/pymetrics/pypi.py +++ b/pymetrics/pypi.py @@ -1,11 +1,12 @@ """Functions to get PyPI downloads from Google Big Query.""" import logging -from datetime import datetime, timedelta, timezone +from datetime import timedelta import pandas as pd from pymetrics.bq import run_query +from pymetrics.time_utils import get_current_utc LOGGER = logging.getLogger(__name__) @@ -69,7 +70,7 @@ def _get_query(projects, start_date, end_date): def _get_query_dates(start_date, min_date, max_date, max_days, force=False): - end_date = datetime.now(timezone.utc).date() + end_date = get_current_utc().date() if start_date is None: start_date = end_date - timedelta(days=max_days) From 4f80d1d36e8e5f44d8c281958b4d6ec49fcebaa1 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Tue, 29 Jul 2025 10:29:09 -0400 Subject: [PATCH 6/8] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2136f16..dc60e50 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Currently, the download data is collected from the following distributions: - Replace `{package_name}` with the specific package (`sdv`) in the Anaconda channel - For each file returned by the API endpoint, the current number of downloads is saved. Over time, a historical download recording can be built. -* [GitHub Releases](https://github.com/): Information about the project downloads from GitHub release assets. +* [GitHub Releases](https://docs.github.com/en/rest/releases): Information about the project downloads from GitHub release assets. See this [GitHub API](https://docs.github.com/en/rest/releases/releases?apiVersion=2022-11-28#get-a-release). 
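+  - For each release returned by the API, the `download_count` of every asset is summed and saved per release tag.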
# Install From d9b7b9a44d40e284cb5940472e197f72ff080199 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Tue, 29 Jul 2025 15:53:46 -0400 Subject: [PATCH 7/8] Update metrics.py --- pymetrics/metrics.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pymetrics/metrics.py b/pymetrics/metrics.py index d2796d0..96a25a8 100644 --- a/pymetrics/metrics.py +++ b/pymetrics/metrics.py @@ -112,17 +112,10 @@ def _get_sheet_name(column): def _safe_version_parse(version_str): if pd.isna(version_str): return np.nan - try: version = Version(str(version_str)) except InvalidVersion: - cleaned = str(version_str).rstrip('+~') - try: - version = Version(cleaned) - except (InvalidVersion, TypeError): - LOGGER.info(f'Unable to parse version: {version_str}') - version = np.nan - + version = np.nan return version From 75617c1f6b387d7527d156dd7d7c46ff8210a286 Mon Sep 17 00:00:00 2001 From: Gaurav Sheni Date: Tue, 29 Jul 2025 15:56:26 -0400 Subject: [PATCH 8/8] fix unit test --- tests/unit/test_metrics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_metrics.py b/tests/unit/test_metrics.py index f54ff07..8eb5fe7 100644 --- a/tests/unit/test_metrics.py +++ b/tests/unit/test_metrics.py @@ -25,17 +25,17 @@ def test__sort_by_version(): def test__sort_by_version_with_invalid_versions(): # Setup data = pd.DataFrame({ - 'version': pd.Series(['2.7.11+', '2.0.0', 'invalid', '3.0', np.nan], dtype='object'), - 'name': ['v4', 'v3', 'v2', 'v5', 'v1'], + 'version': pd.Series(['2.0.0', 'invalid', '3.0', np.nan], dtype='object'), + 'name': ['v3', 'v2', 'v4', 'v1'], }) # Run sorted_df = _sort_by_version(data, 'version') # Assert - expected_versions = ['3.0', '2.7.11+', '2.0.0', 'invalid', np.nan] + expected_versions = ['3.0', '2.0.0', 'invalid', np.nan] assert sorted_df['version'].tolist() == expected_versions - assert sorted_df['name'].tolist() == ['v5', 'v4', 'v3', 'v2', 'v1'] + assert sorted_df['name'].tolist() == ['v4', 'v3', 'v2', 'v1'] def test__sort_by_version_with_mixed_version_formats():
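---

Note on the interaction between patches 7 and 8: with the rstrip('+~') fallback removed
from _safe_version_parse, a string like '2.7.11+' (an empty PEP 440 local version segment)
raises InvalidVersion and becomes NaN instead of being recovered as '2.7.11', which is why
the unit test no longer includes that value. A minimal standalone sketch of the simplified
behavior, assuming packaging's PEP 440 semantics:

    import numpy as np
    import pandas as pd
    from packaging.version import InvalidVersion, Version

    def safe_version_parse(version_str):
        # Mirrors the simplified _safe_version_parse from patch 7.
        if pd.isna(version_str):
            return np.nan
        try:
            return Version(str(version_str))
        except InvalidVersion:
            return np.nan

    print(safe_version_parse('3.0'))      # 3.0
    print(safe_version_parse('2.7.11+'))  # nan (previously recovered as 2.7.11)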