diff --git a/.github/workflows/daily_collection.yaml b/.github/workflows/daily_collection.yaml
index 418300f..9810d74 100644
--- a/.github/workflows/daily_collection.yaml
+++ b/.github/workflows/daily_collection.yaml
@@ -27,6 +27,9 @@ jobs:
     timeout-minutes: 25
     steps:
       - uses: actions/checkout@v4
+        with:
+          repository: sdv-dev/PyMetrics
+          token: ${{ secrets.GH_TOKEN }}
       - name: Install uv
         uses: astral-sh/setup-uv@v6
         with:
@@ -56,6 +59,13 @@ jobs:
         env:
           PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
           ANACONDA_OUTPUT_FOLDER: ${{ secrets.ANACONDA_OUTPUT_FOLDER }}
+      - name: Collect GitHub Downloads
+        run: |
+          uv run pymetrics collect-github \
+            --output-folder ${{ secrets.GH_OUTPUT_FOLDER }}
+        env:
+          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
+          GH_OUTPUT_FOLDER: ${{ secrets.GH_OUTPUT_FOLDER }}
   alert:
     needs: [collect]
     runs-on: ubuntu-latest
@@ -77,4 +87,4 @@ jobs:
           -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }} \
           -m 'Daily Collection PyMetrics failed :fire: :dumpster-fire: :fire:'
         env:
-          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
+          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
\ No newline at end of file
diff --git a/.github/workflows/daily_summarization.yaml b/.github/workflows/daily_summarization.yaml
index 04350c3..f777855 100644
--- a/.github/workflows/daily_summarization.yaml
+++ b/.github/workflows/daily_summarization.yaml
@@ -17,6 +17,9 @@ jobs:
     timeout-minutes: 10
     steps:
      - uses: actions/checkout@v4
+        with:
+          repository: sdv-dev/PyMetrics
+          token: ${{ secrets.GH_TOKEN }}
      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
@@ -69,6 +72,6 @@ jobs:
           uv run python -m pymetrics.slack_utils \
             -r ${{ github.run_id }} \
             -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }} \
-            -m 'Daily Summarize PyMetrics failed :fire: :dumpster-fire: :fire:'
+            -m 'Daily Summarization PyMetrics failed :fire: :dumpster-fire: :fire:'
         env:
-          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
+          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
\ No newline at end of file
diff --git a/README.md b/README.md
index 6282187..dc60e50 100644
--- a/README.md
+++ b/README.md
@@ -44,9 +44,8 @@ Currently, the download data is collected from the following distributions:
   - Replace `{package_name}` with the specific package (`sdv`) in the Anaconda channel
   - For each file returned by the API endpoint, the current number of downloads is saved.
     Over time, a historical download recording can be built.
-### Future Data Sources
-In the future, we may expand the source distributions to include:
-* [GitHub Releases](https://github.com/): Information about the project downloads from GitHub releases.
+* [GitHub Releases](https://docs.github.com/en/rest/releases): Information about the project downloads from GitHub release assets.
+  See the [GitHub API documentation](https://docs.github.com/en/rest/releases/releases?apiVersion=2022-11-28#get-a-release).
 
 # Install
 Install pymetrics using pip (or uv):
@@ -143,6 +142,9 @@ The aggregation metrics spreadsheets contain the following tabs:
 * **By Month and Python Version:** Absolute number of downloads per month and Python version.
 * **By Month and Country Code:** Absolute number of downloads per month and country.
 * **By Month and Installer Name:** Absolute number of downloads per month and Installer.
+* **By Prerelease:** Absolute and relative number of downloads for pre-release versions (alpha, beta, release candidate, and development versions).
+* **By Postrelease:** Absolute and relative number of downloads for post-release versions.
+* **By Devrelease:** Absolute and relative number of downloads for development release versions.
 
 ## Known Issues
 1. The conda package download data for Anaconda does not match the download count shown on the website. This is due to missing download data in the conda package download data. See this: https://github.com/anaconda/anaconda-package-data/issues/45
diff --git a/github_config.yml b/github_config.yml
new file mode 100644
index 0000000..47c00e0
--- /dev/null
+++ b/github_config.yml
@@ -0,0 +1,28 @@
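+# Maps each ecosystem name to the list of GitHub repositories (org/repo)
+# whose release-asset download counts should be collected.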
+projects:
+  sdv-dev:
+    - sdv-dev/SDV
+    - sdv-dev/RDT
+    - sdv-dev/SDMetrics
+    - sdv-dev/SDGym
+    - sdv-dev/Copulas
+    - sdv-dev/CTGAN
+    - sdv-dev/DeepEcho
+  gretel:
+    - gretelai/gretel-python-client
+    - gretelai/trainer
+    - gretelai/gretel-synthetics
+  mostly-ai:
+    - mostly-ai/mostlyai
+    - mostly-ai/mostlyai-mock
+  ydata:
+    - ydataai/ydata-synthetic
+    - ydataai/ydata-quality
+    - ydataai/ydata-fabric-sdk
+  realtabformer:
+    - worldbank/REaLTabFormer
+  synthcity:
+    - vanderschaarlab/synthcity
+  smartnoise-sdk:
+    - opendp/smartnoise-sdk
+  be_great:
+    - kathrinse/be_great
\ No newline at end of file
diff --git a/pymetrics/__main__.py b/pymetrics/__main__.py
index 8b925c4..c2419e6 100644
--- a/pymetrics/__main__.py
+++ b/pymetrics/__main__.py
@@ -10,6 +10,7 @@ import yaml
 
 from pymetrics.anaconda import collect_anaconda_downloads
+from pymetrics.gh_downloads import collect_github_downloads
 from pymetrics.main import collect_pypi_downloads
 from pymetrics.summarize import summarize_downloads
 
 
@@ -76,6 +77,19 @@ def _collect_anaconda(args):
     )
 
 
+def _collect_github(args):
+    config = _load_config(args.config_file)
+    projects = config['projects']
+    output_folder = args.output_folder
+
+    collect_github_downloads(
+        projects=projects,
+        output_folder=output_folder,
+        dry_run=args.dry_run,
+        verbose=args.verbose,
+    )
+
+
 def _summarize(args):
     config = _load_config(args.config_file)
     projects = config['projects']
@@ -243,6 +257,29 @@ def _get_parser():
         default=90,
         help='Max days of data to pull. Default to last 90 days.',
     )
+
+    # collect GitHub downloads
+    collect_github = action.add_parser(
+        'collect-github', help='Collect download data from GitHub.', parents=[logging_args]
+    )
+    collect_github.set_defaults(action=_collect_github)
+    collect_github.add_argument(
+        '-c',
+        '--config-file',
+        type=str,
+        default='github_config.yml',
+        help='Path to the configuration file.',
+    )
+    collect_github.add_argument(
+        '-o',
+        '--output-folder',
+        type=str,
+        required=True,
+        help=(
+            'Path to the folder where data will be written. It can be a local path or a'
+            ' Google Drive folder path in the format gdrive://{folder_id}.'
+        ),
+    )
     return parser
 
 
diff --git a/pymetrics/anaconda.py b/pymetrics/anaconda.py
index f25bef2..db1c7f1 100644
--- a/pymetrics/anaconda.py
+++ b/pymetrics/anaconda.py
@@ -2,15 +2,14 @@
 
 import logging
 import os
-from datetime import datetime, timedelta
-from zoneinfo import ZoneInfo
+from datetime import timedelta
 
 import pandas as pd
 import requests
 from tqdm import tqdm
 
 from pymetrics.output import append_row, create_csv, get_path, load_csv
-from pymetrics.time_utils import drop_duplicates_by_date
+from pymetrics.time_utils import drop_duplicates_by_date, get_current_utc
 
 LOGGER = logging.getLogger(__name__)
 dir_path = os.path.dirname(os.path.realpath(__file__))
@@ -89,7 +88,7 @@ def _get_downloads_from_anaconda_org(packages, channel='conda-forge'):
     for pkg_name in packages:
         URL = f'https://api.anaconda.org/package/{channel}/{pkg_name}'
-        timestamp = datetime.now(ZoneInfo('UTC'))
+        timestamp = get_current_utc()
         response = requests.get(URL)
         row_info = {'pkg_name': [pkg_name], TIME_COLUMN: [timestamp], 'total_ndownloads': 0}
         data = response.json()
@@ -158,6 +157,8 @@ def collect_anaconda_downloads(
             `start_date` has not been provided. Defaults to 90 days.
         dry_run (bool):
             If `True`, do not upload the results. Defaults to `False`.
+        verbose (bool):
+            If `True`, log the tails of the collected Anaconda dataframes. Defaults to `False`.
     """
     overall_df, version_downloads = _collect_ananconda_downloads_from_website(
         projects, output_folder=output_folder
@@ -166,7 +167,7 @@ def collect_anaconda_downloads(
     )
     previous = _get_previous_anaconda_downloads(output_folder, filename=PREVIOUS_ANACONDA_FILENAME)
     previous = previous.sort_values(TIME_COLUMN)
 
-    end_date = datetime.now(tz=ZoneInfo('UTC')).date()
+    end_date = get_current_utc().date()
     start_date = end_date - timedelta(days=max_days)
     LOGGER.info(f'Getting daily anaconda data for start_date>={start_date} to end_date<{end_date}')
     date_ranges = pd.date_range(start=start_date, end=end_date, freq='D')
diff --git a/pymetrics/gh_downloads.py b/pymetrics/gh_downloads.py
new file mode 100644
index 0000000..205acce
--- /dev/null
+++ b/pymetrics/gh_downloads.py
@@ -0,0 +1,138 @@
+"""Functions to collect GitHub release download counts."""
+
+import logging
+import os
+
+import pandas as pd
+from tqdm import tqdm
+
+from pymetrics.github import GithubClient
+from pymetrics.output import append_row, create_csv, get_path, load_csv
+from pymetrics.time_utils import drop_duplicates_by_date, get_current_utc
+
+LOGGER = logging.getLogger(__name__)
+dir_path = os.path.dirname(os.path.realpath(__file__))
+TIME_COLUMN = 'timestamp'
+
+GITHUB_DOWNLOAD_COUNT_FILENAME = 'github_download_counts.csv'
+
+
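+# One CSV accumulates the collection history; each run appends fresh rows and
+# duplicates within the same day are dropped per release (see
+# `drop_duplicates_by_date` below). The explicit dtypes keep repeated string
+# columns categorical and let the nullable boolean/integer columns survive
+# round-trips through CSV.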
+def get_previous_github_downloads(output_folder):
+    """Get previously collected GitHub download counts."""
+    csv_path = get_path(output_folder, GITHUB_DOWNLOAD_COUNT_FILENAME)
+    read_csv_kwargs = {
+        'parse_dates': [
+            TIME_COLUMN,
+            'created_at',
+        ],
+        'dtype': {
+            'ecosystem_name': pd.CategoricalDtype(),
+            'org_repo': pd.CategoricalDtype(),
+            'tag_name': pd.CategoricalDtype(),
+            'prerelease': pd.BooleanDtype(),
+            'download_count': pd.Int64Dtype(),
+        },
+    }
+    data = load_csv(csv_path, read_csv_kwargs=read_csv_kwargs)
+    return data
+
+
+def collect_github_downloads(
+    projects: dict[str, list[str]], output_folder: str, dry_run: bool = False, verbose: bool = False
+):
+    """Pull download data for the GitHub releases of the configured projects.
+
+    Args:
+        projects (dict[str, list[str]]):
+            Projects to analyze. Each key is the name of an ecosystem, and each
+            value is a list of GitHub repositories in `{organization}/{repo}` format.
+        output_folder (str):
+            Folder in which project downloads will be stored.
+            It can be passed as a local folder or as a Google Drive path in the format
+            `gdrive://{folder_id}`.
+            The folder must contain 'github_download_counts.csv'.
+        dry_run (bool):
+            If `True`, do not upload the results. Defaults to `False`.
+        verbose (bool):
+            If `True`, log the tail of the collected GitHub download data. Defaults to `False`.
+    """
+    overall_df = get_previous_github_downloads(output_folder=output_folder)
+
+    gh_client = GithubClient()
+
+    for ecosystem_name, repositories in projects.items():
+        for org_repo in tqdm(repositories, position=1, desc=f'Ecosystem: {ecosystem_name}'):
+            page = 1
+            per_page = 100
+            github_org, repo = org_repo.split('/')
+
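+            # The releases list endpoint is paginated; fetch up to `per_page`
+            # releases at a time and keep requesting pages while the response's
+            # `Link` header advertises a rel="next" page.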
+            while True:
+                response = gh_client.get(
+                    github_org,
+                    repo,
+                    endpoint='releases',
+                    query_params={'per_page': per_page, 'page': page},
+                )
+                if response.status_code == 404:
+                    LOGGER.debug(f'Skipping {org_repo} because the org/repo does not exist')
+                    break
+
+                release_data = response.json()
+                link_header = response.headers.get('link')
+
+                # Get download count for each release on this page
+                for release_info in tqdm(
+                    release_data, position=0, desc=f'{repo} releases, page={page}'
+                ):
+                    release_id = release_info.get('id')
+                    tag_name = release_info.get('tag_name')
+                    prerelease = release_info.get('prerelease')
+                    created_at = release_info.get('created_at')
+                    endpoint = f'releases/{release_id}'
+
+                    # Re-fetch the release so the asset download counts are
+                    # current at collection time.
+                    timestamp = get_current_utc()
+                    release_response = gh_client.get(github_org, repo, endpoint=endpoint)
+                    data = release_response.json()
+                    assets = data.get('assets')
+
+                    tag_row = {
+                        'ecosystem_name': [ecosystem_name],
+                        'org_repo': [org_repo],
+                        'timestamp': [timestamp],
+                        'tag_name': [tag_name],
+                        'prerelease': [prerelease],
+                        'created_at': [created_at],
+                        'download_count': 0,
+                    }
+                    if assets:
+                        for asset in assets:
+                            tag_row['download_count'] += asset.get('download_count', 0)
+
+                    overall_df = append_row(overall_df, tag_row)
+
+                # Check pagination
+                if link_header and 'rel="next"' in link_header:
+                    page += 1
+                else:
+                    break
+
+    overall_df = drop_duplicates_by_date(
+        overall_df,
+        time_column=TIME_COLUMN,
+        group_by_columns=['ecosystem_name', 'org_repo', 'tag_name'],
+    )
+    if verbose:
+        LOGGER.info(f'{GITHUB_DOWNLOAD_COUNT_FILENAME} tail')
+        LOGGER.info(overall_df.tail(5).to_string())
+
+    overall_df.to_csv(GITHUB_DOWNLOAD_COUNT_FILENAME, index=False)
+
+    if not dry_run:
+        gfolder_path = f'{output_folder}/{GITHUB_DOWNLOAD_COUNT_FILENAME}'
+        create_csv(output_path=gfolder_path, data=overall_df)
diff --git a/pymetrics/github.py b/pymetrics/github.py
new file mode 100644
index 0000000..db8f974
--- /dev/null
+++ b/pymetrics/github.py
@@ -0,0 +1,80 @@
+"""Clients for making requests to GitHub APIs."""
+
+import os
+
+import requests
+
+
+class BaseClient:
+    """Base GitHub client."""
+
+    def __init__(self):
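+        # Authenticate with a personal access token read from the
+        # GH_ACCESS_TOKEN environment variable; unauthenticated requests are
+        # subject to much stricter GitHub API rate limits.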
+        token = os.getenv('GH_ACCESS_TOKEN')
+        self.headers = {
+            'Authorization': f'Bearer {token}',
+            'Accept': 'application/vnd.github+json',
+            'X-GitHub-Api-Version': '2022-11-28',
+        }
+
+
+class GithubClient(BaseClient):
+    """Client for the GitHub REST API."""
+
+    def __init__(self):
+        super().__init__()
+        self.base_url = 'https://api.github.com/repos'
+
+    def _construct_url(
+        self, github_org: str, repo: str, resource: str, resource_id: str | None = None
+    ):
+        url = f'{self.base_url}/{github_org}/{repo}/{resource}'
+        if resource_id:
+            url += f'/{resource_id}'
+        return url
+
+    def get(
+        self,
+        github_org: str,
+        repo: str,
+        endpoint: str,
+        query_params: dict | None = None,
+        timeout: int | None = None,
+    ):
+        """Get a resource from an endpoint in the GitHub API.
+
+        Args:
+            github_org (str):
+                The name of the GitHub organization to search.
+            repo (str):
+                The name of the repository to search.
+            endpoint (str):
+                The endpoint for the resource. For example, `issues/{issue_number}`, which
+                makes a request to
+                https://api.github.com/repos/{github_org}/{repo}/issues/{issue_number}.
+            query_params (dict):
+                A dictionary mapping query parameters to the desired values. Defaults to `None`.
+            timeout (int):
+                How long to wait, in seconds, before the request times out. Defaults to `None`.
+
+        Returns:
+            requests.models.Response
+        """
+        url = self._construct_url(github_org, repo, endpoint)
+        return requests.get(url, headers=self.headers, params=query_params, timeout=timeout)
+
+    def post(self, github_org: str, repo: str, endpoint: str, payload: dict):
+        """Post to an endpoint in the GitHub API.
+
+        Args:
+            github_org (str):
+                The name of the GitHub organization to search.
+            repo (str):
+                The name of the repository to search.
+            endpoint (str):
+                The endpoint for the resource. For example, `issues`, which makes a
+                request to https://api.github.com/repos/{github_org}/{repo}/issues.
+            payload (dict):
+                The payload to post.
+
+        Returns:
+            requests.models.Response
+        """
+        url = self._construct_url(github_org, repo, endpoint)
+        return requests.post(url, headers=self.headers, json=payload)
diff --git a/pymetrics/metrics.py b/pymetrics/metrics.py
index 154dcef..96a25a8 100644
--- a/pymetrics/metrics.py
+++ b/pymetrics/metrics.py
@@ -80,6 +80,9 @@ def _get_sheet_name(column):
     'OS_type',
     'cpu',
     'ci',
+    'is_prerelease',
+    'is_postrelease',
+    'is_devrelease',
 ]
 SORT_BY_DOWNLOADS = [
     'country_code',
@@ -106,6 +109,23 @@ def _get_sheet_name(column):
 ]
 
 
+def _safe_version_parse(version_str):
+    if pd.isna(version_str):
+        return np.nan
+    try:
+        version = Version(str(version_str))
+    except InvalidVersion:
+        version = np.nan
+    return version
+
+
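+# `is_prerelease`, `is_postrelease` and `is_devrelease` are attributes of
+# `packaging.version.Version` (PEP 440), so a single getattr-based helper can
+# classify any parseable version string.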
+def _extract_version_attribute(version_str, attribute):
+    version_obj = _safe_version_parse(version_str)
+    if isinstance(version_obj, Version):
+        return getattr(version_obj, attribute)
+    return np.nan
+
+
 def _mangle_columns(downloads):
     downloads = downloads.rename(columns=RENAME_COLUMNS)
     for col in [
@@ -124,24 +144,17 @@ def _mangle_columns(downloads):
     downloads['distro_version'] = downloads['distro_name'] + ' ' + downloads['distro_version']
     downloads['distro_kernel'] = downloads['distro_version'] + ' - ' + downloads['distro_kernel']
 
-    return downloads
+    downloads['is_prerelease'] = downloads['version'].apply(
+        _extract_version_attribute, args=('is_prerelease',)
+    )
+    downloads['is_postrelease'] = downloads['version'].apply(
+        _extract_version_attribute, args=('is_postrelease',)
+    )
+    downloads['is_devrelease'] = downloads['version'].apply(
+        _extract_version_attribute, args=('is_devrelease',)
+    )
 
-
-def _safe_version_parse(version_str):
-    if pd.isna(version_str):
-        return np.nan
-
-    try:
-        version = Version(str(version_str))
-    except InvalidVersion:
-        cleaned = str(version_str).rstrip('+~')
-        try:
-            version = Version(cleaned)
-        except (InvalidVersion, TypeError):
-            LOGGER.info(f'Unable to parse version: {version_str}')
-            version = np.nan
-
-    return version
+    return downloads
 
 
 def _version_order_key(version_column):
diff --git a/pymetrics/pypi.py b/pymetrics/pypi.py
index 84d74d7..879f545 100644
--- a/pymetrics/pypi.py
+++ b/pymetrics/pypi.py
@@ -1,11 +1,12 @@
 """Functions to get PyPI downloads from Google Big Query."""
 
 import logging
-from datetime import datetime, timedelta, timezone
+from datetime import timedelta
 
 import pandas as pd
 
 from pymetrics.bq import run_query
+from pymetrics.time_utils import get_current_utc
 
 LOGGER = logging.getLogger(__name__)
 
@@ -69,7 +70,7 @@ def _get_query(projects, start_date, end_date):
 
 def _get_query_dates(start_date, min_date, max_date, max_days, force=False):
-    end_date = datetime.now(timezone.utc).date()
+    end_date = get_current_utc().date()
 
     if start_date is None:
         start_date = end_date - timedelta(days=max_days)
diff --git a/pymetrics/time_utils.py b/pymetrics/time_utils.py
index 7e0b1e5..af4d35a 100644
--- a/pymetrics/time_utils.py
+++ b/pymetrics/time_utils.py
@@ -1,6 +1,7 @@
 """Time utility functions."""
 
 from datetime import datetime
+from zoneinfo import ZoneInfo
 
 import pandas as pd
 from pandas.api.types import is_datetime64_any_dtype
@@ -11,6 +12,11 @@ def get_current_year(tz=None):
     return datetime.now(tz=tz).year
 
 
+def get_current_utc():
+    """Get the current datetime in UTC."""
+    return datetime.now(ZoneInfo('UTC'))
+
+
 def get_first_datetime_in_year(year, tzinfo=None):
     """Get the first possible datetime value in a given year."""
     min_date = datetime(year, day=1, month=1).date()
diff --git a/tests/unit/test_metrics.py b/tests/unit/test_metrics.py
index f54ff07..8eb5fe7 100644
--- a/tests/unit/test_metrics.py
+++ b/tests/unit/test_metrics.py
@@ -25,17 +25,17 @@ def test__sort_by_version():
 def test__sort_by_version_with_invalid_versions():
     # Setup
     data = pd.DataFrame({
-        'version': pd.Series(['2.7.11+', '2.0.0', 'invalid', '3.0', np.nan], dtype='object'),
-        'name': ['v4', 'v3', 'v2', 'v5', 'v1'],
+        'version': pd.Series(['2.0.0', 'invalid', '3.0', np.nan], dtype='object'),
+        'name': ['v3', 'v2', 'v4', 'v1'],
     })
 
     # Run
     sorted_df = _sort_by_version(data, 'version')
 
     # Assert
-    expected_versions = ['3.0', '2.7.11+', '2.0.0', 'invalid', np.nan]
+    expected_versions = ['3.0', '2.0.0', 'invalid', np.nan]
     assert sorted_df['version'].tolist() == expected_versions
-    assert sorted_df['name'].tolist() == ['v5', 'v4', 'v3', 'v2', 'v1']
+    assert sorted_df['name'].tolist() == ['v4', 'v3', 'v2', 'v1']
 
 
 def test__sort_by_version_with_mixed_version_formats():