12 changes: 11 additions & 1 deletion .github/workflows/daily_collection.yaml
@@ -27,6 +27,9 @@ jobs:
    timeout-minutes: 25
    steps:
      - uses: actions/checkout@v4
        with:
          repository: sdv-dev/PyMetrics
          token: ${{ secrets.GH_TOKEN }}
      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
@@ -56,6 +59,13 @@ jobs:
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
          ANACONDA_OUTPUT_FOLDER: ${{ secrets.ANACONDA_OUTPUT_FOLDER }}
      - name: Collect GitHub Downloads
        run: |
          uv run pymetrics collect-github \
            --output-folder ${{ secrets.GH_OUTPUT_FOLDER }}
        env:
          PYDRIVE_CREDENTIALS: ${{ secrets.PYDRIVE_CREDENTIALS }}
          GH_OUTPUT_FOLDER: ${{ secrets.GH_OUTPUT_FOLDER }}
  alert:
    needs: [collect]
    runs-on: ubuntu-latest
@@ -77,4 +87,4 @@ jobs:
            -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }} \
            -m 'Daily Collection PyMetrics failed :fire: :dumpster-fire: :fire:'
        env:
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
7 changes: 5 additions & 2 deletions .github/workflows/daily_summarization.yaml
@@ -17,6 +17,9 @@ jobs:
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@v4
        with:
          repository: sdv-dev/PyMetrics
          token: ${{ secrets.GH_TOKEN }}
      - name: Install uv
        uses: astral-sh/setup-uv@v6
        with:
@@ -69,6 +72,6 @@
          uv run python -m pymetrics.slack_utils \
            -r ${{ github.run_id }} \
            -c ${{ github.event.inputs.slack_channel || 'sdv-alerts' }} \
            -m 'Daily Summarize PyMetrics failed :fire: :dumpster-fire: :fire:'
            -m 'Daily Summarization PyMetrics failed :fire: :dumpster-fire: :fire:'
        env:
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
8 changes: 5 additions & 3 deletions README.md
@@ -44,9 +44,8 @@ Currently, the download data is collected from the following distributions:
- Replace `{package_name}` with the specific package (`sdv`) in the Anaconda channel
- For each file returned by the API endpoint, the current number of downloads is saved. Over time, a historical download record can be built (see the sketch below).
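
As a rough illustration, here is a minimal sketch of that call. The `files`/`ndownloads` field names are an assumption about the response shape; verify them against the live API:

```python
# Sketch: point-in-time download totals from the Anaconda.org package API.
# Assumes the response JSON carries a `files` list with a per-file
# `ndownloads` counter; field names may differ from the live API.
import requests

channel = 'conda-forge'
package_name = 'sdv'  # the package to track

url = f'https://api.anaconda.org/package/{channel}/{package_name}'
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

files = data.get('files', [])
total_downloads = sum(file_info.get('ndownloads', 0) for file_info in files)
print(f'{package_name}: {total_downloads} downloads across {len(files)} files')
```

Saving this total on a schedule (as the daily workflow does) is what turns these point-in-time counters into a history.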

### Future Data Sources
In the future, we may expand the source distributions to include:
* [GitHub Releases](https://github.com/): Information about the project downloads from GitHub releases.
* [GitHub Releases](https://docs.github.com/en/rest/releases): Information about the project downloads from GitHub release assets.
See the [GitHub API documentation](https://docs.github.com/en/rest/releases/releases?apiVersion=2022-11-28#get-a-release).
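
To make the shape of that endpoint concrete, here is a minimal sketch that sums asset download counts across a repository's releases (unauthenticated for brevity; authenticated requests get higher rate limits):

```python
# Sketch: summing release-asset download counts via the GitHub REST API.
# Each release's `assets` entries expose a cumulative `download_count`.
import requests

owner, repo = 'sdv-dev', 'SDV'
url = f'https://api.github.com/repos/{owner}/{repo}/releases'

total = 0
page = 1
while True:
    response = requests.get(url, params={'per_page': 100, 'page': page}, timeout=30)
    response.raise_for_status()
    releases = response.json()
    if not releases:
        break  # past the last page
    for release in releases:
        total += sum(asset.get('download_count', 0) for asset in release.get('assets', []))
    page += 1

print(f'{owner}/{repo}: {total} release-asset downloads')
```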

# Install
Install pymetrics using pip (or uv):
@@ -143,6 +142,9 @@ The aggregation metrics spreadsheets contain the following tabs:
* **By Month and Python Version:** Absolute number of downloads per month and Python version.
* **By Month and Country Code:** Absolute number of downloads per month and country.
* **By Month and Installer Name:** Absolute number of downloads per month and Installer.
* **By Prerelease:** Absolute and relative number of downloads for pre-release versions (alpha, beta, release candidate, and development versions).
* **By Postrelease:** Absolute and relative number of downloads for post-release versions.
* **By Devrelease:** Absolute and relative number of downloads for development release versions (see the classification sketch below).
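
These categories follow PEP 440. As a sketch of how a version string could be bucketed, using the `packaging` library (an assumption; the actual pipeline may classify versions differently):

```python
# Sketch: bucketing PEP 440 version strings into the tabs above.
# Note: `packaging` marks dev releases as prereleases too, so check dev first.
from packaging.version import InvalidVersion, Version

def classify(version_string):
    try:
        version = Version(version_string)
    except InvalidVersion:
        return 'unparseable'
    if version.is_devrelease:
        return 'devrelease'   # e.g. 1.2.0.dev1
    if version.is_prerelease:
        return 'prerelease'   # e.g. 1.2.0a1, 1.2.0b2, 1.2.0rc1
    if version.is_postrelease:
        return 'postrelease'  # e.g. 1.2.0.post1
    return 'final'

for version_string in ['1.2.0', '1.2.0rc1', '1.2.0.post1', '1.2.0.dev1']:
    print(version_string, '->', classify(version_string))
```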

## Known Issues
1. The conda download counts collected for Anaconda do not match the download count shown on the website, because some downloads are missing from the upstream conda package dataset. See https://github.com/anaconda/anaconda-package-data/issues/45
28 changes: 28 additions & 0 deletions github_config.yml
@@ -0,0 +1,28 @@
projects:
  sdv-dev:
    - sdv-dev/SDV
    - sdv-dev/RDT
    - sdv-dev/SDMetrics
    - sdv-dev/SDGym
    - sdv-dev/Copulas
    - sdv-dev/CTGAN
    - sdv-dev/DeepEcho
  gretel:
    - gretelai/gretel-python-client
    - gretelai/trainer
    - gretelai/gretel-synthetics
  mostly-ai:
    - mostly-ai/mostlyai
    - mostly-ai/mostlyai-mock
  ydata:
    - ydataai/ydata-synthetic
    - ydataai/ydata-quality
    - ydataai/ydata-fabric-sdk
  realtabformer:
    - worldbank/REaLTabFormer
  synthcity:
    - vanderschaarlab/synthcity
  smartnoise-sdk:
    - opendp/smartnoise-sdk
  be_great:
    - kathrinse/be_great
37 changes: 37 additions & 0 deletions pymetrics/__main__.py
@@ -10,6 +10,7 @@
import yaml

from pymetrics.anaconda import collect_anaconda_downloads
from pymetrics.gh_downloads import collect_github_downloads
from pymetrics.main import collect_pypi_downloads
from pymetrics.summarize import summarize_downloads

@@ -76,6 +77,19 @@ def _collect_anaconda(args):
    )


def _collect_github(args):
    config = _load_config(args.config_file)
    projects = config['projects']
    output_folder = args.output_folder

    collect_github_downloads(
        projects=projects,
        output_folder=output_folder,
        dry_run=args.dry_run,
        verbose=args.verbose,
    )


def _summarize(args):
    config = _load_config(args.config_file)
    projects = config['projects']
@@ -243,6 +257,29 @@ def _get_parser():
        default=90,
        help='Max days of data to pull. Default to last 90 days.',
    )

    # collect GitHub downloads
    collect_github = action.add_parser(
        'collect-github', help='Collect download data from GitHub.', parents=[logging_args]
    )
    collect_github.set_defaults(action=_collect_github)
    collect_github.add_argument(
        '-c',
        '--config-file',
        type=str,
        default='github_config.yml',
        help='Path to the configuration file.',
    )
    collect_github.add_argument(
        '-o',
        '--output-folder',
        type=str,
        required=True,
        help=(
            'Path to the folder where data will be written. It can be a local path or a'
            ' Google Drive folder path in the format gdrive://<folder-id>'
        ),
    )
    return parser


11 changes: 6 additions & 5 deletions pymetrics/anaconda.py
@@ -2,15 +2,14 @@

import logging
import os
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo
from datetime import timedelta

import pandas as pd
import requests
from tqdm import tqdm

from pymetrics.output import append_row, create_csv, get_path, load_csv
from pymetrics.time_utils import drop_duplicates_by_date
from pymetrics.time_utils import drop_duplicates_by_date, get_current_utc

LOGGER = logging.getLogger(__name__)
dir_path = os.path.dirname(os.path.realpath(__file__))
@@ -89,7 +88,7 @@ def _get_downloads_from_anaconda_org(packages, channel='conda-forge'):

    for pkg_name in packages:
        URL = f'https://api.anaconda.org/package/{channel}/{pkg_name}'
        timestamp = datetime.now(ZoneInfo('UTC'))
        timestamp = get_current_utc()
        response = requests.get(URL)
        row_info = {'pkg_name': [pkg_name], TIME_COLUMN: [timestamp], 'total_ndownloads': 0}
        data = response.json()
@@ -158,6 +157,8 @@ def collect_anaconda_downloads(
            `start_date` has not been provided. Defaults to 90 days.
        dry_run (bool):
            If `True`, do not upload the results. Defaults to `False`.
        verbose (bool):
            If `True`, log the tail of the collected Anaconda dataframes. Defaults to `False`.
    """
    overall_df, version_downloads = _collect_ananconda_downloads_from_website(
        projects, output_folder=output_folder
@@ -166,7 +167,7 @@
    previous = _get_previous_anaconda_downloads(output_folder, filename=PREVIOUS_ANACONDA_FILENAME)
    previous = previous.sort_values(TIME_COLUMN)

    end_date = datetime.now(tz=ZoneInfo('UTC')).date()
    end_date = get_current_utc().date()
    start_date = end_date - timedelta(days=max_days)
    LOGGER.info(f'Getting daily anaconda data for start_date>={start_date} to end_date<{end_date}')
    date_ranges = pd.date_range(start=start_date, end=end_date, freq='D')
138 changes: 138 additions & 0 deletions pymetrics/gh_downloads.py
@@ -0,0 +1,138 @@
"""Functions to get GitHub downloads from GitHub."""

import logging
import os
from collections import defaultdict

import pandas as pd
from tqdm import tqdm

from pymetrics.github import GithubClient
from pymetrics.output import append_row, create_csv, get_path, load_csv
from pymetrics.time_utils import drop_duplicates_by_date, get_current_utc

LOGGER = logging.getLogger(__name__)
dir_path = os.path.dirname(os.path.realpath(__file__))
TIME_COLUMN = 'timestamp'

GITHUB_DOWNLOAD_COUNT_FILENAME = 'github_download_counts.csv'


def get_previous_github_downloads(output_folder, dry_run=False):
    """Get previous GitHub Downloads."""
    csv_path = get_path(output_folder, GITHUB_DOWNLOAD_COUNT_FILENAME)
    read_csv_kwargs = {
        'parse_dates': [
            TIME_COLUMN,
            'created_at',
        ],
        'dtype': {
            'ecosystem_name': pd.CategoricalDtype(),
            'org_repo': pd.CategoricalDtype(),
            'tag_name': pd.CategoricalDtype(),
            'prerelease': pd.BooleanDtype(),
            'download_count': pd.Int64Dtype(),
        },
    }
    data = load_csv(csv_path, read_csv_kwargs=read_csv_kwargs)
    return data


def collect_github_downloads(
    projects: dict[str, list[str]], output_folder: str, dry_run: bool = False, verbose: bool = False
):
    """Pull data about the downloads of a GitHub project.

    Args:
        projects (dict[str, list[str]]):
            Projects to analyze. Each key is the name of an ecosystem, and
            each value is a list of GitHub repositories (including the organization).
        output_folder (str):
            Folder in which project downloads will be stored.
            It can be passed as a local folder or as a Google Drive path in the format
            `gdrive://{folder_id}`.
            The folder must contain 'github_download_counts.csv'.
        dry_run (bool):
            If `True`, do not upload the results. Defaults to `False`.
        verbose (bool):
            If `True`, log the tail of the GitHub download dataframe. Defaults to `False`.
    """
    overall_df = get_previous_github_downloads(output_folder=output_folder)

    gh_client = GithubClient()
    download_counts = defaultdict(int)

    for ecosystem_name, repositories in projects.items():
        for org_repo in tqdm(repositories, position=1, desc=f'Ecosystem: {ecosystem_name}'):
            pages_remain = True
            page = 1
            per_page = 100
            download_counts[org_repo] = 0

            github_org, repo = org_repo.split('/')

            while pages_remain:
                response = gh_client.get(
                    github_org,
                    repo,
                    endpoint='releases',
                    query_params={'per_page': per_page, 'page': page},
                )
                if response.status_code == 404:
                    LOGGER.debug(f'Skipping: {org_repo} because org/repo does not exist')
                    pages_remain = False
                    break

                release_data = response.json()
                link_header = response.headers.get('link')

                # Get download count
                for release_info in tqdm(
                    release_data, position=0, desc=f'{repo} releases, page={page}'
                ):
                    release_id = release_info.get('id')
                    tag_name = release_info.get('tag_name')
                    prerelease = release_info.get('prerelease')
                    created_at = release_info.get('created_at')
                    endpoint = f'releases/{release_id}'

                    timestamp = get_current_utc()
                    response = gh_client.get(github_org, repo, endpoint=endpoint)
                    data = response.json()
                    assets = data.get('assets')

                    tag_row = {
                        'ecosystem_name': [ecosystem_name],
                        'org_repo': [org_repo],
                        'timestamp': [timestamp],
                        'tag_name': [tag_name],
                        'prerelease': [prerelease],
                        'created_at': [created_at],
                        'download_count': 0,
                    }
                    if assets:
                        for asset in assets:
                            tag_row['download_count'] += asset.get('download_count', 0)

                    overall_df = append_row(overall_df, tag_row)

                # Check pagination
                if link_header and 'rel="next"' in link_header:
                    page += 1
                else:
                    break

    overall_df = drop_duplicates_by_date(
        overall_df,
        time_column=TIME_COLUMN,
        group_by_columns=['ecosystem_name', 'org_repo', 'tag_name'],
    )
    if verbose:
        LOGGER.info(f'{GITHUB_DOWNLOAD_COUNT_FILENAME} tail')
        LOGGER.info(overall_df.tail(5).to_string())

    overall_df.to_csv(GITHUB_DOWNLOAD_COUNT_FILENAME, index=False)

    if not dry_run:
        gfolder_path = f'{output_folder}/{GITHUB_DOWNLOAD_COUNT_FILENAME}'
        create_csv(output_path=gfolder_path, data=overall_df)