From 1f205b32d916f183a8e9ba0ecde84f197ddc0490 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 9 Nov 2025 14:44:14 +0000
Subject: [PATCH 1/3] Initial plan

From cdc3d3b71347505c2dcd81709747f49b2050ca20 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 9 Nov 2025 14:48:49 +0000
Subject: [PATCH 2/3] Implement robust data processing artifact handling

- Deploy workflow: Trigger data-processing and retry artifact download on failure
- Staging-aggregate workflow: Apply the same robust artifact handling
- Data-processing workflow: Fix build-resources branch to include all artifact files

Co-authored-by: LukasWallrich <60155545+LukasWallrich@users.noreply.github.com>
---
 .github/workflows/data-processing.yml    | 104 ++++++++++++++---------
 .github/workflows/deploy.yaml            |  71 +++++++++++----
 .github/workflows/staging-aggregate.yaml |  69 +++++++++++----
 3 files changed, 169 insertions(+), 75 deletions(-)
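
[Editor's note, not part of the commit: git am ignores text between the
diffstat and the first "diff --git" line, so this annotation does not affect
applying the series. The deploy.yaml and staging-aggregate.yaml hunks below
both adopt the same trigger-then-poll pattern. As a reading aid, here is a
minimal standalone sketch of that pattern; the workflow name, the 15-minute
budget, and the 30-second poll interval are taken from the hunks, while the
script itself is illustrative rather than the applied code.

    #!/usr/bin/env bash
    # Trigger the producer workflow, then poll its most recent run until it completes.
    set -euo pipefail

    gh workflow run data-processing.yml --ref master

    MAX_WAIT_TIME=900    # give up after 15 minutes
    POLL_INTERVAL=30     # check every 30 seconds
    ELAPSED=0

    while [ "$ELAPSED" -lt "$MAX_WAIT_TIME" ]; do
      STATUS=$(gh run list --workflow=data-processing.yml --limit=1 \
        --json status --jq '.[0].status')
      [ "$STATUS" = "completed" ] && break
      sleep "$POLL_INTERVAL"
      ELAPSED=$((ELAPSED + POLL_INTERVAL))
    done
]
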
diff --git a/.github/workflows/data-processing.yml b/.github/workflows/data-processing.yml
index 162ca2f433c..8b150db2133 100644
--- a/.github/workflows/data-processing.yml
+++ b/.github/workflows/data-processing.yml
@@ -324,16 +324,15 @@ jobs:
 
           # Store generated files in temp location
           mkdir -p /tmp/generated-resources
-          cp -r content/curated_resources /tmp/generated-resources/
-          cp content/contributors/tenzing.md /tmp/generated-resources/
 
-          # Only copy glossary if it was regenerated
-          if [ "${{ github.event.inputs.regenerate_glossary }}" = "true" ]; then
-            echo "✓ Glossary regeneration enabled, including glossary files"
-            cp -r content/glossary /tmp/generated-resources/
-          else
-            echo "ℹ️ Glossary regeneration skipped (use workflow_dispatch with regenerate_glossary=true to update)"
-          fi
+          # Copy all files that are part of the artifact
+          cp -r content/curated_resources /tmp/generated-resources/ 2>/dev/null || true
+          cp content/contributors/tenzing.md /tmp/generated-resources/ 2>/dev/null || true
+          cp -r content/glossary /tmp/generated-resources/ 2>/dev/null || true
+          cp -r data /tmp/generated-resources/ 2>/dev/null || true
+          cp -r content/contributor-analysis /tmp/generated-resources/ 2>/dev/null || true
+          mkdir -p /tmp/generated-resources/content/publications 2>/dev/null || true
+          cp content/publications/citation_chart.webp /tmp/generated-resources/content/publications/ 2>/dev/null || true
 
           # Fetch build-resources branch (create if doesn't exist)
           git fetch origin build-resources || echo "build-resources branch doesn't exist yet"
@@ -347,55 +346,78 @@ jobs:
             git checkout -b build-resources
           fi
 
-          # Remove old generated resource files (but keep _index.md)
+          # Remove old generated resource files (but keep _index.md and other important files)
           find content/curated_resources -type f ! -name '_index.md' -delete 2>/dev/null || true
+          find content/glossary -type f ! -name '_index.md' ! -name '_create_glossaries.py' -delete 2>/dev/null || true
+
+          # Copy newly generated files from temp location
+          # Curated resources
+          if [ -d /tmp/generated-resources/curated_resources ]; then
+            cp -r /tmp/generated-resources/curated_resources/* content/curated_resources/ 2>/dev/null || true
+          fi
+
+          # Tenzing contributor data
+          if [ -f /tmp/generated-resources/tenzing.md ]; then
+            cp /tmp/generated-resources/tenzing.md content/contributors/
+          fi
+
+          # Glossary files (always copy if they exist in temp)
+          if [ -d /tmp/generated-resources/glossary ]; then
+            rsync -av --exclude='_index.md' --exclude='_create_glossaries.py' /tmp/generated-resources/glossary/ content/glossary/ 2>/dev/null || true
+          fi
 
-          # Copy newly generated files
-          cp -r /tmp/generated-resources/curated_resources/* content/curated_resources/
-          cp /tmp/generated-resources/tenzing.md content/contributors/
+          # Data directory (GA data, etc.)
+          if [ -d /tmp/generated-resources/data ]; then
+            mkdir -p data
+            cp -r /tmp/generated-resources/data/* data/ 2>/dev/null || true
+          fi
+
+          # Contributor analysis
+          if [ -d /tmp/generated-resources/contributor-analysis ]; then
+            mkdir -p content/contributor-analysis
+            cp -r /tmp/generated-resources/contributor-analysis/* content/contributor-analysis/ 2>/dev/null || true
+          fi
 
-          # Copy glossary files only if regenerated (preserving directory structure)
-          if [ "${{ github.event.inputs.regenerate_glossary }}" = "true" ]; then
-            echo "✓ Updating glossary files in build-resources"
-            # Remove old glossary files (but keep _index.md files)
-            find content/glossary -type f ! -name '_index.md' ! -name '_create_glossaries.py' -delete 2>/dev/null || true
-            rsync -av --exclude='_index.md' --exclude='_create_glossaries.py' /tmp/generated-resources/glossary/ content/glossary/
+          # Citation chart
+          if [ -f /tmp/generated-resources/content/publications/citation_chart.webp ]; then
+            mkdir -p content/publications
+            cp /tmp/generated-resources/content/publications/citation_chart.webp content/publications/
           fi
 
+          # Add all changes (including untracked files)
+          git add -A content/curated_resources/ 2>/dev/null || true
+          git add -A content/contributors/tenzing.md 2>/dev/null || true
+          git add -A content/glossary/ 2>/dev/null || true
+          git add -A data/ 2>/dev/null || true
+          git add -A content/contributor-analysis/ 2>/dev/null || true
+          git add -A content/publications/citation_chart.webp 2>/dev/null || true
+
           # Check if there are any changes to commit
           if git diff --quiet && git diff --cached --quiet; then
            echo "ℹ️ No changes to commit"
           else
             echo "✓ Changes detected, committing..."
-            # Add files based on what was regenerated
-            git add content/curated_resources/ content/contributors/tenzing.md
+            # Show what will be committed
+            echo "Files to be committed:"
+            git diff --cached --name-only
+
+            # Commit with appropriate message
             if [ "${{ github.event.inputs.regenerate_glossary }}" = "true" ]; then
-              git add content/glossary/
               git commit -m "Update generated resources and glossary - $(date -u +'%Y-%m-%d %H:%M:%S UTC')" || echo "Nothing to commit"
             else
               git commit -m "Update generated resources - $(date -u +'%Y-%m-%d %H:%M:%S UTC')" || echo "Nothing to commit"
             fi
 
-            # Push to build-resources branch with retry logic
-            MAX_RETRIES=3
-            RETRY_COUNT=0
-            while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
-              if git push origin build-resources --force-with-lease; then
-                echo "✅ Successfully pushed to build-resources branch"
-                break
-              else
-                RETRY_COUNT=$((RETRY_COUNT + 1))
-                if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then
-                  echo "⚠️ Push failed, retrying ($RETRY_COUNT/$MAX_RETRIES)..."
-                  sleep 2
-                  git pull origin build-resources --rebase
-                else
-                  echo "❌ Push failed after $MAX_RETRIES attempts"
-                  exit 1
-                fi
-              fi
-            done
+            # Force push to build-resources branch
+            # Using --force instead of --force-with-lease because we want to ensure
+            # all artifact files are pushed regardless of remote state
+            if git push origin build-resources --force; then
+              echo "✅ Successfully force-pushed to build-resources branch"
+            else
+              echo "❌ Failed to push to build-resources branch"
+              exit 1
+            fi
           fi
 
           # Switch back to original branch
diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml
index 3ccb8b6854c..e56fed97916 100644
--- a/.github/workflows/deploy.yaml
+++ b/.github/workflows/deploy.yaml
@@ -59,30 +59,65 @@ jobs:
       # =======================
       # Fallback Data Processing
       # =======================
-      - name: Run data processing if needed
+      - name: Trigger data processing if artifact missing
         if: steps.download-artifact.outcome == 'failure'
-        env:
-          PYTHON_VERSION: "3.11"
-          GA_API_CREDENTIALS: ${{ secrets.GA_API_CREDENTIALS }}
-          GA_PROPERTY_ID: ${{ secrets.GA_PROPERTY_ID }}
         run: |
-          # Install Python dependencies
-          python3 -m pip install -r ./requirements.txt
-
-          # Generate data files
-          python3 scripts/forrt_contribs/tenzing.py
-          python3 content/resources/resource.py
-          mv scripts/forrt_contribs/tenzing.md content/contributors/tenzing.md
+          echo "⚠️ Data artifact not found, triggering data-processing workflow..."
+          gh workflow run data-processing.yml --ref master
+          echo "✅ Data processing workflow triggered"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-          # Download GA data if possible
-          if [ "${{ github.event_name }}" != 'pull_request' ]; then
-            python scripts/download_ga_data.py
+      - name: Wait for data processing to complete
+        if: steps.download-artifact.outcome == 'failure'
+        run: |
+          echo "⏳ Waiting for data processing workflow to complete..."
+
+          # Wait up to 15 minutes for the workflow to complete
+          MAX_WAIT_TIME=900  # 15 minutes in seconds
+          POLL_INTERVAL=30   # Check every 30 seconds
+          ELAPSED=0
+
+          while [ $ELAPSED -lt $MAX_WAIT_TIME ]; do
+            # Get the most recent data-processing workflow run
+            RUN_STATUS=$(gh run list --workflow=data-processing.yml --limit=1 --json status --jq '.[0].status')
 
-          # Quick validation of GA data structure
-          if [ -f "data/ga_data.json" ]; then
-            python3 -c "import json; data = json.load(open('data/ga_data.json')); print('✅ GA data:', len(data.get('regions', [])), 'countries,', len(data.get('top_pages', [])), 'pages')"
+            if [ "$RUN_STATUS" = "completed" ]; then
+              echo "✅ Data processing workflow completed"
+              break
             fi
+
+            echo "⏳ Still processing... (${ELAPSED}s elapsed)"
+            sleep $POLL_INTERVAL
+            ELAPSED=$((ELAPSED + POLL_INTERVAL))
+          done
+
+          if [ $ELAPSED -ge $MAX_WAIT_TIME ]; then
+            echo "⚠️ Data processing workflow did not complete within timeout"
+          fi
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Retry downloading data artifact
+        if: steps.download-artifact.outcome == 'failure'
+        id: retry-download-artifact
+        uses: dawidd6/action-download-artifact@07ab29fd4a977ae4d2b275087cf67563dfdf0295
+        continue-on-error: true  # added in editing: lets the verify step below run and report the failure
+        with:
+          workflow: data-processing.yml
+          name: data-artifact
+          path: .
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Verify artifact availability
+        if: steps.download-artifact.outcome == 'failure'
+        run: |
+          if [ "${{ steps.retry-download-artifact.outcome }}" = "failure" ]; then
+            echo "❌ Failed to download data artifact even after triggering data processing"
+            echo "This indicates a critical issue with the data-processing workflow"
+            exit 1
          fi
+          echo "✅ Data artifact successfully downloaded after retry"
 
       # =======================
diff --git a/.github/workflows/staging-aggregate.yaml b/.github/workflows/staging-aggregate.yaml
index 35fe33ae73a..d50b030e0ab 100644
--- a/.github/workflows/staging-aggregate.yaml
+++ b/.github/workflows/staging-aggregate.yaml
@@ -202,28 +202,65 @@ jobs:
         path: .
         github_token: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Run data processing if needed
+      - name: Trigger data processing if artifact missing
         if: steps.download-artifact.outcome == 'failure'
+        run: |
+          echo "⚠️ Data artifact not found, triggering data-processing workflow..."
+          gh workflow run data-processing.yml --ref master
+          echo "✅ Data processing workflow triggered"
         env:
-          PYTHON_VERSION: "3.11"
-          GA_API_CREDENTIALS: ${{ secrets.GA_API_CREDENTIALS }}
-          GA_PROPERTY_ID: ${{ secrets.GA_PROPERTY_ID }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Wait for data processing to complete
+        if: steps.download-artifact.outcome == 'failure'
         run: |
-          # Install Python dependencies
-          python3 -m pip install -r ./requirements.txt
+          echo "⏳ Waiting for data processing workflow to complete..."
+
+          # Wait up to 15 minutes for the workflow to complete
+          MAX_WAIT_TIME=900  # 15 minutes in seconds
+          POLL_INTERVAL=30   # Check every 30 seconds
+          ELAPSED=0
+
+          while [ $ELAPSED -lt $MAX_WAIT_TIME ]; do
+            # Get the most recent data-processing workflow run
+            RUN_STATUS=$(gh run list --workflow=data-processing.yml --limit=1 --json status --jq '.[0].status')
+
+            if [ "$RUN_STATUS" = "completed" ]; then
+              echo "✅ Data processing workflow completed"
+              break
+            fi
+
+            echo "⏳ Still processing... (${ELAPSED}s elapsed)"
+            sleep $POLL_INTERVAL
+            ELAPSED=$((ELAPSED + POLL_INTERVAL))
+          done
+
+          if [ $ELAPSED -ge $MAX_WAIT_TIME ]; then
+            echo "⚠️ Data processing workflow did not complete within timeout"
+          fi
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-          # Generate data files
-          python3 scripts/forrt_contribs/tenzing.py
-          python3 content/resources/resource.py
-          mv scripts/forrt_contribs/tenzing.md content/contributors/tenzing.md
+      - name: Retry downloading data artifact
+        if: steps.download-artifact.outcome == 'failure'
+        id: retry-download-artifact
+        uses: dawidd6/action-download-artifact@07ab29fd4a977ae4d2b275087cf67563dfdf0295
+        continue-on-error: true  # added in editing: lets the verify step below run and report the failure
+        with:
+          workflow: data-processing.yml
+          name: data-artifact
+          path: .
+          github_token: ${{ secrets.GITHUB_TOKEN }}
 
-          # Download GA data if possible
-          python scripts/download_ga_data.py
-
-          # Quick validation of GA data structure
-          if [ -f "data/ga_data.json" ]; then
-            python3 -c "import json; data = json.load(open('data/ga_data.json')); print('✅ GA data:', len(data.get('regions', [])), 'countries,', len(data.get('top_pages', [])), 'pages')"
+      - name: Verify artifact availability
+        if: steps.download-artifact.outcome == 'failure'
+        run: |
+          if [ "${{ steps.retry-download-artifact.outcome }}" = "failure" ]; then
+            echo "❌ Failed to download data artifact even after triggering data processing"
+            echo "This indicates a critical issue with the data-processing workflow"
+            exit 1
           fi
+          echo "✅ Data artifact successfully downloaded after retry"
 
       - name: Setup Hugo
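
[Editor's note, trailing text like this is ignored by git am. One fix was made
while editing this patch: both "Retry downloading data artifact" steps gained
`continue-on-error: true`. Without it, a failed retry fails the job immediately
and the "Verify artifact availability" step is skipped, because a step whose
`if:` contains no status-check function only runs while the job is still
succeeding; the outcome check could therefore never fire. With the flag, the
step records outcome "failure" while the job keeps running, and the verify step
can exit 1 with the clearer message. A minimal illustration of the pattern
(step id and commands are illustrative only):

    steps:
      - id: fetch
        continue-on-error: true   # record the outcome, do not abort the job
        run: exit 1               # stands in for a failing download
      - run: |
          # runs even though fetch failed, and sees outcome "failure"
          echo "fetch outcome: ${{ steps.fetch.outcome }}"
]
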
From a3a94126ef429b962081787996e6377efa89fcb0 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 9 Nov 2025 14:50:49 +0000
Subject: [PATCH 3/3] Improve workflow wait logic to handle race conditions

- Add 10s delay before polling to ensure triggered workflow appears
- Check both status and conclusion for better reporting
- Show workflow status during polling for better visibility

Co-authored-by: LukasWallrich <60155545+LukasWallrich@users.noreply.github.com>
---
 .github/workflows/deploy.yaml            | 15 +++++++++++----
 .github/workflows/staging-aggregate.yaml | 15 +++++++++++----
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml
index e56fed97916..6ee4b30385e 100644
--- a/.github/workflows/deploy.yaml
+++ b/.github/workflows/deploy.yaml
@@ -73,21 +73,28 @@ jobs:
         run: |
           echo "⏳ Waiting for data processing workflow to complete..."
 
+          # Small delay to ensure the workflow appears in the list
+          sleep 10
+
           # Wait up to 15 minutes for the workflow to complete
           MAX_WAIT_TIME=900  # 15 minutes in seconds
           POLL_INTERVAL=30   # Check every 30 seconds
           ELAPSED=0
 
           while [ $ELAPSED -lt $MAX_WAIT_TIME ]; do
-            # Get the most recent data-processing workflow run
-            RUN_STATUS=$(gh run list --workflow=data-processing.yml --limit=1 --json status --jq '.[0].status')
+            # Get the most recent data-processing workflow run status
+            # We check the most recent run - if it's queued or in_progress, we wait
+            # If it's completed, we're done
+            RUN_INFO=$(gh run list --workflow=data-processing.yml --limit=1 --json status,conclusion)
+            RUN_STATUS=$(echo "$RUN_INFO" | jq -r '.[0].status')
 
             if [ "$RUN_STATUS" = "completed" ]; then
-              echo "✅ Data processing workflow completed"
+              RUN_CONCLUSION=$(echo "$RUN_INFO" | jq -r '.[0].conclusion')
+              echo "✅ Data processing workflow completed with conclusion: $RUN_CONCLUSION"
               break
             fi
 
-            echo "⏳ Still processing... (${ELAPSED}s elapsed)"
+            echo "⏳ Still processing (status: $RUN_STATUS)... (${ELAPSED}s elapsed)"
             sleep $POLL_INTERVAL
             ELAPSED=$((ELAPSED + POLL_INTERVAL))
           done
diff --git a/.github/workflows/staging-aggregate.yaml b/.github/workflows/staging-aggregate.yaml
index d50b030e0ab..b9a005ad9ca 100644
--- a/.github/workflows/staging-aggregate.yaml
+++ b/.github/workflows/staging-aggregate.yaml
@@ -216,21 +216,28 @@ jobs:
         run: |
           echo "⏳ Waiting for data processing workflow to complete..."
 
+          # Small delay to ensure the workflow appears in the list
+          sleep 10
+
           # Wait up to 15 minutes for the workflow to complete
           MAX_WAIT_TIME=900  # 15 minutes in seconds
           POLL_INTERVAL=30   # Check every 30 seconds
           ELAPSED=0
 
           while [ $ELAPSED -lt $MAX_WAIT_TIME ]; do
-            # Get the most recent data-processing workflow run
-            RUN_STATUS=$(gh run list --workflow=data-processing.yml --limit=1 --json status --jq '.[0].status')
+            # Get the most recent data-processing workflow run status
+            # We check the most recent run - if it's queued or in_progress, we wait
+            # If it's completed, we're done
+            RUN_INFO=$(gh run list --workflow=data-processing.yml --limit=1 --json status,conclusion)
+            RUN_STATUS=$(echo "$RUN_INFO" | jq -r '.[0].status')
 
             if [ "$RUN_STATUS" = "completed" ]; then
-              echo "✅ Data processing workflow completed"
+              RUN_CONCLUSION=$(echo "$RUN_INFO" | jq -r '.[0].conclusion')
+              echo "✅ Data processing workflow completed with conclusion: $RUN_CONCLUSION"
               break
             fi
 
-            echo "⏳ Still processing... (${ELAPSED}s elapsed)"
+            echo "⏳ Still processing (status: $RUN_STATUS)... (${ELAPSED}s elapsed)"
             sleep $POLL_INTERVAL
             ELAPSED=$((ELAPSED + POLL_INTERVAL))
           done
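
[Editor's note, trailing text ignored by git am: even with the 10s delay,
polling "the most recent run" can still race if an unrelated run of the same
workflow was already queued. An alternative this series does not adopt is to
pin the run id right after triggering and watch that specific run; a hedged
sketch, assuming current gh CLI flags:

    gh workflow run data-processing.yml --ref master
    sleep 10   # let the dispatched run register
    RUN_ID=$(gh run list --workflow=data-processing.yml --limit=1 \
      --json databaseId --jq '.[0].databaseId')
    gh run watch "$RUN_ID" --exit-status   # non-zero exit if the run fails
]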