From 1f205b32d916f183a8e9ba0ecde84f197ddc0490 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 9 Nov 2025 14:44:14 +0000
Subject: [PATCH 1/3] Initial plan

From cdc3d3b71347505c2dcd81709747f49b2050ca20 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 9 Nov 2025 14:48:49 +0000
Subject: [PATCH 2/3] Implement robust data processing artifact handling

- Deploy workflow: Trigger data-processing and retry artifact download on failure
- Staging-aggregate workflow: Apply the same robust artifact handling
- Data-processing workflow: Fix build-resources branch to include all artifact files

Co-authored-by: LukasWallrich <60155545+LukasWallrich@users.noreply.github.com>
---
 .github/workflows/data-processing.yml    | 104 ++++++++++++++---------
 .github/workflows/deploy.yaml            |  71 +++++++++++----
 .github/workflows/staging-aggregate.yaml |  69 +++++++++++----
 3 files changed, 169 insertions(+), 75 deletions(-)
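
[Editor's note, not part of the commit: git am ignores text between the
diffstat and the first "diff --git" line, so this annotation does not affect
applying the series. The deploy.yaml and staging-aggregate.yaml hunks below
both adopt the same trigger-then-poll pattern. As a reading aid, here is a
minimal standalone sketch of that pattern; the workflow name, the 15-minute
budget, and the 30-second poll interval are taken from the hunks, while the
script itself is illustrative rather than the applied code.

    #!/usr/bin/env bash
    # Trigger the producer workflow, then poll its most recent run until it completes.
    set -euo pipefail

    gh workflow run data-processing.yml --ref master

    MAX_WAIT_TIME=900    # give up after 15 minutes
    POLL_INTERVAL=30     # check every 30 seconds
    ELAPSED=0

    while [ "$ELAPSED" -lt "$MAX_WAIT_TIME" ]; do
      STATUS=$(gh run list --workflow=data-processing.yml --limit=1 \
        --json status --jq '.[0].status')
      [ "$STATUS" = "completed" ] && break
      sleep "$POLL_INTERVAL"
      ELAPSED=$((ELAPSED + POLL_INTERVAL))
    done
]
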
diff --git a/.github/workflows/data-processing.yml b/.github/workflows/data-processing.yml
index 162ca2f433c..8b150db2133 100644
--- a/.github/workflows/data-processing.yml
+++ b/.github/workflows/data-processing.yml
@@ -324,16 +324,15 @@ jobs:
 
           # Store generated files in temp location
           mkdir -p /tmp/generated-resources
-          cp -r content/curated_resources /tmp/generated-resources/
-          cp content/contributors/tenzing.md /tmp/generated-resources/
 
-          # Only copy glossary if it was regenerated
-          if [ "${{ github.event.inputs.regenerate_glossary }}" = "true" ]; then
-            echo "✓ Glossary regeneration enabled, including glossary files"
-            cp -r content/glossary /tmp/generated-resources/
-          else
-            echo "ℹ️ Glossary regeneration skipped (use workflow_dispatch with regenerate_glossary=true to update)"
-          fi
+          # Copy all files that are part of the artifact
+          cp -r content/curated_resources /tmp/generated-resources/ 2>/dev/null || true
+          cp content/contributors/tenzing.md /tmp/generated-resources/ 2>/dev/null || true
+          cp -r content/glossary /tmp/generated-resources/ 2>/dev/null || true
+          cp -r data /tmp/generated-resources/ 2>/dev/null || true
+          cp -r content/contributor-analysis /tmp/generated-resources/ 2>/dev/null || true
+          mkdir -p /tmp/generated-resources/content/publications 2>/dev/null || true
+          cp content/publications/citation_chart.webp /tmp/generated-resources/content/publications/ 2>/dev/null || true
 
           # Fetch build-resources branch (create if doesn't exist)
           git fetch origin build-resources || echo "build-resources branch doesn't exist yet"
@@ -347,55 +346,78 @@ jobs:
             git checkout -b build-resources
           fi
 
-          # Remove old generated resource files (but keep _index.md)
+          # Remove old generated resource files (but keep _index.md and other important files)
           find content/curated_resources -type f ! -name '_index.md' -delete 2>/dev/null || true
+          find content/glossary -type f ! -name '_index.md' ! -name '_create_glossaries.py' -delete 2>/dev/null || true
+
+          # Copy newly generated files from temp location
+          # Curated resources
+          if [ -d /tmp/generated-resources/curated_resources ]; then
+            cp -r /tmp/generated-resources/curated_resources/* content/curated_resources/ 2>/dev/null || true
+          fi
+
+          # Tenzing contributor data
+          if [ -f /tmp/generated-resources/tenzing.md ]; then
+            cp /tmp/generated-resources/tenzing.md content/contributors/
+          fi
+
+          # Glossary files (always copy if they exist in temp)
+          if [ -d /tmp/generated-resources/glossary ]; then
+            rsync -av --exclude='_index.md' --exclude='_create_glossaries.py' /tmp/generated-resources/glossary/ content/glossary/ 2>/dev/null || true
+          fi
 
-          # Copy newly generated files
-          cp -r /tmp/generated-resources/curated_resources/* content/curated_resources/
-          cp /tmp/generated-resources/tenzing.md content/contributors/
+          # Data directory (GA data, etc.)
+          if [ -d /tmp/generated-resources/data ]; then
+            mkdir -p data
+            cp -r /tmp/generated-resources/data/* data/ 2>/dev/null || true
+          fi
+
+          # Contributor analysis
+          if [ -d /tmp/generated-resources/contributor-analysis ]; then
+            mkdir -p content/contributor-analysis
+            cp -r /tmp/generated-resources/contributor-analysis/* content/contributor-analysis/ 2>/dev/null || true
+          fi
 
-          # Copy glossary files only if regenerated (preserving directory structure)
-          if [ "${{ github.event.inputs.regenerate_glossary }}" = "true" ]; then
-            echo "✓ Updating glossary files in build-resources"
-            # Remove old glossary files (but keep _index.md files)
-            find content/glossary -type f ! -name '_index.md' ! -name '_create_glossaries.py' -delete 2>/dev/null || true
-            rsync -av --exclude='_index.md' --exclude='_create_glossaries.py' /tmp/generated-resources/glossary/ content/glossary/
+          # Citation chart
+          if [ -f /tmp/generated-resources/content/publications/citation_chart.webp ]; then
+            mkdir -p content/publications
+            cp /tmp/generated-resources/content/publications/citation_chart.webp content/publications/
           fi
 
+          # Add all changes (including untracked files)
+          git add -A content/curated_resources/ 2>/dev/null || true
+          git add -A content/contributors/tenzing.md 2>/dev/null || true
+          git add -A content/glossary/ 2>/dev/null || true
+          git add -A data/ 2>/dev/null || true
+          git add -A content/contributor-analysis/ 2>/dev/null || true
+          git add -A content/publications/citation_chart.webp 2>/dev/null || true
+
           # Check if there are any changes to commit
           if git diff --quiet && git diff --cached --quiet; then
            echo "ℹ️ No changes to commit"
           else
             echo "✓ Changes detected, committing..."
-            # Add files based on what was regenerated
-            git add content/curated_resources/ content/contributors/tenzing.md
+            # Show what will be committed
+            echo "Files to be committed:"
+            git diff --cached --name-only
+
+            # Commit with appropriate message
             if [ "${{ github.event.inputs.regenerate_glossary }}" = "true" ]; then
-              git add content/glossary/
               git commit -m "Update generated resources and glossary - $(date -u +'%Y-%m-%d %H:%M:%S UTC')" || echo "Nothing to commit"
             else
               git commit -m "Update generated resources - $(date -u +'%Y-%m-%d %H:%M:%S UTC')" || echo "Nothing to commit"
             fi
 
-            # Push to build-resources branch with retry logic
-            MAX_RETRIES=3
-            RETRY_COUNT=0
-            while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
-              if git push origin build-resources --force-with-lease; then
-                echo "✅ Successfully pushed to build-resources branch"
-                break
-              else
-                RETRY_COUNT=$((RETRY_COUNT + 1))
-                if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then
-                  echo "⚠️ Push failed, retrying ($RETRY_COUNT/$MAX_RETRIES)..."
-                  sleep 2
-                  git pull origin build-resources --rebase
-                else
-                  echo "❌ Push failed after $MAX_RETRIES attempts"
-                  exit 1
-                fi
-              fi
-            done
+            # Force push to build-resources branch
+            # Using --force instead of --force-with-lease because we want to ensure
+            # all artifact files are pushed regardless of remote state
+            if git push origin build-resources --force; then
+              echo "✅ Successfully force-pushed to build-resources branch"
+            else
+              echo "❌ Failed to push to build-resources branch"
+              exit 1
+            fi
           fi
 
           # Switch back to original branch
diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml
index 3ccb8b6854c..e56fed97916 100644
--- a/.github/workflows/deploy.yaml
+++ b/.github/workflows/deploy.yaml
@@ -59,30 +59,65 @@ jobs:
       # =======================
       # Fallback Data Processing
       # =======================
-      - name: Run data processing if needed
+      - name: Trigger data processing if artifact missing
         if: steps.download-artifact.outcome == 'failure'
-        env:
-          PYTHON_VERSION: "3.11"
-          GA_API_CREDENTIALS: ${{ secrets.GA_API_CREDENTIALS }}
-          GA_PROPERTY_ID: ${{ secrets.GA_PROPERTY_ID }}
         run: |
-          # Install Python dependencies
-          python3 -m pip install -r ./requirements.txt
-
-          # Generate data files
-          python3 scripts/forrt_contribs/tenzing.py
-          python3 content/resources/resource.py
-          mv scripts/forrt_contribs/tenzing.md content/contributors/tenzing.md
+          echo "⚠️ Data artifact not found, triggering data-processing workflow..."
+          gh workflow run data-processing.yml --ref master
+          echo "✅ Data processing workflow triggered"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-          # Download GA data if possible
-          if [ "${{ github.event_name }}" != 'pull_request' ]; then
-            python scripts/download_ga_data.py
+      - name: Wait for data processing to complete
+        if: steps.download-artifact.outcome == 'failure'
+        run: |
+          echo "⏳ Waiting for data processing workflow to complete..."
+
+          # Wait up to 15 minutes for the workflow to complete
+          MAX_WAIT_TIME=900  # 15 minutes in seconds
+          POLL_INTERVAL=30   # Check every 30 seconds
+          ELAPSED=0
+
+          while [ $ELAPSED -lt $MAX_WAIT_TIME ]; do
+            # Get the most recent data-processing workflow run
+            RUN_STATUS=$(gh run list --workflow=data-processing.yml --limit=1 --json status --jq '.[0].status')
 
-          # Quick validation of GA data structure
-          if [ -f "data/ga_data.json" ]; then
-            python3 -c "import json; data = json.load(open('data/ga_data.json')); print('✅ GA data:', len(data.get('regions', [])), 'countries,', len(data.get('top_pages', [])), 'pages')"
+            if [ "$RUN_STATUS" = "completed" ]; then
+              echo "✅ Data processing workflow completed"
+              break
             fi
+
+            echo "⏳ Still processing... (${ELAPSED}s elapsed)"
+            sleep $POLL_INTERVAL
+            ELAPSED=$((ELAPSED + POLL_INTERVAL))
+          done
+
+          if [ $ELAPSED -ge $MAX_WAIT_TIME ]; then
+            echo "⚠️ Data processing workflow did not complete within timeout"
+          fi
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Retry downloading data artifact
+        if: steps.download-artifact.outcome == 'failure'
+        id: retry-download-artifact
+        uses: dawidd6/action-download-artifact@07ab29fd4a977ae4d2b275087cf67563dfdf0295
+        continue-on-error: true  # added in editing: lets the verify step below run and report the failure
+        with:
+          workflow: data-processing.yml
+          name: data-artifact
+          path: .
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Verify artifact availability
+        if: steps.download-artifact.outcome == 'failure'
+        run: |
+          if [ "${{ steps.retry-download-artifact.outcome }}" = "failure" ]; then
+            echo "❌ Failed to download data artifact even after triggering data processing"
+            echo "This indicates a critical issue with the data-processing workflow"
+            exit 1
          fi
+          echo "✅ Data artifact successfully downloaded after retry"
 
       # =======================
diff --git a/.github/workflows/staging-aggregate.yaml b/.github/workflows/staging-aggregate.yaml
index 35fe33ae73a..d50b030e0ab 100644
--- a/.github/workflows/staging-aggregate.yaml
+++ b/.github/workflows/staging-aggregate.yaml
@@ -202,28 +202,65 @@ jobs:
         path: .
         github_token: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Run data processing if needed
+      - name: Trigger data processing if artifact missing
         if: steps.download-artifact.outcome == 'failure'
+        run: |
+          echo "⚠️ Data artifact not found, triggering data-processing workflow..."
+          gh workflow run data-processing.yml --ref master
+          echo "✅ Data processing workflow triggered"
         env:
-          PYTHON_VERSION: "3.11"
-          GA_API_CREDENTIALS: ${{ secrets.GA_API_CREDENTIALS }}
-          GA_PROPERTY_ID: ${{ secrets.GA_PROPERTY_ID }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Wait for data processing to complete
+        if: steps.download-artifact.outcome == 'failure'
         run: |
-          # Install Python dependencies
-          python3 -m pip install -r ./requirements.txt
+          echo "⏳ Waiting for data processing workflow to complete..."
+
+          # Wait up to 15 minutes for the workflow to complete
+          MAX_WAIT_TIME=900  # 15 minutes in seconds
+          POLL_INTERVAL=30   # Check every 30 seconds
+          ELAPSED=0
+
+          while [ $ELAPSED -lt $MAX_WAIT_TIME ]; do
+            # Get the most recent data-processing workflow run
+            RUN_STATUS=$(gh run list --workflow=data-processing.yml --limit=1 --json status --jq '.[0].status')
+
+            if [ "$RUN_STATUS" = "completed" ]; then
+              echo "✅ Data processing workflow completed"
+              break
+            fi
+
+            echo "⏳ Still processing... (${ELAPSED}s elapsed)"
+            sleep $POLL_INTERVAL
+            ELAPSED=$((ELAPSED + POLL_INTERVAL))
+          done
+
+          if [ $ELAPSED -ge $MAX_WAIT_TIME ]; then
+            echo "⚠️ Data processing workflow did not complete within timeout"
+          fi
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-          # Generate data files
-          python3 scripts/forrt_contribs/tenzing.py
-          python3 content/resources/resource.py
-          mv scripts/forrt_contribs/tenzing.md content/contributors/tenzing.md
+      - name: Retry downloading data artifact
+        if: steps.download-artifact.outcome == 'failure'
+        id: retry-download-artifact
+        uses: dawidd6/action-download-artifact@07ab29fd4a977ae4d2b275087cf67563dfdf0295
+        continue-on-error: true  # added in editing: lets the verify step below run and report the failure
+        with:
+          workflow: data-processing.yml
+          name: data-artifact
+          path: .
+          github_token: ${{ secrets.GITHUB_TOKEN }}
 
-          # Download GA data if possible
-          python scripts/download_ga_data.py
-
-          # Quick validation of GA data structure
-          if [ -f "data/ga_data.json" ]; then
-            python3 -c "import json; data = json.load(open('data/ga_data.json')); print('✅ GA data:', len(data.get('regions', [])), 'countries,', len(data.get('top_pages', [])), 'pages')"
+      - name: Verify artifact availability
+        if: steps.download-artifact.outcome == 'failure'
+        run: |
+          if [ "${{ steps.retry-download-artifact.outcome }}" = "failure" ]; then
+            echo "❌ Failed to download data artifact even after triggering data processing"
+            echo "This indicates a critical issue with the data-processing workflow"
+            exit 1
           fi
+          echo "✅ Data artifact successfully downloaded after retry"
 
       - name: Setup Hugo
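
[Editor's note, trailing text like this is ignored by git am. One fix was made
while editing this patch: both "Retry downloading data artifact" steps gained
`continue-on-error: true`. Without it, a failed retry fails the job immediately
and the "Verify artifact availability" step is skipped, because a step whose
`if:` contains no status-check function only runs while the job is still
succeeding; the outcome check could therefore never fire. With the flag, the
step records outcome "failure" while the job keeps running, and the verify step
can exit 1 with the clearer message. A minimal illustration of the pattern
(step id and commands are illustrative only):

    steps:
      - id: fetch
        continue-on-error: true   # record the outcome, do not abort the job
        run: exit 1               # stands in for a failing download
      - run: |
          # runs even though fetch failed, and sees outcome "failure"
          echo "fetch outcome: ${{ steps.fetch.outcome }}"
]
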
From a3a94126ef429b962081787996e6377efa89fcb0 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 9 Nov 2025 14:50:49 +0000
Subject: [PATCH 3/3] Improve workflow wait logic to handle race conditions

- Add 10s delay before polling to ensure triggered workflow appears
- Check both status and conclusion for better reporting
- Show workflow status during polling for better visibility

Co-authored-by: LukasWallrich <60155545+LukasWallrich@users.noreply.github.com>
---
 .github/workflows/deploy.yaml            | 15 +++++++++++----
 .github/workflows/staging-aggregate.yaml | 15 +++++++++++----
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml
index e56fed97916..6ee4b30385e 100644
--- a/.github/workflows/deploy.yaml
+++ b/.github/workflows/deploy.yaml
@@ -73,21 +73,28 @@ jobs:
         run: |
           echo "⏳ Waiting for data processing workflow to complete..."
 
+          # Small delay to ensure the workflow appears in the list
+          sleep 10
+
           # Wait up to 15 minutes for the workflow to complete
           MAX_WAIT_TIME=900  # 15 minutes in seconds
           POLL_INTERVAL=30   # Check every 30 seconds
           ELAPSED=0
 
           while [ $ELAPSED -lt $MAX_WAIT_TIME ]; do
-            # Get the most recent data-processing workflow run
-            RUN_STATUS=$(gh run list --workflow=data-processing.yml --limit=1 --json status --jq '.[0].status')
+            # Get the most recent data-processing workflow run status
+            # We check the most recent run - if it's queued or in_progress, we wait
+            # If it's completed, we're done
+            RUN_INFO=$(gh run list --workflow=data-processing.yml --limit=1 --json status,conclusion)
+            RUN_STATUS=$(echo "$RUN_INFO" | jq -r '.[0].status')
 
             if [ "$RUN_STATUS" = "completed" ]; then
-              echo "✅ Data processing workflow completed"
+              RUN_CONCLUSION=$(echo "$RUN_INFO" | jq -r '.[0].conclusion')
+              echo "✅ Data processing workflow completed with conclusion: $RUN_CONCLUSION"
               break
             fi
 
-            echo "⏳ Still processing... (${ELAPSED}s elapsed)"
+            echo "⏳ Still processing (status: $RUN_STATUS)... (${ELAPSED}s elapsed)"
             sleep $POLL_INTERVAL
             ELAPSED=$((ELAPSED + POLL_INTERVAL))
           done
diff --git a/.github/workflows/staging-aggregate.yaml b/.github/workflows/staging-aggregate.yaml
index d50b030e0ab..b9a005ad9ca 100644
--- a/.github/workflows/staging-aggregate.yaml
+++ b/.github/workflows/staging-aggregate.yaml
@@ -216,21 +216,28 @@ jobs:
         run: |
           echo "⏳ Waiting for data processing workflow to complete..."
 
+          # Small delay to ensure the workflow appears in the list
+          sleep 10
+
           # Wait up to 15 minutes for the workflow to complete
           MAX_WAIT_TIME=900  # 15 minutes in seconds
           POLL_INTERVAL=30   # Check every 30 seconds
           ELAPSED=0
 
           while [ $ELAPSED -lt $MAX_WAIT_TIME ]; do
-            # Get the most recent data-processing workflow run
-            RUN_STATUS=$(gh run list --workflow=data-processing.yml --limit=1 --json status --jq '.[0].status')
+            # Get the most recent data-processing workflow run status
+            # We check the most recent run - if it's queued or in_progress, we wait
+            # If it's completed, we're done
+            RUN_INFO=$(gh run list --workflow=data-processing.yml --limit=1 --json status,conclusion)
+            RUN_STATUS=$(echo "$RUN_INFO" | jq -r '.[0].status')
 
             if [ "$RUN_STATUS" = "completed" ]; then
-              echo "✅ Data processing workflow completed"
+              RUN_CONCLUSION=$(echo "$RUN_INFO" | jq -r '.[0].conclusion')
+              echo "✅ Data processing workflow completed with conclusion: $RUN_CONCLUSION"
               break
             fi
 
-            echo "⏳ Still processing... (${ELAPSED}s elapsed)"
+            echo "⏳ Still processing (status: $RUN_STATUS)... (${ELAPSED}s elapsed)"
             sleep $POLL_INTERVAL
             ELAPSED=$((ELAPSED + POLL_INTERVAL))
           done
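
[Editor's note, trailing text ignored by git am: even with the 10s delay,
polling "the most recent run" can still race if an unrelated run of the same
workflow was already queued. An alternative this series does not adopt is to
pin the run id right after triggering and watch that specific run; a hedged
sketch, assuming current gh CLI flags:

    gh workflow run data-processing.yml --ref master
    sleep 10   # let the dispatched run register
    RUN_ID=$(gh run list --workflow=data-processing.yml --limit=1 \
      --json databaseId --jq '.[0].databaseId')
    gh run watch "$RUN_ID" --exit-status   # non-zero exit if the run fails
]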