From 1a5e2da33ae7dcf30056cf06fbffb76b2dd5b7d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Benjamin=20Egelund-M=C3=BCller?= <b@egelund-muller.com>
Date: Thu, 8 Jan 2026 12:41:04 +0100
Subject: [PATCH 1/2] Script for scraping resource YAML files from Rill Cloud

---
 .../resource-yaml-training-data/.gitignore    |  1 +
 scripts/resource-yaml-training-data/README.md | 20 +++++
 scripts/resource-yaml-training-data/build.py  | 89 +++++++++++++++++++
 3 files changed, 110 insertions(+)
 create mode 100644 scripts/resource-yaml-training-data/.gitignore
 create mode 100644 scripts/resource-yaml-training-data/README.md
 create mode 100644 scripts/resource-yaml-training-data/build.py
diff --git a/scripts/resource-yaml-training-data/.gitignore b/scripts/resource-yaml-training-data/.gitignore
new file mode 100644
index 00000000000..6caf68aff42
--- /dev/null
+++ b/scripts/resource-yaml-training-data/.gitignore
@@ -0,0 +1 @@
+output
\ No newline at end of file
diff --git a/scripts/resource-yaml-training-data/README.md b/scripts/resource-yaml-training-data/README.md
new file mode 100644
index 00000000000..c652c361935
--- /dev/null
+++ b/scripts/resource-yaml-training-data/README.md
@@ -0,0 +1,20 @@
+# Resource YAML Training Data
+
+Scrapes real-world Rill project resource YAML files from Rill Cloud for use as LLM input/training data.
+Saves the output to `./scripts/resource-yaml-training-data/output/<type>.txt`.
+
+## What it does
+
+1. Uses `rill sudo project dump-resources --include-files` to fetch all resources of each type from Rill Cloud
+2. Saves the raw JSON dumps to `output/<type>.json`
+3. Formats the raw JSON dumps into a unified file of original YAML resources at `output/<type>.txt`
+
+## Prerequisites
+
+- Rill CLI installed and authenticated with admin access (`rill sudo` permissions)
+
+## Usage
+
+```bash
+uv run ./scripts/resource-yaml-training-data/build.py
+```
diff --git a/scripts/resource-yaml-training-data/build.py b/scripts/resource-yaml-training-data/build.py
new file mode 100644
index 00000000000..3875e3fa706
--- /dev/null
+++ b/scripts/resource-yaml-training-data/build.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Scrapes real-world Rill project resource YAML files from Rill Cloud and formats
+them into text files suitable for LLM training data.
+"""
+
+import json
+import os
+import subprocess
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+OUTPUT_DIR = os.path.join(SCRIPT_DIR, "output")
+RESOURCE_TYPES = ["connector", "model", "metrics_view", "explore", "canvas"]
+
+
+def main():
+    """Prepare the output directory, then dump and format each resource type."""
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+    for kind in RESOURCE_TYPES:
+        scrape_resources(kind)
+        format_resources(kind)
+
+
+def scrape_resources(resource_type):
+    """Dump every resource of one type from Rill Cloud into a JSON file.
+
+    Runs `rill sudo project dump-resources --include-files --type <type>` and
+    redirects its stdout to `<OUTPUT_DIR>/<resource_type>.json`. Because the
+    command runs with `check=True`, a non-zero exit status raises
+    `subprocess.CalledProcessError`.
+    """
+    destination = os.path.join(OUTPUT_DIR, f"{resource_type}.json")
+    command = ["rill", "sudo", "project", "dump-resources", "--include-files"]
+    command += ["--type", resource_type]
+
+    # The file is opened (and truncated) before the CLI starts, so a failed run
+    # can leave an empty or partial dump behind.
+    with open(destination, "w") as sink:
+        subprocess.run(command, stdout=sink, check=True)
+
+    print(f"Scraped {resource_type} resources to {destination}")
+
+
+def format_resources(resource_type):
+    """Convert one type's JSON dump into a text file of fenced YAML blocks.
+
+    Reads `<resource_type>.json` from OUTPUT_DIR and writes `<resource_type>.txt`
+    next to it, both as UTF-8. Items with no first file path, a `.sql` path, or
+    empty `file_content` are skipped; missing model connectors render as empty.
+    """
+    json_path = os.path.join(OUTPUT_DIR, f"{resource_type}.json")
+    txt_path = os.path.join(OUTPUT_DIR, f"{resource_type}.txt")
+
+    with open(json_path, encoding="utf-8") as f:
+        data = json.load(f)
+
+    lines = [f"# {resource_type.replace('_', ' ').title()} examples"]
+    included_count = 0
+
+    for item in data:
+        try:
+            file_path = item["meta"]["filePaths"][0]
+        except (KeyError, IndexError, TypeError):
+            file_path = None
+        if not file_path or file_path.endswith(".sql"):
+            continue
+        content = item.get("file_content", "")
+        if not content:
+            continue
+
+        lines.append(f"## Path: {file_path}")
+        if resource_type == "model":
+            spec = item.get("spec") or {}
+            lines.append(f"Input connector: {spec.get('inputConnector', '')}")
+            lines.append(f"Output connector: {spec.get('outputConnector', '')}")
+        lines.extend(["```yaml", content, "```", ""])
+        included_count += 1
+
+    with open(txt_path, "w", encoding="utf-8") as f:
+        f.write("\n".join(lines))
+
+    print(
+        f"Formatted {included_count}/{len(data)} {resource_type} resources to {txt_path}"
+    )
+
+
+if __name__ == "__main__":
+    main()

From 774be0b605eebe49fade152629a91e1cf24ac505 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Benjamin=20Egelund-M=C3=BCller?= <b@egelund-muller.com>
Date: Thu, 8 Jan 2026 17:41:03 +0100
Subject: [PATCH 2/2] Add theme

---
 scripts/resource-yaml-training-data/build.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/scripts/resource-yaml-training-data/build.py b/scripts/resource-yaml-training-data/build.py
index 3875e3fa706..ec38cc517bf 100644
--- a/scripts/resource-yaml-training-data/build.py
+++ b/scripts/resource-yaml-training-data/build.py
@@ -10,7 +10,14 @@
 
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 OUTPUT_DIR = os.path.join(SCRIPT_DIR, "output")
-RESOURCE_TYPES = ["connector", "model", "metrics_view", "explore", "canvas"]
+RESOURCE_TYPES = [
+    "connector",
+    "model",
+    "metrics_view",
+    "explore",
+    "canvas",
+    "theme",
+]
 
 
 def main():