From 1a5e2da33ae7dcf30056cf06fbffb76b2dd5b7d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Benjamin=20Egelund-M=C3=BCller?=
Date: Thu, 8 Jan 2026 12:41:04 +0100
Subject: [PATCH 1/2] Script for scraping resource YAML files from Rill Cloud

---
 .../resource-yaml-training-data/.gitignore    |  1 +
 scripts/resource-yaml-training-data/README.md | 20 +++++
 scripts/resource-yaml-training-data/build.py  | 89 +++++++++++++++++++
 3 files changed, 110 insertions(+)
 create mode 100644 scripts/resource-yaml-training-data/.gitignore
 create mode 100644 scripts/resource-yaml-training-data/README.md
 create mode 100644 scripts/resource-yaml-training-data/build.py

diff --git a/scripts/resource-yaml-training-data/.gitignore b/scripts/resource-yaml-training-data/.gitignore
new file mode 100644
index 00000000000..6caf68aff42
--- /dev/null
+++ b/scripts/resource-yaml-training-data/.gitignore
@@ -0,0 +1 @@
+output
\ No newline at end of file
diff --git a/scripts/resource-yaml-training-data/README.md b/scripts/resource-yaml-training-data/README.md
new file mode 100644
index 00000000000..c652c361935
--- /dev/null
+++ b/scripts/resource-yaml-training-data/README.md
@@ -0,0 +1,20 @@
+# Resource YAML Training Data
+
+Scrapes real-world Rill project resource YAML files from Rill Cloud for use as LLM input/training data.
+Saves the output to `./scripts/resource-yaml-training-data/output/`.
+
+## What it does
+
+1. Uses `rill sudo project dump-resources --include-files` to fetch all resources of each type from Rill Cloud
+2. Saves the raw JSON dumps to `output/<resource_type>.json`
+3. 
Formats the raw JSON dumps into a unified file of original YAML resources at `output/<resource_type>.txt`
+
+## Prerequisites
+
+- Rill CLI installed and authenticated with admin access (`rill sudo` permissions)
+
+## Usage
+
+```bash
+uv run ./scripts/resource-yaml-training-data/build.py
+```
diff --git a/scripts/resource-yaml-training-data/build.py b/scripts/resource-yaml-training-data/build.py
new file mode 100644
index 00000000000..3875e3fa706
--- /dev/null
+++ b/scripts/resource-yaml-training-data/build.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+"""
+Scrapes real-world Rill project resource YAML files from Rill Cloud and formats
+them into text files suitable for LLM training data.
+"""
+
+import json
+import os
+import subprocess
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+OUTPUT_DIR = os.path.join(SCRIPT_DIR, "output")
+RESOURCE_TYPES = ["connector", "model", "metrics_view", "explore", "canvas"]
+
+
+def main():
+    """Scrape all resource types from Rill Cloud and format them as text files."""
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+    for resource_type in RESOURCE_TYPES:
+        scrape_resources(resource_type)
+        format_resources(resource_type)
+
+
+def scrape_resources(resource_type):
+    """Dump resources of a given type from Rill Cloud to a JSON file.
+
+    The output of `rill sudo project dump-resources` is redirected to
+    `<OUTPUT_DIR>/<resource_type>.json`.
+
+    Raises:
+        subprocess.CalledProcessError: If the command exits with a non-zero status.
+    """
+    output_path = os.path.join(OUTPUT_DIR, f"{resource_type}.json")
+    with open(output_path, "w") as f:
+        subprocess.run(
+            [
+                "rill",
+                "sudo",
+                "project",
+                "dump-resources",
+                "--include-files",
+                "--type",
+                resource_type,
+            ],
+            stdout=f,
+            check=True,
+        )
+    print(f"Scraped {resource_type} resources to {output_path}")
+
+
+def format_resources(resource_type):
+    """Convert JSON dump to a readable text file with YAML content blocks.
+
+    Reads `<OUTPUT_DIR>/<resource_type>.json` and writes
+    `<OUTPUT_DIR>/<resource_type>.txt`. An item is skipped when it has no file
+    path, when its first file path ends in `.sql`, or when it has no file content.
+    """
+    json_path = os.path.join(OUTPUT_DIR, f"{resource_type}.json")
+    txt_path = os.path.join(OUTPUT_DIR, f"{resource_type}.txt")
+
+    # Read and write as UTF-8 explicitly instead of relying on the locale default.
+    with open(json_path, encoding="utf-8") as f:
+        data = json.load(f)
+
+    lines = [f"# {resource_type.replace('_', ' ').title()} examples"]
+    included_count = 0
+
+    for item in data:
+        try:
+            file_path = item["meta"]["filePaths"][0]
+        except (KeyError, IndexError, TypeError):
+            file_path = None
+        if not file_path:
+            continue
+        if file_path.endswith(".sql"):
+            continue
+
+        content = item.get("file_content", "")
+        if not content:
+            continue
+
+        lines.append(f"## Path: {file_path}")
+        if resource_type == "model":
+            # Tolerate a missing spec or connector so that one incomplete item
+            # does not abort the whole run; absent values are simply left out.
+            spec = item.get("spec") or {}
+            for label, key in (
+                ("Input", "inputConnector"),
+                ("Output", "outputConnector"),
+            ):
+                connector = spec.get(key)
+                if connector:
+                    lines.append(f"{label} connector: {connector}")
+        lines.append("```yaml")
+        lines.append(content)
+        lines.append("```")
+        lines.append("")
+        included_count += 1
+
+    with open(txt_path, "w", encoding="utf-8") as f:
+        f.write("\n".join(lines))
+
+    print(
+        f"Formatted {included_count}/{len(data)} {resource_type} resources to {txt_path}"
+    )
+
+
+if __name__ == "__main__":
+    main()

From 774be0b605eebe49fade152629a91e1cf24ac505 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Benjamin=20Egelund-M=C3=BCller?=
Date: Thu, 8 Jan 2026 17:41:03 +0100
Subject: [PATCH 2/2] Add theme

---
 scripts/resource-yaml-training-data/build.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/scripts/resource-yaml-training-data/build.py b/scripts/resource-yaml-training-data/build.py
index 3875e3fa706..ec38cc517bf 100644
--- a/scripts/resource-yaml-training-data/build.py
+++ b/scripts/resource-yaml-training-data/build.py
@@ -10,7 +10,14 @@
 
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 OUTPUT_DIR = os.path.join(SCRIPT_DIR, "output")
-RESOURCE_TYPES = ["connector", "model", "metrics_view", "explore", "canvas"]
+RESOURCE_TYPES = [
+    "connector",
+    "model",
+    "metrics_view",
+    "explore",
+    "canvas",
+    "theme",
+]
 
 
 def main():