Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions scripts/resource-yaml-training-data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
output
20 changes: 20 additions & 0 deletions scripts/resource-yaml-training-data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Resource YAML Training Data

Scrapes real-world Rill project resource YAML files from Rill Cloud for use as LLM input/training data.
Saves the output to `./scripts/resource-yaml-training-data/output/<type.txt>`.

## What it does

1. Uses `rill sudo project dump-resources --include-files` to fetch all resources of each type from Rill Cloud
2. Saves the raw JSON dumps to `output/<type>.json`
3. Formats the raw JSON dumps into a unified file of original YAML resources at `output/<type>.txt`

## Prerequisites

- Rill CLI installed and authenticated with admin access (`rill sudo` permissions)

## Usage

```bash
uv run ./scripts/resource-yaml-training-data/build.py
```
96 changes: 96 additions & 0 deletions scripts/resource-yaml-training-data/build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env python3
"""
Scrapes real-world Rill project resource YAML files from Rill Cloud and formats
them into text files suitable for LLM training data.
"""

import json
import os
import subprocess

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_DIR = os.path.join(SCRIPT_DIR, "output")
RESOURCE_TYPES = [
"connector",
"model",
"metrics_view",
"explore",
"canvas",
"theme",
]


def main():
"""Scrape all resource types from Rill Cloud and format them as text files."""
os.makedirs(OUTPUT_DIR, exist_ok=True)

for resource_type in RESOURCE_TYPES:
scrape_resources(resource_type)
format_resources(resource_type)


def scrape_resources(resource_type):
"""Dump resources of a given type from Rill Cloud to a JSON file."""
output_path = os.path.join(OUTPUT_DIR, f"{resource_type}.json")
with open(output_path, "w") as f:
subprocess.run(
[
"rill",
"sudo",
"project",
"dump-resources",
"--include-files",
"--type",
resource_type,
],
stdout=f,
check=True,
)
print(f"Scraped {resource_type} resources to {output_path}")


def format_resources(resource_type):
"""Convert JSON dump to a readable text file with YAML content blocks."""
json_path = os.path.join(OUTPUT_DIR, f"{resource_type}.json")
txt_path = os.path.join(OUTPUT_DIR, f"{resource_type}.txt")

with open(json_path) as f:
data = json.load(f)

lines = [f"# {resource_type.replace('_', ' ').title()} examples"]
included_count = 0

for item in data:
try:
file_path = item["meta"]["filePaths"][0]
except (KeyError, IndexError, TypeError):
file_path = None
if not file_path:
continue
if file_path.endswith(".sql"):
continue

content = item.get("file_content", "")
if not content:
continue

lines.append(f"## Path: {file_path}")
if resource_type == "model":
lines.append(f"Input connector: {item['spec']['inputConnector']}")
lines.append(f"Output connector: {item['spec']['outputConnector']}")
lines.append("```yaml")
lines.append(content)
lines.append("```")
lines.append("")
included_count += 1

with open(txt_path, "w") as f:
f.write("\n".join(lines))

print(
f"Formatted {included_count}/{len(data)} {resource_type} resources to {txt_path}"
)


if __name__ == "__main__":
main()
Loading