Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/code_checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,5 +54,10 @@ jobs:
uses: pypa/gh-action-pip-audit@1220774d901786e6f652ae159f7b6bc8fea6d266
with:
virtual-environment: .venv/
# Skipping one nbconvert vulnerability that has no fix version
# Skipping one orjson vulnerability that has no fix version
# Skipping one protobuf vulnerability that has no fix version
ignore-vulns: |
GHSA-xm59-rqc7-hhvf
GHSA-hx9q-6w63-j58v
GHSA-7gcm-g887-7qv7
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,6 @@ wheels/
**.ipynb_checkpoints

.env
.gradio
aieng-eval-agents/aieng/agent_evals/report_generation/data/*.db
aieng-eval-agents/aieng/agent_evals/report_generation/reports/*
193 changes: 193 additions & 0 deletions aieng-eval-agents/aieng/agent_evals/async_client_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
"""Async client lifecycle manager for Gradio applications.

Provides idempotent initialization and proper cleanup of async clients
like Weaviate and OpenAI to prevent event loop conflicts during Gradio's
hot-reload process.
"""

import os
import sqlite3
import urllib.parse
from pathlib import Path
from typing import Any

import pandas as pd
from aieng.agent_evals.configs import Configs
from openai import AsyncOpenAI
from weaviate.client import WeaviateAsyncClient


# Fallback locations used when the corresponding environment variable is not set:
#   - REPORT_GENERATION_DB_PATH overrides DEFAULT_SQLITE_DB_PATH
#   - REPORTS_OUTPUT_PATH overrides DEFAULT_REPORTS_OUTPUT_PATH
# Both defaults are relative paths, so they are resolved against the current
# working directory of the process.
DEFAULT_SQLITE_DB_PATH = Path("aieng-eval-agents/aieng/agent_evals/report_generation/data/OnlineRetail.db")
DEFAULT_REPORTS_OUTPUT_PATH = Path("aieng-eval-agents/aieng/agent_evals/report_generation/reports/")


class SQLiteConnection:
    """Thin wrapper around a single ``sqlite3`` connection.

    The database location is read from the ``REPORT_GENERATION_DB_PATH``
    environment variable. When that variable is not set,
    ``DEFAULT_SQLITE_DB_PATH`` is used instead.
    """

    def __init__(self) -> None:
        """Open the connection to the configured database file."""
        database = os.getenv("REPORT_GENERATION_DB_PATH", DEFAULT_SQLITE_DB_PATH)
        self._connection = sqlite3.connect(database)

    def execute(self, query: str) -> list[Any]:
        """Run a query and return every resulting row.

        Parameters
        ----------
        query : str
            The SQLite query to execute.

        Returns
        -------
        list[Any]
            The rows produced by ``execute(query).fetchall()``.
        """
        cursor = self._connection.execute(query)
        rows = cursor.fetchall()
        return rows

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self._connection.close()


class ReportFileWriter:
    """Write reports as XLSX files into the reports output directory."""

    def write_report_to_file(
        self,
        report_data: list[Any],
        report_columns: list[str],
        filename: str = "report.xlsx",
        gradio_link: bool = True,
    ) -> str:
        """Write a report to an XLSX file.

        Parameters
        ----------
        report_data : list[Any]
            The rows of the report.
        report_columns : list[str]
            The column names of the report.
        filename : str, default="report.xlsx"
            The name of the file to create inside the reports output directory.
        gradio_link : bool, default=True
            Whether to return a file link that works with the Gradio UI.

        Returns
        -------
        str
            The path to the report file. If ``gradio_link`` is True, a
            ``gradio_api/file=...`` link with the percent-encoded file path is
            returned instead, which allows the Gradio UI to download the file.
        """
        reports_output_path = self.get_reports_output_path()
        # parents=True is required because the output directory may be nested
        # inside directories that do not exist yet (e.g. on a fresh checkout or
        # with a custom REPORTS_OUTPUT_PATH); without it mkdir raises
        # FileNotFoundError.
        reports_output_path.mkdir(parents=True, exist_ok=True)
        filepath = reports_output_path / filename

        report_df = pd.DataFrame(report_data, columns=report_columns)
        report_df.to_excel(filepath, index=False)

        file_uri = str(filepath)
        if gradio_link:
            # safe='' also encodes "/" so the whole path is a single URL component.
            file_uri = f"gradio_api/file={urllib.parse.quote(file_uri, safe='')}"

        return file_uri

    @staticmethod
    def get_reports_output_path() -> Path:
        """Get the reports output path.

        If no path is provided in the ``REPORTS_OUTPUT_PATH`` environment
        variable, the default path in ``DEFAULT_REPORTS_OUTPUT_PATH`` is used.

        Returns
        -------
        Path
            The reports output path.
        """
        configured_path = os.getenv("REPORTS_OUTPUT_PATH")
        if configured_path is None:
            return Path(DEFAULT_REPORTS_OUTPUT_PATH)
        return Path(configured_path)


class AsyncClientManager:
    """Manages client lifecycle with lazy initialization and cleanup.

    Each client is created on first access of its property and cached on the
    manager, so it is created only once until ``close()`` is called.

    Parameters
    ----------
    configs: Configs | None, optional, default=None
        Configuration object for client setup. If None, a new ``Configs()`` is
        created on first access of the ``configs`` property.

    Examples
    --------
    >>> manager = AsyncClientManager.get_instance()
    >>> # Access clients (created on first access)
    >>> openai = manager.openai_client
    >>> sqlite = manager.sqlite_connection
    >>> writer = manager.report_file_writer
    >>> # In finally block or cleanup
    >>> await manager.close()
    """

    _singleton_instance: "AsyncClientManager | None" = None

    @classmethod
    def get_instance(cls) -> "AsyncClientManager":
        """Get the singleton instance of the client manager.

        Returns
        -------
        AsyncClientManager
            The singleton instance of the client manager.
        """
        if cls._singleton_instance is None:
            # Instantiate via ``cls`` so a subclass gets an instance of itself.
            cls._singleton_instance = cls()
        return cls._singleton_instance

    def __init__(self, configs: "Configs | None" = None) -> None:
        """Initialize manager with optional configs."""
        self._configs: Configs | None = configs
        self._weaviate_client: WeaviateAsyncClient | None = None
        self._openai_client: AsyncOpenAI | None = None
        self._sqlite_connection: SQLiteConnection | None = None
        self._report_file_writer: ReportFileWriter | None = None
        self._initialized: bool = False

    @property
    def configs(self) -> "Configs":
        """Get or create configs instance."""
        if self._configs is None:
            self._configs = Configs()  # pyright: ignore[reportCallIssue]
        return self._configs

    @property
    def openai_client(self) -> "AsyncOpenAI":
        """Get or create OpenAI client."""
        if self._openai_client is None:
            self._openai_client = AsyncOpenAI()
            self._initialized = True
        return self._openai_client

    @property
    def sqlite_connection(self) -> "SQLiteConnection":
        """Get or create SQLite connection."""
        if self._sqlite_connection is None:
            self._sqlite_connection = SQLiteConnection()
            self._initialized = True
        return self._sqlite_connection

    @property
    def report_file_writer(self) -> "ReportFileWriter":
        """Get or create ReportFileWriter."""
        if self._report_file_writer is None:
            self._report_file_writer = ReportFileWriter()
            self._initialized = True
        return self._report_file_writer

    async def close(self) -> None:
        """Close the initialized clients and reset the manager state."""
        if self._openai_client is not None:
            await self._openai_client.close()
            self._openai_client = None

        if self._sqlite_connection is not None:
            self._sqlite_connection.close()
            self._sqlite_connection = None

        # The writer has nothing to close, but it must be dropped as well:
        # otherwise a later access would return the cached writer without
        # setting ``_initialized`` back to True, leaving ``is_initialized()``
        # out of sync with the actual state.
        self._report_file_writer = None

        self._initialized = False

    def is_initialized(self) -> bool:
        """Check if any clients have been initialized."""
        return self._initialized
109 changes: 109 additions & 0 deletions aieng-eval-agents/aieng/agent_evals/configs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
"""Configuration settings for the agent evals."""

from pydantic import AliasChoices, Field
from pydantic_settings import BaseSettings, SettingsConfigDict


class Configs(BaseSettings):
    """Configuration settings loaded from environment variables.

    This class automatically loads configuration values from environment variables
    and a .env file, and provides type-safe access to all settings. It validates
    environment variables on instantiation.

    Attributes
    ----------
    openai_base_url : str
        Base URL for OpenAI-compatible API (defaults to Gemini endpoint).
    openai_api_key : str
        API key for OpenAI-compatible API (accepts OPENAI_API_KEY, GEMINI_API_KEY,
        or GOOGLE_API_KEY).
    default_planner_model : str, default='gemini-2.5-pro'
        Model name for planning tasks. This is typically a more capable and expensive
        model.
    default_worker_model : str, default='gemini-2.5-flash'
        Model name for worker tasks. This is typically a less expensive model.
    embedding_base_url : str
        Base URL for embedding API service.
    embedding_api_key : str
        API key for embedding service.
    embedding_model_name : str, default='@cf/baai/bge-m3'
        Name of the embedding model.
    weaviate_collection_name : str, default='enwiki_20250520'
        Name of the Weaviate collection to use.
    weaviate_api_key : str or None, default=None
        Optional API key for Weaviate cloud instance.
    weaviate_http_host : str
        Weaviate HTTP host (must end with .weaviate.cloud, or be exactly
        "localhost").
    weaviate_grpc_host : str
        Weaviate gRPC host (must start with grpc- and end with .weaviate.cloud,
        or be exactly "localhost").
    weaviate_http_port : int, default=443
        Port for Weaviate HTTP connections.
    weaviate_grpc_port : int, default=443
        Port for Weaviate gRPC connections.
    weaviate_http_secure : bool, default=True
        Use secure HTTP connection.
    weaviate_grpc_secure : bool, default=True
        Use secure gRPC connection.
    langfuse_public_key : str
        Langfuse public key (must start with pk-lf-).
    langfuse_secret_key : str
        Langfuse secret key (must start with sk-lf-).
    langfuse_host : str, default='https://us.cloud.langfuse.com'
        Langfuse host URL.
    e2b_api_key : str or None, default=None
        Optional E2B.dev API key for code interpreter (must start with e2b_).
    default_code_interpreter_template : str or None, default='9p6favrrqijhasgkq1tv'
        Optional default template name or ID for E2B.dev code interpreter.
    web_search_base_url : str or None, default=None
        Optional base URL for web search service.
    web_search_api_key : str or None, default=None
        Optional API key for web search service.

    Examples
    --------
    >>> from aieng.agent_evals.configs import Configs
    >>> config = Configs()
    >>> print(config.default_planner_model)
    'gemini-2.5-pro'

    Notes
    -----
    Create a .env file in your project root with the required environment
    variables. The class will automatically load and validate them.
    """

    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", env_ignore_empty=True)

    openai_base_url: str = "https://generativelanguage.googleapis.com/v1beta/openai/"
    openai_api_key: str = Field(validation_alias=AliasChoices("OPENAI_API_KEY", "GEMINI_API_KEY", "GOOGLE_API_KEY"))

    default_planner_model: str = "gemini-2.5-pro"
    default_worker_model: str = "gemini-2.5-flash"

    embedding_base_url: str
    embedding_api_key: str
    embedding_model_name: str = "@cf/baai/bge-m3"

    weaviate_collection_name: str = "enwiki_20250520"
    weaviate_api_key: str | None = None
    # Ends with .weaviate.cloud, or is exactly "localhost". The alternation is
    # grouped inside the anchors so that "localhost" must match the whole value
    # rather than appear anywhere in it.
    weaviate_http_host: str = Field(pattern=r"^(.*\.weaviate\.cloud|localhost)$")
    # Starts with grpc- and ends with .weaviate.cloud, or is exactly "localhost".
    weaviate_grpc_host: str = Field(pattern=r"^(grpc-.*\.weaviate\.cloud|localhost)$")
    weaviate_http_port: int = 443
    weaviate_grpc_port: int = 443
    weaviate_http_secure: bool = True
    weaviate_grpc_secure: bool = True

    langfuse_public_key: str = Field(pattern=r"^pk-lf-.*$")
    langfuse_secret_key: str = Field(pattern=r"^sk-lf-.*$")
    langfuse_host: str = "https://us.cloud.langfuse.com"

    # Optional E2B.dev API key for Python Code Interpreter tool
    e2b_api_key: str | None = Field(default=None, pattern=r"^e2b_.*$")
    default_code_interpreter_template: str | None = "9p6favrrqijhasgkq1tv"

    # Optional configs for web search tool
    web_search_base_url: str | None = None
    web_search_api_key: str | None = None
52 changes: 52 additions & 0 deletions aieng-eval-agents/aieng/agent_evals/report_generation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Report Generation Agent

This code implements an example of a Report Generation Agent for a single-table relational
data source.

The data source implemented here is [SQLite](https://sqlite.org/), which is supported
natively by Python and saves the data on disk.

The Report Generation Agent will provide a UI to read user queries in natural language
and proceed to make SQL queries to the database in order to produce the data for
the report. At the end, the Agent will provide a downloadable link to the report as
an `.xlsx` file.

## Dataset

The dataset used in this example is the
[Online Retail](https://archive.ics.uci.edu/dataset/352/online+retail) dataset. It contains
information about invoices for products that were purchased by customers, which also includes
product quantity, the invoice date and country that the user resides in. For a more
detailed data structure, please check the [OnlineRetail.ddl](data/Online%20Retail.ddl) file.

## Importing the Data

To import the data, please download the dataset file from the link below and save it to your
file system.

https://archive.ics.uci.edu/static/public/352/online+retail.zip

You can import the dataset to the database by running the script below:

```bash
uv run --env-file .env python -m aieng.agent_evals.report_generation.data.import_online_retail_data --dataset-path <path_to_the_csv_file>
```

Replace `<path_to_the_csv_file>` with the path where the dataset's .CSV file is saved on your machine.

***NOTE:*** You can configure where the database is saved by setting the
`REPORT_GENERATION_DB_PATH` environment variable to the desired path.

## Running

To run the agent, please execute:

```bash
uv run --env-file .env python -m aieng.agent_evals.report_generation.main
```

The agent will be available through a [Gradio](https://www.gradio.app/) web UI under the
local address http://127.0.0.1:7860, which can be accessed on your preferred browser.

On the UI, there will be a few examples of requests you can make to this agent. It also
features a text input so you can make your own report requests to it.
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
-- Schema of the "sales" table that holds the Online Retail dataset used by
-- the report generation agent.
-- IF NOT EXISTS makes the statement safe to run against a database where the
-- table has already been created.
CREATE TABLE IF NOT EXISTS "sales" (
"InvoiceNo" INTEGER,
"StockCode" TEXT,
"Description" TEXT,
"Quantity" INTEGER,
"InvoiceDate" TEXT,
"UnitPrice" REAL,
"CustomerID" INTEGER,
"Country" TEXT
);
Loading