From b6f661e59824a7270e491e490972cc5069d5a31c Mon Sep 17 00:00:00 2001 From: sbalk Date: Mon, 28 Jul 2025 03:48:32 +0200 Subject: [PATCH 1/2] Add piper server support for OHF piper-tts --- .gitignore | 4 ++ README.md | 28 ++++++++-- agent_cli/agents/_voice_agent_common.py | 2 + agent_cli/agents/assistant.py | 19 +++++++ agent_cli/agents/autocorrect.py | 2 +- agent_cli/agents/chat.py | 21 +++++++ agent_cli/agents/speak.py | 20 +++++++ agent_cli/agents/transcribe.py | 2 +- agent_cli/agents/voice_edit.py | 19 +++++++ agent_cli/config.py | 14 ++++- agent_cli/opts.py | 44 +++++++++++++++ agent_cli/services/tts.py | 56 +++++++++++++++++++ docs/installation/linux.md | 2 +- docs/installation/macos.md | 2 +- example.agent-cli-config.toml | 4 +- pyproject.toml | 1 + scripts/run-piper-server.sh | 18 ++++++ .../{run-piper.sh => run-piper-wyoming.sh} | 0 scripts/setup-linux.sh | 2 +- scripts/setup-macos.sh | 2 +- scripts/start-all-services.sh | 2 +- tests/agents/test_fix_my_text.py | 10 ++-- tests/agents/test_interactive.py | 17 +++++- tests/agents/test_interactive_extra.py | 18 +++++- tests/agents/test_speak.py | 12 +++- tests/agents/test_speak_e2e.py | 4 ++ tests/agents/test_transcribe.py | 6 +- tests/agents/test_transcribe_e2e.py | 2 +- tests/agents/test_tts_common.py | 37 +++++++++++- tests/agents/test_tts_common_extra.py | 12 +++- tests/agents/test_voice_agent_common.py | 10 +++- tests/agents/test_voice_edit_e2e.py | 10 +++- tests/test_llm.py | 12 ++-- tests/test_llm_gemini.py | 2 +- tests/test_services.py | 10 +++- tests/test_tts.py | 12 +++- 36 files changed, 390 insertions(+), 48 deletions(-) create mode 100755 scripts/run-piper-server.sh rename scripts/{run-piper.sh => run-piper-wyoming.sh} (100%) diff --git a/.gitignore b/.gitignore index 0b42d388..e4a486a2 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,7 @@ coverage.* .pytest_cache/ .vscode/ .vscode/ +.DS_Store # Examples and scripts - exclude downloaded models and data examples/ollama/models/ @@ -67,3 +68,6 @@ scripts/.runtime/ *.onnx *.onnx.json *.bin + +# Config file +agent-cli-config.toml diff --git a/README.md b/README.md index 31b8032e..7c28dddc 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ I use it mostly for the `transcribe` function when working with LLMs. Being able - **`autocorrect`**: Correct grammar and spelling in your text (e.g., from clipboard) using a local LLM with Ollama or OpenAI. - **`transcribe`**: Transcribe audio from your microphone to text in your clipboard using a local Whisper model or OpenAI's Whisper API. -- **`speak`**: Convert text to speech using a local TTS engine or OpenAI's TTS API. +- **`speak`**: Convert text to speech using Piper HTTP server, Wyoming TTS, OpenAI, or Kokoro TTS. - **`voice-edit`**: A voice-powered clipboard assistant that edits text based on your spoken commands. - **`assistant`**: A hands-free voice assistant that starts and stops recording based on a wake word. - **`chat`**: A conversational AI agent with tool-calling capabilities. 
@@ -258,7 +258,13 @@ Our installation scripts automatically handle all dependencies: |---------|---------|-----------------| | **[Ollama](https://ollama.ai/)** | Local LLM for text processing | ✅ Yes, with default model | | **[Wyoming Faster Whisper](https://github.com/rhasspy/wyoming-faster-whisper)** | Speech-to-text | ✅ Yes, via `uvx` | -| **[Wyoming Piper](https://github.com/rhasspy/wyoming-piper)** | Text-to-speech | ✅ Yes, via `uvx` | +| **[Piper TTS](https://github.com/rhasspy/piper)** | Text-to-speech (HTTP server) | ✅ Yes, via `uvx` | +| **[Wyoming Piper](https://github.com/rhasspy/wyoming-piper)** | Text-to-speech (Wyoming protocol) | ⚙️ Alternative to HTTP server | + +> [!NOTE] +> **TTS Provider Update**: The default Piper TTS now uses HTTP server mode for better performance. +> Scripts have been renamed: `run-piper.sh` → `run-piper-wyoming.sh`, `run-piper2.sh` → `run-piper-server.sh`. +> Use `--tts-provider piper` for the new HTTP server, or `--tts-provider local` for Wyoming protocol. | **[Kokoro-FastAPI](https://github.com/remsky/Kokoro-FastAPI)** | Premium TTS (optional) | ⚙️ Can be added later | | **[Wyoming openWakeWord](https://github.com/rhasspy/wyoming-openwakeword)** | Wake word detection | ✅ Yes, for `assistant` | @@ -531,14 +537,28 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett **Workflow:** A straightforward text-to-speech utility. 1. It takes text from a command-line argument or your clipboard. -2. It sends the text to a Wyoming TTS server (like Piper). +2. It sends the text to a TTS server (Piper HTTP server, Wyoming TTS, OpenAI, or Kokoro). 3. The generated audio is played through your default speakers. +**TTS Provider Options:** + +- **Piper HTTP Server** (default local): Fast, high-quality TTS via HTTP + - Start server: `./scripts/run-piper-server.sh` + - Use: `agent-cli speak --tts-provider piper "Hello, world!"` +- **Wyoming Piper**: Alternative Wyoming protocol interface + - Start server: `./scripts/run-piper-wyoming.sh` + - Use: `agent-cli speak --tts-provider local "Hello, world!"` +- **OpenAI**: Cloud-based TTS (requires API key) + - Use: `agent-cli speak --tts-provider openai "Hello, world!"` +- **Kokoro**: High-quality local TTS (optional setup) + - Use: `agent-cli speak --tts-provider kokoro "Hello, world!"` + **How to Use It:** - **Speak from Argument**: `agent-cli speak "Hello, world!"` - **Speak from Clipboard**: `agent-cli speak` - **Save to File**: `agent-cli speak "Hello" --save-file hello.wav` +- **With Piper HTTP**: `agent-cli speak --tts-provider piper "Hello"`
See the output of agent-cli speak --help @@ -558,7 +578,7 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett Usage: agent-cli speak [OPTIONS] [TEXT] - Convert text to speech using Wyoming or OpenAI TTS server. + Convert text to speech using Piper, Wyoming, OpenAI, or Kokoro TTS server. ╭─ General Options ────────────────────────────────────────────────────────────╮ diff --git a/agent_cli/agents/_voice_agent_common.py b/agent_cli/agents/_voice_agent_common.py index ac304062..d1d9a377 100644 --- a/agent_cli/agents/_voice_agent_common.py +++ b/agent_cli/agents/_voice_agent_common.py @@ -86,6 +86,7 @@ async def process_instruction_and_respond( wyoming_tts_cfg: config.WyomingTTS, openai_tts_cfg: config.OpenAITTS, kokoro_tts_cfg: config.KokoroTTS, + piper_tts_cfg: config.PiperTTS, system_prompt: str, agent_instructions: str, live: Live | None, @@ -120,6 +121,7 @@ async def process_instruction_and_respond( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, save_file=general_cfg.save_file, quiet=general_cfg.quiet, logger=logger, diff --git a/agent_cli/agents/assistant.py b/agent_cli/agents/assistant.py index 3c5c48cd..b810ce7c 100644 --- a/agent_cli/agents/assistant.py +++ b/agent_cli/agents/assistant.py @@ -178,6 +178,7 @@ async def _async_main( wyoming_tts_cfg: config.WyomingTTS, openai_tts_cfg: config.OpenAITTS, kokoro_tts_cfg: config.KokoroTTS, + piper_tts_cfg: config.PiperTTS, wake_word_cfg: config.WakeWord, system_prompt: str, agent_instructions: str, @@ -240,6 +241,7 @@ async def _async_main( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, system_prompt=system_prompt, agent_instructions=agent_instructions, live=live, @@ -289,6 +291,13 @@ def assistant( tts_kokoro_model: str = opts.TTS_KOKORO_MODEL, tts_kokoro_voice: str = opts.TTS_KOKORO_VOICE, tts_kokoro_host: str = opts.TTS_KOKORO_HOST, + tts_piper_host: str = opts.TTS_PIPER_HOST, + tts_piper_voice: str | None = opts.TTS_PIPER_VOICE, + tts_piper_speaker: str | None = opts.TTS_PIPER_SPEAKER, + tts_piper_speaker_id: int | None = opts.TTS_PIPER_SPEAKER_ID, + tts_piper_length_scale: float = opts.TTS_PIPER_LENGTH_SCALE, + tts_piper_noise_scale: float | None = opts.TTS_PIPER_NOISE_SCALE, + tts_piper_noise_w_scale: float | None = opts.TTS_PIPER_NOISE_W_SCALE, # --- Process Management --- stop: bool = opts.STOP, status: bool = opts.STATUS, @@ -383,6 +392,15 @@ def assistant( tts_kokoro_voice=tts_kokoro_voice, tts_kokoro_host=tts_kokoro_host, ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host=tts_piper_host, + tts_piper_voice=tts_piper_voice, + tts_piper_speaker=tts_piper_speaker, + tts_piper_speaker_id=tts_piper_speaker_id, + tts_piper_length_scale=tts_piper_length_scale, + tts_piper_noise_scale=tts_piper_noise_scale, + tts_piper_noise_w_scale=tts_piper_noise_w_scale, + ) wake_word_cfg = config.WakeWord( wake_server_ip=wake_server_ip, wake_server_port=wake_server_port, @@ -413,6 +431,7 @@ def assistant( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, wake_word_cfg=wake_word_cfg, system_prompt=system_prompt, agent_instructions=agent_instructions, diff --git a/agent_cli/agents/autocorrect.py b/agent_cli/agents/autocorrect.py index 88c12692..ca2b248e 100644 --- a/agent_cli/agents/autocorrect.py +++ b/agent_cli/agents/autocorrect.py @@ -229,7 +229,7 @@ def autocorrect( provider_cfg = 
config.ProviderSelection( llm_provider=llm_provider, asr_provider="local", # Not used, but required by model - tts_provider="local", # Not used, but required by model + tts_provider="piper", # Not used, but required by model ) ollama_cfg = config.Ollama(llm_ollama_model=llm_ollama_model, llm_ollama_host=llm_ollama_host) openai_llm_cfg = config.OpenAILLM( diff --git a/agent_cli/agents/chat.py b/agent_cli/agents/chat.py index 90c8ef53..60ae4345 100644 --- a/agent_cli/agents/chat.py +++ b/agent_cli/agents/chat.py @@ -163,6 +163,7 @@ async def _handle_conversation_turn( wyoming_tts_cfg: config.WyomingTTS, openai_tts_cfg: config.OpenAITTS, kokoro_tts_cfg: config.KokoroTTS, + piper_tts_cfg: config.PiperTTS, live: Live, ) -> None: """Handles a single turn of the conversation.""" @@ -285,6 +286,7 @@ async def _handle_conversation_turn( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, save_file=general_cfg.save_file, quiet=general_cfg.quiet, logger=LOGGER, @@ -315,6 +317,7 @@ async def _async_main( wyoming_tts_cfg: config.WyomingTTS, openai_tts_cfg: config.OpenAITTS, kokoro_tts_cfg: config.KokoroTTS, + piper_tts_cfg: config.PiperTTS, ) -> None: """Main async function, consumes parsed arguments.""" try: @@ -362,6 +365,7 @@ async def _async_main( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, live=live, ) except Exception: @@ -405,6 +409,13 @@ def chat( tts_kokoro_model: str = opts.TTS_KOKORO_MODEL, tts_kokoro_voice: str = opts.TTS_KOKORO_VOICE, tts_kokoro_host: str = opts.TTS_KOKORO_HOST, + tts_piper_host: str = opts.TTS_PIPER_HOST, + tts_piper_voice: str | None = opts.TTS_PIPER_VOICE, + tts_piper_speaker: str | None = opts.TTS_PIPER_SPEAKER, + tts_piper_speaker_id: int | None = opts.TTS_PIPER_SPEAKER_ID, + tts_piper_length_scale: float = opts.TTS_PIPER_LENGTH_SCALE, + tts_piper_noise_scale: float | None = opts.TTS_PIPER_NOISE_SCALE, + tts_piper_noise_w_scale: float | None = opts.TTS_PIPER_NOISE_W_SCALE, # --- Process Management --- stop: bool = opts.STOP, status: bool = opts.STATUS, @@ -508,6 +519,15 @@ def chat( tts_kokoro_voice=tts_kokoro_voice, tts_kokoro_host=tts_kokoro_host, ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host=tts_piper_host, + tts_piper_voice=tts_piper_voice, + tts_piper_speaker=tts_piper_speaker, + tts_piper_speaker_id=tts_piper_speaker_id, + tts_piper_length_scale=tts_piper_length_scale, + tts_piper_noise_scale=tts_piper_noise_scale, + tts_piper_noise_w_scale=tts_piper_noise_w_scale, + ) history_cfg = config.History( history_dir=history_dir, last_n_messages=last_n_messages, @@ -528,5 +548,6 @@ def chat( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, ), ) diff --git a/agent_cli/agents/speak.py b/agent_cli/agents/speak.py index f803fa6f..f0e64266 100644 --- a/agent_cli/agents/speak.py +++ b/agent_cli/agents/speak.py @@ -35,6 +35,7 @@ async def _async_main( wyoming_tts_cfg: config.WyomingTTS, openai_tts_cfg: config.OpenAITTS, kokoro_tts_cfg: config.KokoroTTS, + piper_tts_cfg: config.PiperTTS, ) -> None: """Async entry point for the speak command.""" with pyaudio_context() as p: @@ -64,6 +65,7 @@ async def _async_main( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, save_file=general_cfg.save_file, quiet=general_cfg.quiet, logger=LOGGER, @@ -102,6 +104,14 @@ def speak( 
tts_kokoro_model: str = opts.TTS_KOKORO_MODEL, tts_kokoro_voice: str = opts.TTS_KOKORO_VOICE, tts_kokoro_host: str = opts.TTS_KOKORO_HOST, + # Piper + tts_piper_host: str = opts.TTS_PIPER_HOST, + tts_piper_voice: str | None = opts.TTS_PIPER_VOICE, + tts_piper_speaker: str | None = opts.TTS_PIPER_SPEAKER, + tts_piper_speaker_id: int | None = opts.TTS_PIPER_SPEAKER_ID, + tts_piper_length_scale: float = opts.TTS_PIPER_LENGTH_SCALE, + tts_piper_noise_scale: float | None = opts.TTS_PIPER_NOISE_SCALE, + tts_piper_noise_w_scale: float | None = opts.TTS_PIPER_NOISE_W_SCALE, # --- General Options --- list_devices: bool = opts.LIST_DEVICES, save_file: Path | None = opts.SAVE_FILE, @@ -165,6 +175,15 @@ def speak( tts_kokoro_voice=tts_kokoro_voice, tts_kokoro_host=tts_kokoro_host, ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host=tts_piper_host, + tts_piper_voice=tts_piper_voice, + tts_piper_speaker=tts_piper_speaker, + tts_piper_speaker_id=tts_piper_speaker_id, + tts_piper_length_scale=tts_piper_length_scale, + tts_piper_noise_scale=tts_piper_noise_scale, + tts_piper_noise_w_scale=tts_piper_noise_w_scale, + ) asyncio.run( _async_main( @@ -175,5 +194,6 @@ def speak( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, ), ) diff --git a/agent_cli/agents/transcribe.py b/agent_cli/agents/transcribe.py index 0d89f62b..134f7528 100644 --- a/agent_cli/agents/transcribe.py +++ b/agent_cli/agents/transcribe.py @@ -293,7 +293,7 @@ def transcribe( provider_cfg = config.ProviderSelection( asr_provider=asr_provider, llm_provider=llm_provider, - tts_provider="local", # Not used + tts_provider="piper", # Not used ) audio_in_cfg = config.AudioInput( input_device_index=input_device_index, diff --git a/agent_cli/agents/voice_edit.py b/agent_cli/agents/voice_edit.py index 0338d235..79e70649 100644 --- a/agent_cli/agents/voice_edit.py +++ b/agent_cli/agents/voice_edit.py @@ -101,6 +101,7 @@ async def _async_main( wyoming_tts_cfg: config.WyomingTTS, openai_tts_cfg: config.OpenAITTS, kokoro_tts_cfg: config.KokoroTTS, + piper_tts_cfg: config.PiperTTS, ) -> None: """Core asynchronous logic for the voice assistant.""" with pyaudio_context() as p: @@ -161,6 +162,7 @@ async def _async_main( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, system_prompt=SYSTEM_PROMPT, agent_instructions=AGENT_INSTRUCTIONS, live=live, @@ -203,6 +205,13 @@ def voice_edit( tts_kokoro_model: str = opts.TTS_KOKORO_MODEL, tts_kokoro_voice: str = opts.TTS_KOKORO_VOICE, tts_kokoro_host: str = opts.TTS_KOKORO_HOST, + tts_piper_host: str = opts.TTS_PIPER_HOST, + tts_piper_voice: str | None = opts.TTS_PIPER_VOICE, + tts_piper_speaker: str | None = opts.TTS_PIPER_SPEAKER, + tts_piper_speaker_id: int | None = opts.TTS_PIPER_SPEAKER_ID, + tts_piper_length_scale: float = opts.TTS_PIPER_LENGTH_SCALE, + tts_piper_noise_scale: float | None = opts.TTS_PIPER_NOISE_SCALE, + tts_piper_noise_w_scale: float | None = opts.TTS_PIPER_NOISE_W_SCALE, # --- Process Management --- stop: bool = opts.STOP, status: bool = opts.STATUS, @@ -302,6 +311,15 @@ def voice_edit( tts_kokoro_voice=tts_kokoro_voice, tts_kokoro_host=tts_kokoro_host, ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host=tts_piper_host, + tts_piper_voice=tts_piper_voice, + tts_piper_speaker=tts_piper_speaker, + tts_piper_speaker_id=tts_piper_speaker_id, + tts_piper_length_scale=tts_piper_length_scale, + tts_piper_noise_scale=tts_piper_noise_scale, + 
tts_piper_noise_w_scale=tts_piper_noise_w_scale, + ) asyncio.run( _async_main( @@ -317,5 +335,6 @@ def voice_edit( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, ), ) diff --git a/agent_cli/config.py b/agent_cli/config.py index 120ce22d..e8913849 100644 --- a/agent_cli/config.py +++ b/agent_cli/config.py @@ -23,7 +23,7 @@ class ProviderSelection(BaseModel): llm_provider: Literal["local", "openai", "gemini"] asr_provider: Literal["local", "openai"] - tts_provider: Literal["local", "openai", "kokoro"] + tts_provider: Literal["local", "openai", "kokoro", "piper"] # --- Panel: LLM Configuration --- @@ -112,6 +112,18 @@ class KokoroTTS(BaseModel): tts_kokoro_host: str +class PiperTTS(BaseModel): + """Configuration for the Piper HTTP TTS provider.""" + + tts_piper_host: str + tts_piper_voice: str | None = None + tts_piper_speaker: str | None = None + tts_piper_speaker_id: int | None = None + tts_piper_length_scale: float = 1.0 + tts_piper_noise_scale: float | None = None + tts_piper_noise_w_scale: float | None = None + + # --- Panel: Wake Word Options --- diff --git a/agent_cli/opts.py b/agent_cli/opts.py index 29d67770..d18901f5 100644 --- a/agent_cli/opts.py +++ b/agent_cli/opts.py @@ -230,6 +230,50 @@ rich_help_panel="TTS (Text-to-Speech) Configuration: Kokoro", ) +# --- TTS Configuration: Piper --- +TTS_PIPER_HOST: str = typer.Option( + "http://localhost:10200", + "--tts-piper-host", + help="The base URL for the Piper HTTP server.", + rich_help_panel="TTS (Text-to-Speech) Configuration: Piper", +) +TTS_PIPER_VOICE: str | None = typer.Option( + None, + "--tts-piper-voice", + help="The voice to use for Piper TTS (optional).", + rich_help_panel="TTS (Text-to-Speech) Configuration: Piper", +) +TTS_PIPER_SPEAKER: str | None = typer.Option( + None, + "--tts-piper-speaker", + help="The speaker to use for multi-speaker voices (optional).", + rich_help_panel="TTS (Text-to-Speech) Configuration: Piper", +) +TTS_PIPER_SPEAKER_ID: int | None = typer.Option( + None, + "--tts-piper-speaker-id", + help="The speaker ID to use for multi-speaker voices (optional, overrides speaker).", + rich_help_panel="TTS (Text-to-Speech) Configuration: Piper", +) +TTS_PIPER_LENGTH_SCALE: float = typer.Option( + 1.0, + "--tts-piper-length-scale", + help="Speaking speed (1.0 = normal speed).", + rich_help_panel="TTS (Text-to-Speech) Configuration: Piper", +) +TTS_PIPER_NOISE_SCALE: float | None = typer.Option( + None, + "--tts-piper-noise-scale", + help="Speaking variability (optional).", + rich_help_panel="TTS (Text-to-Speech) Configuration: Piper", +) +TTS_PIPER_NOISE_W_SCALE: float | None = typer.Option( + None, + "--tts-piper-noise-w-scale", + help="Phoneme width variability (optional).", + rich_help_panel="TTS (Text-to-Speech) Configuration: Piper", +) + # --- Process Management Options --- STOP: bool = typer.Option( diff --git a/agent_cli/services/tts.py b/agent_cli/services/tts.py index 2418e285..f849eab2 100644 --- a/agent_cli/services/tts.py +++ b/agent_cli/services/tts.py @@ -7,9 +7,11 @@ import io import wave from functools import partial +from http import HTTPStatus from pathlib import Path from typing import TYPE_CHECKING +import aiohttp from openai import AsyncOpenAI from rich.live import Live from wyoming.audio import AudioChunk, AudioStart, AudioStop @@ -45,6 +47,7 @@ def create_synthesizer( wyoming_tts_cfg: config.WyomingTTS, openai_tts_cfg: config.OpenAITTS, kokoro_tts_cfg: config.KokoroTTS, + piper_tts_cfg: config.PiperTTS, ) -> 
Callable[..., Awaitable[bytes | None]]: """Return the appropriate synthesizer based on the config.""" if not audio_output_cfg.enable_tts: @@ -59,6 +62,11 @@ def create_synthesizer( _synthesize_speech_kokoro, kokoro_tts_cfg=kokoro_tts_cfg, ) + if provider_cfg.tts_provider == "piper": + return partial( + _synthesize_speech_piper, + piper_tts_cfg=piper_tts_cfg, + ) return partial(_synthesize_speech_wyoming, wyoming_tts_cfg=wyoming_tts_cfg) @@ -70,6 +78,7 @@ async def handle_tts_playback( wyoming_tts_cfg: config.WyomingTTS, openai_tts_cfg: config.OpenAITTS, kokoro_tts_cfg: config.KokoroTTS, + piper_tts_cfg: config.PiperTTS, save_file: Path | None, quiet: bool, logger: logging.Logger, @@ -91,6 +100,7 @@ async def handle_tts_playback( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, logger=logger, quiet=quiet, play_audio_flag=play_audio, @@ -243,6 +253,49 @@ async def _synthesize_speech_kokoro( return None +async def _synthesize_speech_piper( + *, + text: str, + piper_tts_cfg: config.PiperTTS, + logger: logging.Logger, + **_kwargs: object, +) -> bytes | None: + """Synthesize speech from text using Piper HTTP server.""" + try: + payload: dict[str, str | int | float] = {"text": text} + + if piper_tts_cfg.tts_piper_voice: + payload["voice"] = piper_tts_cfg.tts_piper_voice + if piper_tts_cfg.tts_piper_speaker: + payload["speaker"] = piper_tts_cfg.tts_piper_speaker + if piper_tts_cfg.tts_piper_speaker_id is not None: + payload["speaker_id"] = piper_tts_cfg.tts_piper_speaker_id + if piper_tts_cfg.tts_piper_length_scale != 1.0: + payload["length_scale"] = piper_tts_cfg.tts_piper_length_scale + if piper_tts_cfg.tts_piper_noise_scale is not None: + payload["noise_scale"] = piper_tts_cfg.tts_piper_noise_scale + if piper_tts_cfg.tts_piper_noise_w_scale is not None: + payload["noise_w_scale"] = piper_tts_cfg.tts_piper_noise_w_scale + + async with ( + aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session, + session.post( + piper_tts_cfg.tts_piper_host, + json=payload, + headers={"Content-Type": "application/json"}, + ) as response, + ): + if response.status == HTTPStatus.OK: + audio_data = await response.read() + logger.info("Piper speech synthesis completed: %d bytes", len(audio_data)) + return audio_data + logger.error("Piper HTTP error: %d - %s", response.status, await response.text()) + return None + except Exception: + logger.exception("Error during Piper speech synthesis") + return None + + async def _synthesize_speech_wyoming( *, text: str, @@ -362,6 +415,7 @@ async def _speak_text( wyoming_tts_cfg: config.WyomingTTS, openai_tts_cfg: config.OpenAITTS, kokoro_tts_cfg: config.KokoroTTS, + piper_tts_cfg: config.PiperTTS, logger: logging.Logger, quiet: bool = False, play_audio_flag: bool = True, @@ -375,6 +429,7 @@ async def _speak_text( wyoming_tts_cfg, openai_tts_cfg, kokoro_tts_cfg, + piper_tts_cfg, ) audio_data = None try: @@ -384,6 +439,7 @@ async def _speak_text( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, logger=logger, quiet=quiet, live=live, diff --git a/docs/installation/linux.md b/docs/installation/linux.md index 410da7df..098a2cf9 100644 --- a/docs/installation/linux.md +++ b/docs/installation/linux.md @@ -66,7 +66,7 @@ ollama serve scripts/run-whisper.sh # Terminal 3: Piper -scripts/run-piper.sh +scripts/run-piper-server.sh # Terminal 4: OpenWakeWord scripts/run-openwakeword.sh diff --git a/docs/installation/macos.md 
b/docs/installation/macos.md index 16a80cb6..2220a879 100644 --- a/docs/installation/macos.md +++ b/docs/installation/macos.md @@ -88,7 +88,7 @@ ollama serve scripts/run-whisper.sh # Terminal 3: Piper (Apple Silicon compatible) -scripts/run-piper.sh +scripts/run-piper-server.sh # Terminal 4: OpenWakeWord (macOS compatible fork) scripts/run-openwakeword.sh diff --git a/example.agent-cli-config.toml b/example.agent-cli-config.toml index 7887acb6..cc6cee5d 100644 --- a/example.agent-cli-config.toml +++ b/example.agent-cli-config.toml @@ -16,7 +16,7 @@ # Select the default provider for each service ("local" or "openai"). llm-provider = "local" asr-provider = "local" -tts-provider = "local" +tts-provider = "piper" # --- API Keys --- # Your OpenAI API key. Can also be set via the OPENAI_API_KEY environment variable. @@ -97,7 +97,7 @@ last-n-messages = 50 # Number of messages to load from history [speak] # Use a specific voice for the speak command. -tts-provider = "local" +tts-provider = "piper" tts-wyoming-voice = "en_US-ryan-high" tts-speed = 1.0 diff --git a/pyproject.toml b/pyproject.toml index ebb0c5d7..f4c29270 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "openai", "dotenv", "google-genai>=1.25.0", + "aiohttp", ] requires-python = ">=3.11" diff --git a/scripts/run-piper-server.sh b/scripts/run-piper-server.sh new file mode 100755 index 00000000..747424d4 --- /dev/null +++ b/scripts/run-piper-server.sh @@ -0,0 +1,18 @@ +#!/bin/bash +echo "🔊 Starting Piper HTTP server on port 10200..." + +# Create .runtime directory +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +mkdir -p "$SCRIPT_DIR/.runtime" + +# Download voice if not present using uvx +if [ ! -d "$SCRIPT_DIR/.runtime/piper-data/en_US-lessac-medium" ]; then + echo "⬇️ Downloading voice model..." + mkdir -p "$SCRIPT_DIR/.runtime/piper-data" + cd "$SCRIPT_DIR/.runtime/piper-data" + uvx --from piper-tts python -m piper.download_voices en_US-lessac-medium + cd "$SCRIPT_DIR" +fi + +# Run Piper HTTP server using uvx with Flask dependency +uvx --with flask --from piper-tts python -m piper.http_server -m $SCRIPT_DIR/.runtime/piper-data/en_US-lessac-medium --port 10200 diff --git a/scripts/run-piper.sh b/scripts/run-piper-wyoming.sh similarity index 100% rename from scripts/run-piper.sh rename to scripts/run-piper-wyoming.sh diff --git a/scripts/setup-linux.sh b/scripts/setup-linux.sh index d26762ae..2a74bc0f 100755 --- a/scripts/setup-linux.sh +++ b/scripts/setup-linux.sh @@ -100,7 +100,7 @@ echo "" echo "Option 2 - Run services individually:" echo " 1. Ollama: ollama serve" echo " 2. Whisper: scripts/run-whisper.sh" -echo " 3. Piper: scripts/run-piper.sh" +echo " 3. Piper: scripts/run-piper-server.sh" echo " 4. OpenWakeWord: scripts/run-openwakeword.sh" echo "" echo "📝 Note: Services use uvx to run without needing virtual environments." diff --git a/scripts/setup-macos.sh b/scripts/setup-macos.sh index 2d20e886..974b6601 100755 --- a/scripts/setup-macos.sh +++ b/scripts/setup-macos.sh @@ -63,7 +63,7 @@ echo "" echo "Option 2 - Run services individually:" echo " 1. Ollama: ollama serve" echo " 2. Whisper: ./run-whisper.sh" -echo " 3. Piper: ./run-piper.sh" +echo " 3. Piper: ./run-piper-server.sh" echo " 4. OpenWakeWord: ./run-openwakeword.sh" echo "" echo "🎉 agent-cli has been installed and is ready to use!" 
diff --git a/scripts/start-all-services.sh b/scripts/start-all-services.sh index e11fc5a8..e679eede 100755 --- a/scripts/start-all-services.sh +++ b/scripts/start-all-services.sh @@ -40,7 +40,7 @@ layout { pane { name "Piper" cwd "$SCRIPTS_DIR" - command "./run-piper.sh" + command "./run-piper-server.sh" } pane { name "OpenWakeWord" diff --git a/tests/agents/test_fix_my_text.py b/tests/agents/test_fix_my_text.py index 83cb628b..74d7610e 100644 --- a/tests/agents/test_fix_my_text.py +++ b/tests/agents/test_fix_my_text.py @@ -111,7 +111,7 @@ async def test_process_text_integration(mock_create_llm_agent: MagicMock) -> Non provider_cfg = config.ProviderSelection( llm_provider="local", asr_provider="local", - tts_provider="local", + tts_provider="piper", ) ollama_cfg = config.Ollama(llm_ollama_model="test-model", llm_ollama_host="test") openai_llm_cfg = config.OpenAILLM(llm_openai_model="gpt-4o-mini", openai_api_key=None) @@ -166,7 +166,7 @@ async def test_autocorrect_command_with_text( provider_cfg = config.ProviderSelection( llm_provider="local", asr_provider="local", - tts_provider="local", + tts_provider="piper", ) ollama_cfg = config.Ollama( llm_ollama_model="qwen3:4b", @@ -227,7 +227,7 @@ async def test_autocorrect_command_from_clipboard( provider_cfg = config.ProviderSelection( llm_provider="local", asr_provider="local", - tts_provider="local", + tts_provider="piper", ) ollama_cfg = config.Ollama( llm_ollama_model="qwen3:4b", @@ -280,7 +280,7 @@ async def test_async_autocorrect_no_text( provider_cfg = config.ProviderSelection( llm_provider="local", asr_provider="local", - tts_provider="local", + tts_provider="piper", ) ollama_cfg = config.Ollama(llm_ollama_model="test", llm_ollama_host="test") openai_llm_cfg = config.OpenAILLM(llm_openai_model="gpt-4o-mini", openai_api_key=None) @@ -313,7 +313,7 @@ async def test_async_autocorrect_error(mock_process_text: AsyncMock): provider_cfg = config.ProviderSelection( llm_provider="local", asr_provider="local", - tts_provider="local", + tts_provider="piper", ) ollama_cfg = config.Ollama(llm_ollama_model="test", llm_ollama_host="test") openai_llm_cfg = config.OpenAILLM(llm_openai_model="gpt-4o-mini", openai_api_key=None) diff --git a/tests/agents/test_interactive.py b/tests/agents/test_interactive.py index 1fb022d1..da77a893 100644 --- a/tests/agents/test_interactive.py +++ b/tests/agents/test_interactive.py @@ -88,7 +88,7 @@ async def test_async_main_list_devices(tmp_path: Path) -> None: provider_cfg = config.ProviderSelection( asr_provider="local", llm_provider="local", - tts_provider="local", + tts_provider="piper", ) history_cfg = config.History(history_dir=tmp_path) audio_in_cfg = config.AudioInput() @@ -108,6 +108,9 @@ async def test_async_main_list_devices(tmp_path: Path) -> None: tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + ) with ( patch("agent_cli.agents.chat.pyaudio_context"), @@ -130,6 +133,7 @@ async def test_async_main_list_devices(tmp_path: Path) -> None: wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, ) mock_setup_devices.assert_called_once() @@ -167,6 +171,9 @@ async def test_async_main_list_output_devices(tmp_path: Path) -> None: tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + ) with ( patch("agent_cli.agents.chat.pyaudio_context"), @@ -189,6 +196,7 
@@ async def test_async_main_list_output_devices(tmp_path: Path) -> None: wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, ) mock_setup_devices.assert_called_once() @@ -204,7 +212,7 @@ async def test_async_main_full_loop(tmp_path: Path) -> None: log_file=None, list_devices=False, quiet=False, - clipboard=False, + tts_provider="piper", ) provider_cfg = config.ProviderSelection( asr_provider="local", @@ -233,6 +241,9 @@ async def test_async_main_full_loop(tmp_path: Path) -> None: tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + ) with ( patch("agent_cli.agents.chat.pyaudio_context"), @@ -272,6 +283,7 @@ async def test_async_main_full_loop(tmp_path: Path) -> None: wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, ) # Verify that the core functions were called @@ -286,6 +298,7 @@ async def test_async_main_full_loop(tmp_path: Path) -> None: wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, save_file=None, quiet=False, logger=mock_tts.call_args.kwargs["logger"], diff --git a/tests/agents/test_interactive_extra.py b/tests/agents/test_interactive_extra.py index 18b5b83a..9e37cfc1 100644 --- a/tests/agents/test_interactive_extra.py +++ b/tests/agents/test_interactive_extra.py @@ -24,7 +24,7 @@ async def test_handle_conversation_turn_no_llm_response(): provider_cfg = config.ProviderSelection( asr_provider="local", llm_provider="local", - tts_provider="local", + tts_provider="piper", ) history_cfg = config.History() audio_in_cfg = config.AudioInput() @@ -44,6 +44,9 @@ async def test_handle_conversation_turn_no_llm_response(): tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + ) mock_live = MagicMock() with ( @@ -73,6 +76,7 @@ async def test_handle_conversation_turn_no_llm_response(): wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, live=mock_live, ) mock_create_transcriber.assert_called_once() @@ -92,7 +96,7 @@ async def test_handle_conversation_turn_no_instruction(): provider_cfg = config.ProviderSelection( asr_provider="local", llm_provider="local", - tts_provider="local", + tts_provider="piper", ) history_cfg = config.History() audio_in_cfg = config.AudioInput() @@ -112,6 +116,9 @@ async def test_handle_conversation_turn_no_instruction(): tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + ) mock_live = MagicMock() with patch("agent_cli.agents.chat.asr.create_transcriber") as mock_create_transcriber: @@ -134,6 +141,7 @@ async def test_handle_conversation_turn_no_instruction(): wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, live=mock_live, ) mock_create_transcriber.assert_called_once() @@ -192,7 +200,7 @@ async def test_async_main_exception_handling(): provider_cfg = config.ProviderSelection( asr_provider="local", llm_provider="local", - tts_provider="local", + tts_provider="piper", ) history_cfg = config.History() audio_in_cfg = config.AudioInput() @@ -212,6 +220,9 @@ async def test_async_main_exception_handling(): 
tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + ) with ( patch("agent_cli.agents.chat.pyaudio_context", side_effect=Exception("Test error")), @@ -232,5 +243,6 @@ async def test_async_main_exception_handling(): wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, ) mock_console.print_exception.assert_called_once() diff --git a/tests/agents/test_speak.py b/tests/agents/test_speak.py index cb250320..2cf200cc 100644 --- a/tests/agents/test_speak.py +++ b/tests/agents/test_speak.py @@ -19,7 +19,7 @@ async def test_async_main_with_text(): """Test the _async_main function with text provided.""" general_cfg = config.General(log_level="INFO", quiet=True) provider_cfg = config.ProviderSelection( - tts_provider="local", + tts_provider="piper", llm_provider="local", asr_provider="local", ) @@ -31,6 +31,9 @@ async def test_async_main_with_text(): tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + ) with ( patch("agent_cli.agents.speak.pyaudio_context"), @@ -51,6 +54,7 @@ async def test_async_main_with_text(): wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, ) mock_handle_tts.assert_called_once() @@ -60,7 +64,7 @@ async def test_async_main_no_devices(): """Test the _async_main function when no devices are found.""" general_cfg = config.General(log_level="INFO", quiet=True) provider_cfg = config.ProviderSelection( - tts_provider="local", + tts_provider="piper", llm_provider="local", asr_provider="local", ) @@ -72,6 +76,9 @@ async def test_async_main_no_devices(): tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + ) with ( patch("agent_cli.agents.speak.pyaudio_context"), @@ -92,6 +99,7 @@ async def test_async_main_no_devices(): wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, ) mock_setup.assert_called_once() mock_handle_tts.assert_not_called() diff --git a/tests/agents/test_speak_e2e.py b/tests/agents/test_speak_e2e.py index eb59ac31..34c7f6d2 100644 --- a/tests/agents/test_speak_e2e.py +++ b/tests/agents/test_speak_e2e.py @@ -60,6 +60,9 @@ async def test_speak_e2e( tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + ) await _async_main( general_cfg=general_cfg, @@ -69,6 +72,7 @@ async def test_speak_e2e( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, ) # Verify that the audio was "played" diff --git a/tests/agents/test_transcribe.py b/tests/agents/test_transcribe.py index 2dc4fe25..8c214c77 100644 --- a/tests/agents/test_transcribe.py +++ b/tests/agents/test_transcribe.py @@ -48,7 +48,7 @@ async def test_transcribe_main_llm_enabled( provider_cfg = config.ProviderSelection( asr_provider="local", llm_provider="local", - tts_provider="local", + tts_provider="piper", ) general_cfg = config.General( log_level="INFO", @@ -118,7 +118,7 @@ async def test_transcribe_main( provider_cfg = config.ProviderSelection( asr_provider="local", llm_provider="local", - tts_provider="local", + tts_provider="piper", ) general_cfg = 
config.General( log_level="INFO", @@ -241,7 +241,7 @@ async def test_transcribe_with_logging( provider_cfg = config.ProviderSelection( asr_provider="local", llm_provider="local", - tts_provider="local", + tts_provider="piper", ) general_cfg = config.General( log_level="INFO", diff --git a/tests/agents/test_transcribe_e2e.py b/tests/agents/test_transcribe_e2e.py index 7f4c9121..bb2d23f5 100644 --- a/tests/agents/test_transcribe_e2e.py +++ b/tests/agents/test_transcribe_e2e.py @@ -46,7 +46,7 @@ async def test_transcribe_e2e( provider_cfg = config.ProviderSelection( asr_provider="local", llm_provider="local", - tts_provider="local", + tts_provider="piper", ) general_cfg = config.General( log_level="INFO", diff --git a/tests/agents/test_tts_common.py b/tests/agents/test_tts_common.py index 919b0a27..e601f829 100644 --- a/tests/agents/test_tts_common.py +++ b/tests/agents/test_tts_common.py @@ -21,7 +21,7 @@ async def test_handle_tts_playback(mock_speak_text: AsyncMock) -> None: mock_speak_text.return_value = b"audio data" mock_live = MagicMock() provider_cfg = config.ProviderSelection( - tts_provider="local", + tts_provider="piper", asr_provider="local", llm_provider="local", ) @@ -37,6 +37,15 @@ async def test_handle_tts_playback(mock_speak_text: AsyncMock) -> None: tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + tts_piper_voice=None, + tts_piper_speaker=None, + tts_piper_speaker_id=None, + tts_piper_length_scale=1.0, + tts_piper_noise_scale=None, + tts_piper_noise_w_scale=None, + ) await handle_tts_playback( text="hello", @@ -45,6 +54,7 @@ async def test_handle_tts_playback(mock_speak_text: AsyncMock) -> None: wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, save_file=None, quiet=False, logger=MagicMock(), @@ -59,6 +69,7 @@ async def test_handle_tts_playback(mock_speak_text: AsyncMock) -> None: wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, logger=mock_speak_text.call_args.kwargs["logger"], quiet=False, play_audio_flag=True, @@ -79,7 +90,7 @@ async def test_handle_tts_playback_with_save_file( mock_live = MagicMock() provider_cfg = config.ProviderSelection( - tts_provider="local", + tts_provider="piper", asr_provider="local", llm_provider="local", ) @@ -95,6 +106,15 @@ async def test_handle_tts_playback_with_save_file( tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + tts_piper_voice=None, + tts_piper_speaker=None, + tts_piper_speaker_id=None, + tts_piper_length_scale=1.0, + tts_piper_noise_scale=None, + tts_piper_noise_w_scale=None, + ) await handle_tts_playback( text="hello", @@ -103,6 +123,7 @@ async def test_handle_tts_playback_with_save_file( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, save_file=save_file, quiet=False, logger=MagicMock(), @@ -122,7 +143,7 @@ async def test_handle_tts_playback_no_audio(mock_speak_text: AsyncMock) -> None: mock_speak_text.return_value = None mock_live = MagicMock() provider_cfg = config.ProviderSelection( - tts_provider="local", + tts_provider="piper", asr_provider="local", llm_provider="local", ) @@ -138,6 +159,15 @@ async def test_handle_tts_playback_no_audio(mock_speak_text: AsyncMock) -> None: tts_kokoro_voice="alloy", 
tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + tts_piper_voice=None, + tts_piper_speaker=None, + tts_piper_speaker_id=None, + tts_piper_length_scale=1.0, + tts_piper_noise_scale=None, + tts_piper_noise_w_scale=None, + ) await handle_tts_playback( text="hello", @@ -146,6 +176,7 @@ async def test_handle_tts_playback_no_audio(mock_speak_text: AsyncMock) -> None: wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, save_file=None, quiet=False, logger=MagicMock(), diff --git a/tests/agents/test_tts_common_extra.py b/tests/agents/test_tts_common_extra.py index d9739247..2ab35589 100644 --- a/tests/agents/test_tts_common_extra.py +++ b/tests/agents/test_tts_common_extra.py @@ -35,7 +35,7 @@ async def test_handle_tts_playback_os_error(mock_speak_text: AsyncMock) -> None: mock_live = MagicMock() provider_cfg = config.ProviderSelection( - tts_provider="local", + tts_provider="piper", asr_provider="local", llm_provider="local", ) @@ -47,6 +47,15 @@ async def test_handle_tts_playback_os_error(mock_speak_text: AsyncMock) -> None: tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + tts_piper_voice=None, + tts_piper_speaker=None, + tts_piper_speaker_id=None, + tts_piper_length_scale=1.0, + tts_piper_noise_scale=None, + tts_piper_noise_w_scale=None, + ) result = await handle_tts_playback( text="hello", @@ -55,6 +64,7 @@ async def test_handle_tts_playback_os_error(mock_speak_text: AsyncMock) -> None: wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, save_file=None, quiet=False, logger=MagicMock(), diff --git a/tests/agents/test_voice_agent_common.py b/tests/agents/test_voice_agent_common.py index 389305d0..aadc00d2 100644 --- a/tests/agents/test_voice_agent_common.py +++ b/tests/agents/test_voice_agent_common.py @@ -22,7 +22,7 @@ async def test_get_instruction_from_audio(mock_create_transcriber: MagicMock) -> provider_cfg = config.ProviderSelection( asr_provider="local", llm_provider="local", - tts_provider="local", + tts_provider="piper", ) audio_in_cfg = config.AudioInput(input_device_index=1) wyoming_asr_cfg = config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=1234) @@ -53,7 +53,7 @@ async def test_get_instruction_from_audio_error(mock_create_transcriber: MagicMo provider_cfg = config.ProviderSelection( asr_provider="local", llm_provider="local", - tts_provider="local", + tts_provider="piper", ) audio_in_cfg = config.AudioInput(input_device_index=1) wyoming_asr_cfg = config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=1234) @@ -92,7 +92,7 @@ async def test_process_instruction_and_respond( ) provider_cfg = config.ProviderSelection( llm_provider="local", - tts_provider="local", + tts_provider="piper", asr_provider="local", ) ollama_cfg = config.Ollama(llm_ollama_model="test-model", llm_ollama_host="localhost") @@ -113,6 +113,9 @@ async def test_process_instruction_and_respond( tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + ) with ( patch("agent_cli.agents.autocorrect.pyperclip.copy"), @@ -130,6 +133,7 @@ async def test_process_instruction_and_respond( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + 
piper_tts_cfg=piper_tts_cfg, system_prompt="system prompt", agent_instructions="agent instructions", live=MagicMock(), diff --git a/tests/agents/test_voice_edit_e2e.py b/tests/agents/test_voice_edit_e2e.py index 905d13cf..99dae3fa 100644 --- a/tests/agents/test_voice_edit_e2e.py +++ b/tests/agents/test_voice_edit_e2e.py @@ -28,12 +28,13 @@ def get_configs() -> tuple[ config.WyomingTTS, config.OpenAITTS, config.KokoroTTS, + config.PiperTTS, ]: """Get all the necessary configs for the e2e test.""" provider_cfg = config.ProviderSelection( asr_provider="local", llm_provider="local", - tts_provider="local", + tts_provider="piper", ) general_cfg = config.General( log_level="INFO", @@ -63,6 +64,9 @@ def get_configs() -> tuple[ tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + ) return ( provider_cfg, general_cfg, @@ -76,6 +80,7 @@ def get_configs() -> tuple[ wyoming_tts_cfg, openai_tts_cfg, kokoro_tts_cfg, + piper_tts_cfg, ) @@ -116,6 +121,7 @@ async def test_voice_edit_e2e( wyoming_tts_cfg, openai_tts_cfg, kokoro_tts_cfg, + piper_tts_cfg, ) = get_configs() # This test focuses on the main loop, so we stop it after one run @@ -137,6 +143,7 @@ async def test_voice_edit_e2e( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, ) # Assertions @@ -164,6 +171,7 @@ async def test_voice_edit_e2e( wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, system_prompt=SYSTEM_PROMPT, agent_instructions=AGENT_INSTRUCTIONS, live=ANY, diff --git a/tests/test_llm.py b/tests/test_llm.py index 877a6653..13986495 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -16,7 +16,7 @@ def test_create_llm_agent_openai_no_key(): provider_cfg = config.ProviderSelection( llm_provider="openai", asr_provider="local", - tts_provider="local", + tts_provider="piper", ) ollama_cfg = config.Ollama( llm_ollama_model="test-model", @@ -38,7 +38,7 @@ def test_create_llm_agent(monkeypatch: pytest.MonkeyPatch) -> None: provider_cfg = config.ProviderSelection( llm_provider="local", asr_provider="local", - tts_provider="local", + tts_provider="piper", ) ollama_cfg = config.Ollama( llm_ollama_model="test-model", @@ -66,7 +66,7 @@ async def test_get_llm_response(mock_create_llm_agent: MagicMock) -> None: provider_cfg = config.ProviderSelection( llm_provider="local", asr_provider="local", - tts_provider="local", + tts_provider="piper", ) ollama_cfg = config.Ollama(llm_ollama_model="test", llm_ollama_host="test") openai_llm_cfg = config.OpenAILLM(llm_openai_model="gpt-4o-mini", openai_api_key=None) @@ -103,7 +103,7 @@ async def test_get_llm_response_error(mock_create_llm_agent: MagicMock) -> None: provider_cfg = config.ProviderSelection( llm_provider="local", asr_provider="local", - tts_provider="local", + tts_provider="piper", ) ollama_cfg = config.Ollama(llm_ollama_model="test", llm_ollama_host="test") openai_llm_cfg = config.OpenAILLM(llm_openai_model="gpt-4o-mini", openai_api_key=None) @@ -140,7 +140,7 @@ async def test_get_llm_response_error_exit(mock_create_llm_agent: MagicMock): provider_cfg = config.ProviderSelection( llm_provider="local", asr_provider="local", - tts_provider="local", + tts_provider="piper", ) ollama_cfg = config.Ollama(llm_ollama_model="test", llm_ollama_host="test") openai_llm_cfg = config.OpenAILLM(llm_openai_model="gpt-4o-mini", openai_api_key=None) @@ -175,7 +175,7 @@ 
def test_process_and_update_clipboard( provider_cfg = config.ProviderSelection( llm_provider="local", asr_provider="local", - tts_provider="local", + tts_provider="piper", ) ollama_cfg = config.Ollama(llm_ollama_model="test", llm_ollama_host="test") openai_llm_cfg = config.OpenAILLM(llm_openai_model="gpt-4o-mini", openai_api_key=None) diff --git a/tests/test_llm_gemini.py b/tests/test_llm_gemini.py index a406bb51..d7d68431 100644 --- a/tests/test_llm_gemini.py +++ b/tests/test_llm_gemini.py @@ -14,7 +14,7 @@ async def test_create_llm_agent_with_gemini() -> None: provider_cfg = config.ProviderSelection( llm_provider="gemini", asr_provider="local", - tts_provider="local", + tts_provider="piper", ) gemini_cfg = config.GeminiLLM( llm_gemini_model="gemini-1.5-flash", diff --git a/tests/test_services.py b/tests/test_services.py index f017c913..d81db3da 100644 --- a/tests/test_services.py +++ b/tests/test_services.py @@ -70,7 +70,7 @@ def test_create_transcriber_wyoming() -> None: provider_cfg = config.ProviderSelection( asr_provider="local", llm_provider="local", - tts_provider="local", + tts_provider="piper", ) audio_input_cfg = config.AudioInput() wyoming_asr_cfg = config.WyomingASR(asr_wyoming_ip="localhost", asr_wyoming_port=1234) @@ -103,12 +103,16 @@ def test_create_synthesizer_wyoming() -> None: tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + ) synthesizer = tts.create_synthesizer( provider_cfg, audio_output_cfg, wyoming_tts_cfg, openai_tts_cfg, kokoro_tts_cfg, + piper_tts_cfg, ) assert synthesizer.func == tts._synthesize_speech_wyoming # type: ignore[attr-defined] @@ -131,12 +135,16 @@ def test_create_synthesizer_kokoro() -> None: tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + ) synthesizer = tts.create_synthesizer( provider_cfg, audio_output_cfg, wyoming_tts_cfg, openai_tts_cfg, kokoro_tts_cfg, + piper_tts_cfg, ) assert synthesizer.func == tts._synthesize_speech_kokoro # type: ignore[attr-defined] diff --git a/tests/test_tts.py b/tests/test_tts.py index 40668d95..293e085c 100644 --- a/tests/test_tts.py +++ b/tests/test_tts.py @@ -21,7 +21,7 @@ async def test_speak_text(mock_create_synthesizer: MagicMock) -> None: provider_cfg = config.ProviderSelection( asr_provider="local", llm_provider="local", - tts_provider="local", + tts_provider="piper", ) audio_output_cfg = config.AudioOutput(enable_tts=True) wyoming_tts_cfg = config.WyomingTTS( @@ -34,6 +34,9 @@ async def test_speak_text(mock_create_synthesizer: MagicMock) -> None: tts_kokoro_voice="alloy", tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + ) audio_data = await _speak_text( text="hello", @@ -42,6 +45,7 @@ async def test_speak_text(mock_create_synthesizer: MagicMock) -> None: wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, logger=MagicMock(), play_audio_flag=False, live=MagicMock(), @@ -123,7 +127,7 @@ def test_create_synthesizer_disabled(): provider_cfg = config.ProviderSelection( asr_provider="local", llm_provider="local", - tts_provider="local", + tts_provider="piper", ) audio_output_cfg = config.AudioOutput(enable_tts=False) wyoming_tts_cfg = config.WyomingTTS( @@ -136,6 +140,9 @@ def test_create_synthesizer_disabled(): tts_kokoro_voice="alloy", 
tts_kokoro_host="http://localhost:8000/v1", ) + piper_tts_cfg = config.PiperTTS( + tts_piper_host="http://localhost:5000", + ) synthesizer = create_synthesizer( provider_cfg=provider_cfg, @@ -143,6 +150,7 @@ def test_create_synthesizer_disabled(): wyoming_tts_cfg=wyoming_tts_cfg, openai_tts_cfg=openai_tts_cfg, kokoro_tts_cfg=kokoro_tts_cfg, + piper_tts_cfg=piper_tts_cfg, ) assert synthesizer.__name__ == "_dummy_synthesizer" From 614be146169eb0895ea9c6f2419eaa1833170913 Mon Sep 17 00:00:00 2001 From: Bas Nijholt Date: Wed, 30 Jul 2025 12:15:59 -0700 Subject: [PATCH 2/2] Update README --- README.md | 1208 ++++++++++++++++++++++++++--------------------------- 1 file changed, 598 insertions(+), 610 deletions(-) diff --git a/README.md b/README.md index b683f81b..9ea060b1 100644 --- a/README.md +++ b/README.md @@ -372,52 +372,49 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett Correct text from clipboard using a local or remote LLM. -╭─ General Options ────────────────────────────────────────────────────────────╮ -│ text [TEXT] The text to correct. If not provided, reads from │ -│ clipboard. │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Provider Selection ─────────────────────────────────────────────────────────╮ -│ --llm-provider TEXT The LLM provider to use ('local' for Ollama, │ -│ 'openai', 'gemini'). │ -│ [default: local] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration: Ollama (local) ──────────────────────────────────────────╮ -│ --llm-ollama-model TEXT The Ollama model to use. Default is │ -│ qwen3:4b. │ -│ [default: qwen3:4b] │ -│ --llm-ollama-host TEXT The Ollama server host. Default is │ -│ http://localhost:11434. │ -│ [default: http://localhost:11434] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration: OpenAI ──────────────────────────────────────────────────╮ -│ --llm-openai-model TEXT The OpenAI model to use for LLM tasks. │ -│ [default: gpt-4o-mini] │ -│ --openai-api-key TEXT Your OpenAI API key. Can also be set with │ -│ the OPENAI_API_KEY environment variable. │ -│ [env var: OPENAI_API_KEY] │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration: Gemini ──────────────────────────────────────────────────╮ -│ --llm-gemini-model TEXT The Gemini model to use for LLM tasks. │ -│ [default: gemini-2.5-flash] │ -│ --gemini-api-key TEXT Your Gemini API key. Can also be set with │ -│ the GEMINI_API_KEY environment variable. │ -│ [env var: GEMINI_API_KEY] │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ General Options ────────────────────────────────────────────────────────────╮ -│ --log-level TEXT Set logging level. [default: WARNING] │ -│ --log-file TEXT Path to a file to write logs to. [default: None] │ -│ --quiet -q Suppress console output from rich. │ -│ --config TEXT Path to a TOML configuration file. │ -│ [default: None] │ -│ --print-args Print the command line arguments, including │ -│ variables taken from the configuration file. 
│ -╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ General Options ──────────────────────────────────────────────────────────────────────╮ +│ text [TEXT] The text to correct. If not provided, reads from clipboard. │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ──────────────────────────────────────────────────────────────────────────────╮ +│ --help Show this message and exit. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Provider Selection ───────────────────────────────────────────────────────────────────╮ +│ --llm-provider TEXT The LLM provider to use ('local' for Ollama, 'openai', │ +│ 'gemini'). │ +│ [default: local] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration: Ollama (local) ────────────────────────────────────────────────────╮ +│ --llm-ollama-model TEXT The Ollama model to use. Default is qwen3:4b. │ +│ [default: qwen3:4b] │ +│ --llm-ollama-host TEXT The Ollama server host. Default is │ +│ http://localhost:11434. │ +│ [default: http://localhost:11434] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration: OpenAI ────────────────────────────────────────────────────────────╮ +│ --llm-openai-model TEXT The OpenAI model to use for LLM tasks. │ +│ [default: gpt-4o-mini] │ +│ --openai-api-key TEXT Your OpenAI API key. Can also be set with the │ +│ OPENAI_API_KEY environment variable. │ +│ [env var: OPENAI_API_KEY] │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration: Gemini ────────────────────────────────────────────────────────────╮ +│ --llm-gemini-model TEXT The Gemini model to use for LLM tasks. │ +│ [default: gemini-2.5-flash] │ +│ --gemini-api-key TEXT Your Gemini API key. Can also be set with the │ +│ GEMINI_API_KEY environment variable. │ +│ [env var: GEMINI_API_KEY] │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ General Options ──────────────────────────────────────────────────────────────────────╮ +│ --log-level TEXT Set logging level. [default: WARNING] │ +│ --log-file TEXT Path to a file to write logs to. [default: None] │ +│ --quiet -q Suppress console output from rich. │ +│ --config TEXT Path to a TOML configuration file. [default: None] │ +│ --print-args Print the command line arguments, including variables │ +│ taken from the configuration file. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -463,98 +460,84 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett Wyoming ASR Client for streaming microphone audio to a transcription server. -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --extra-instructions TEXT Additional instructions for the LLM to │ -│ process the transcription. │ -│ [default: None] │ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Provider Selection ─────────────────────────────────────────────────────────╮ -│ --asr-provider TEXT The ASR provider to use ('local' for Wyoming, │ -│ 'openai'). │ -│ [default: local] │ -│ --llm-provider TEXT The LLM provider to use ('local' for Ollama, │ -│ 'openai', 'gemini'). 
│ -│ [default: local] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ ASR (Audio) Configuration ──────────────────────────────────────────────────╮ -│ --input-device-index INTEGER Index of the PyAudio input device to │ -│ use. │ -│ [default: None] │ -│ --input-device-name TEXT Device name keywords for partial │ -│ matching. │ -│ [default: None] │ -│ --list-devices List available audio input and output │ -│ devices and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ ASR (Audio) Configuration: Wyoming (local) ─────────────────────────────────╮ -│ --asr-wyoming-ip TEXT Wyoming ASR server IP address. │ -│ [default: localhost] │ -│ --asr-wyoming-port INTEGER Wyoming ASR server port. [default: 10300] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ ASR (Audio) Configuration: OpenAI ──────────────────────────────────────────╮ -│ --asr-openai-model TEXT The OpenAI model to use for ASR │ -│ (transcription). │ -│ [default: whisper-1] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration: Ollama (local) ──────────────────────────────────────────╮ -│ --llm-ollama-model TEXT The Ollama model to use. Default is │ -│ qwen3:4b. │ -│ [default: qwen3:4b] │ -│ --llm-ollama-host TEXT The Ollama server host. Default is │ -│ http://localhost:11434. │ -│ [default: http://localhost:11434] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration: OpenAI ──────────────────────────────────────────────────╮ -│ --llm-openai-model TEXT The OpenAI model to use for LLM tasks. │ -│ [default: gpt-4o-mini] │ -│ --openai-api-key TEXT Your OpenAI API key. Can also be set with │ -│ the OPENAI_API_KEY environment variable. │ -│ [env var: OPENAI_API_KEY] │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration: Gemini ──────────────────────────────────────────────────╮ -│ --llm-gemini-model TEXT The Gemini model to use for LLM tasks. │ -│ [default: gemini-2.5-flash] │ -│ --gemini-api-key TEXT Your Gemini API key. Can also be set with │ -│ the GEMINI_API_KEY environment variable. │ -│ [env var: GEMINI_API_KEY] │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration ──────────────────────────────────────────────────────────╮ -│ --llm --no-llm Use an LLM to process the transcript. │ -│ [default: no-llm] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Process Management Options ─────────────────────────────────────────────────╮ -│ --stop Stop any running background process. │ -│ --status Check if a background process is running. │ -│ --toggle Toggle the background process on/off. If the process is │ -│ running, it will be stopped. If the process is not │ -│ running, it will be started. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ General Options ────────────────────────────────────────────────────────────╮ -│ --clipboard --no-clipboard Copy result to clipboard. │ -│ [default: clipboard] │ -│ --log-level TEXT Set logging level. │ -│ [default: WARNING] │ -│ --log-file TEXT Path to a file to write │ -│ logs to. │ -│ [default: None] │ -│ --quiet -q Suppress console output │ -│ from rich. │ -│ --config TEXT Path to a TOML │ -│ configuration file. 
│ -│ [default: None] │ -│ --print-args Print the command line │ -│ arguments, including │ -│ variables taken from the │ -│ configuration file. │ -│ --transcription-log PATH Path to log transcription │ -│ results with timestamps, │ -│ hostname, model, and raw │ -│ output. │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ──────────────────────────────────────────────────────────────────────────────╮ +│ --extra-instructions TEXT Additional instructions for the LLM to process the │ +│ transcription. │ +│ [default: None] │ +│ --help Show this message and exit. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Provider Selection ───────────────────────────────────────────────────────────────────╮ +│ --asr-provider TEXT The ASR provider to use ('local' for Wyoming, 'openai'). │ +│ [default: local] │ +│ --llm-provider TEXT The LLM provider to use ('local' for Ollama, 'openai', │ +│ 'gemini'). │ +│ [default: local] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ ASR (Audio) Configuration ────────────────────────────────────────────────────────────╮ +│ --input-device-index INTEGER Index of the PyAudio input device to use. │ +│ [default: None] │ +│ --input-device-name TEXT Device name keywords for partial matching. │ +│ [default: None] │ +│ --list-devices List available audio input and output devices and │ +│ exit. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ ASR (Audio) Configuration: Wyoming (local) ───────────────────────────────────────────╮ +│ --asr-wyoming-ip TEXT Wyoming ASR server IP address. [default: localhost] │ +│ --asr-wyoming-port INTEGER Wyoming ASR server port. [default: 10300] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ ASR (Audio) Configuration: OpenAI ────────────────────────────────────────────────────╮ +│ --asr-openai-model TEXT The OpenAI model to use for ASR (transcription). │ +│ [default: whisper-1] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration: Ollama (local) ────────────────────────────────────────────────────╮ +│ --llm-ollama-model TEXT The Ollama model to use. Default is qwen3:4b. │ +│ [default: qwen3:4b] │ +│ --llm-ollama-host TEXT The Ollama server host. Default is │ +│ http://localhost:11434. │ +│ [default: http://localhost:11434] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration: OpenAI ────────────────────────────────────────────────────────────╮ +│ --llm-openai-model TEXT The OpenAI model to use for LLM tasks. │ +│ [default: gpt-4o-mini] │ +│ --openai-api-key TEXT Your OpenAI API key. Can also be set with the │ +│ OPENAI_API_KEY environment variable. │ +│ [env var: OPENAI_API_KEY] │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration: Gemini ────────────────────────────────────────────────────────────╮ +│ --llm-gemini-model TEXT The Gemini model to use for LLM tasks. │ +│ [default: gemini-2.5-flash] │ +│ --gemini-api-key TEXT Your Gemini API key. Can also be set with the │ +│ GEMINI_API_KEY environment variable. 
│ +│ [env var: GEMINI_API_KEY] │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration ────────────────────────────────────────────────────────────────────╮ +│ --llm --no-llm Use an LLM to process the transcript. [default: no-llm] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Process Management Options ───────────────────────────────────────────────────────────╮ +│ --stop Stop any running background process. │ +│ --status Check if a background process is running. │ +│ --toggle Toggle the background process on/off. If the process is running, it │ +│ will be stopped. If the process is not running, it will be started. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ General Options ──────────────────────────────────────────────────────────────────────╮ +│ --clipboard --no-clipboard Copy result to clipboard. │ +│ [default: clipboard] │ +│ --log-level TEXT Set logging level. [default: WARNING] │ +│ --log-file TEXT Path to a file to write logs to. │ +│ [default: None] │ +│ --quiet -q Suppress console output from rich. │ +│ --config TEXT Path to a TOML configuration file. │ +│ [default: None] │ +│ --print-args Print the command line arguments, │ +│ including variables taken from the │ +│ configuration file. │ +│ --transcription-log PATH Path to log transcription results │ +│ with timestamps, hostname, model, and │ +│ raw output. │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -610,83 +593,90 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett Usage: agent-cli speak [OPTIONS] [TEXT] - Convert text to speech using Piper, Wyoming, OpenAI, or Kokoro TTS server. - - -╭─ General Options ────────────────────────────────────────────────────────────╮ -│ text [TEXT] Text to speak. Reads from clipboard if not provided. │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Provider Selection ─────────────────────────────────────────────────────────╮ -│ --tts-provider TEXT The TTS provider to use ('local' for Wyoming, │ -│ 'openai', 'kokoro'). │ -│ [default: local] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration ─────────────────────────────────────────╮ -│ --output-device-index INTEGER Index of the PyAudio output device to │ -│ use for TTS. │ -│ [default: None] │ -│ --output-device-name TEXT Output device name keywords for │ -│ partial matching. │ -│ [default: None] │ -│ --tts-speed FLOAT Speech speed multiplier (1.0 = normal, │ -│ 2.0 = twice as fast, 0.5 = half │ -│ speed). │ -│ [default: 1.0] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration: Wyoming (local) ────────────────────────╮ -│ --tts-wyoming-ip TEXT Wyoming TTS server IP address. │ -│ [default: localhost] │ -│ --tts-wyoming-port INTEGER Wyoming TTS server port. │ -│ [default: 10200] │ -│ --tts-wyoming-voice TEXT Voice name to use for Wyoming TTS │ -│ (e.g., 'en_US-lessac-medium'). 
│ -│ [default: None] │ -│ --tts-wyoming-language TEXT Language for Wyoming TTS (e.g., │ -│ 'en_US'). │ -│ [default: None] │ -│ --tts-wyoming-speaker TEXT Speaker name for Wyoming TTS voice. │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration: OpenAI ─────────────────────────────────╮ -│ --tts-openai-model TEXT The OpenAI model to use for TTS. │ -│ [default: tts-1] │ -│ --tts-openai-voice TEXT The voice to use for OpenAI TTS. │ -│ [default: alloy] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration: Kokoro ─────────────────────────────────╮ -│ --tts-kokoro-model TEXT The Kokoro model to use for TTS. │ -│ [default: kokoro] │ -│ --tts-kokoro-voice TEXT The voice to use for Kokoro TTS. │ -│ [default: af_sky] │ -│ --tts-kokoro-host TEXT The base URL for the Kokoro API. │ -│ [default: http://localhost:8880/v1] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ ASR (Audio) Configuration ──────────────────────────────────────────────────╮ -│ --list-devices List available audio input and output devices and │ -│ exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ General Options ────────────────────────────────────────────────────────────╮ -│ --save-file PATH Save TTS response audio to WAV file. │ -│ [default: None] │ -│ --log-level TEXT Set logging level. [default: WARNING] │ -│ --log-file TEXT Path to a file to write logs to. [default: None] │ -│ --quiet -q Suppress console output from rich. │ -│ --config TEXT Path to a TOML configuration file. │ -│ [default: None] │ -│ --print-args Print the command line arguments, including │ -│ variables taken from the configuration file. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Process Management Options ─────────────────────────────────────────────────╮ -│ --stop Stop any running background process. │ -│ --status Check if a background process is running. │ -│ --toggle Toggle the background process on/off. If the process is │ -│ running, it will be stopped. If the process is not │ -│ running, it will be started. │ -╰──────────────────────────────────────────────────────────────────────────────╯ + Convert text to speech using Wyoming or OpenAI TTS server. + + +╭─ General Options ──────────────────────────────────────────────────────────────────────╮ +│ text [TEXT] Text to speak. Reads from clipboard if not provided. │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ──────────────────────────────────────────────────────────────────────────────╮ +│ --help Show this message and exit. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Provider Selection ───────────────────────────────────────────────────────────────────╮ +│ --tts-provider TEXT The TTS provider to use ('local' for Wyoming, 'openai', │ +│ 'kokoro'). │ +│ [default: local] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration ───────────────────────────────────────────────────╮ +│ --output-device-index INTEGER Index of the PyAudio output device to use for │ +│ TTS. │ +│ [default: None] │ +│ --output-device-name TEXT Output device name keywords for partial │ +│ matching. 
│ +│ [default: None] │ +│ --tts-speed FLOAT Speech speed multiplier (1.0 = normal, 2.0 = │ +│ twice as fast, 0.5 = half speed). │ +│ [default: 1.0] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: Wyoming (local) ──────────────────────────────────╮ +│ --tts-wyoming-ip TEXT Wyoming TTS server IP address. │ +│ [default: localhost] │ +│ --tts-wyoming-port INTEGER Wyoming TTS server port. [default: 10200] │ +│ --tts-wyoming-voice TEXT Voice name to use for Wyoming TTS (e.g., │ +│ 'en_US-lessac-medium'). │ +│ [default: None] │ +│ --tts-wyoming-language TEXT Language for Wyoming TTS (e.g., 'en_US'). │ +│ [default: None] │ +│ --tts-wyoming-speaker TEXT Speaker name for Wyoming TTS voice. │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: OpenAI ───────────────────────────────────────────╮ +│ --tts-openai-model TEXT The OpenAI model to use for TTS. [default: tts-1] │ +│ --tts-openai-voice TEXT The voice to use for OpenAI TTS. [default: alloy] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: Kokoro ───────────────────────────────────────────╮ +│ --tts-kokoro-model TEXT The Kokoro model to use for TTS. [default: kokoro] │ +│ --tts-kokoro-voice TEXT The voice to use for Kokoro TTS. [default: af_sky] │ +│ --tts-kokoro-host TEXT The base URL for the Kokoro API. │ +│ [default: http://localhost:8880/v1] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: Piper ────────────────────────────────────────────╮ +│ --tts-piper-host TEXT The base URL for the Piper HTTP server. │ +│ [default: http://localhost:10200] │ +│ --tts-piper-voice TEXT The voice to use for Piper TTS (optional). │ +│ [default: None] │ +│ --tts-piper-speaker TEXT The speaker to use for multi-speaker voices │ +│ (optional). │ +│ [default: None] │ +│ --tts-piper-speaker-id INTEGER The speaker ID to use for multi-speaker │ +│ voices (optional, overrides speaker). │ +│ [default: None] │ +│ --tts-piper-length-scale FLOAT Speaking speed (1.0 = normal speed). │ +│ [default: 1.0] │ +│ --tts-piper-noise-scale FLOAT Speaking variability (optional). │ +│ [default: None] │ +│ --tts-piper-noise-w-scale FLOAT Phoneme width variability (optional). │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ ASR (Audio) Configuration ────────────────────────────────────────────────────────────╮ +│ --list-devices List available audio input and output devices and exit. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ General Options ──────────────────────────────────────────────────────────────────────╮ +│ --save-file PATH Save TTS response audio to WAV file. [default: None] │ +│ --log-level TEXT Set logging level. [default: WARNING] │ +│ --log-file TEXT Path to a file to write logs to. [default: None] │ +│ --quiet -q Suppress console output from rich. │ +│ --config TEXT Path to a TOML configuration file. [default: None] │ +│ --print-args Print the command line arguments, including variables │ +│ taken from the configuration file. 
│ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Process Management Options ───────────────────────────────────────────────────────────╮ +│ --stop Stop any running background process. │ +│ --status Check if a background process is running. │ +│ --toggle Toggle the background process on/off. If the process is running, it │ +│ will be stopped. If the process is not running, it will be started. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -727,140 +717,139 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett Usage: agent-cli voice-edit [OPTIONS] - Interact with clipboard text via a voice command using local or remote - services. - - Usage: - Run in foreground: agent-cli voice-edit --input-device-index 1 - Run - in background: agent-cli voice-edit --input-device-index 1 & - Check status: - agent-cli voice-edit --status - Stop background process: agent-cli voice-edit - --stop - List output devices: agent-cli voice-edit --list-output-devices - - Save TTS to file: agent-cli voice-edit --tts --save-file response.wav - -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Provider Selection ─────────────────────────────────────────────────────────╮ -│ --asr-provider TEXT The ASR provider to use ('local' for Wyoming, │ -│ 'openai'). │ -│ [default: local] │ -│ --llm-provider TEXT The LLM provider to use ('local' for Ollama, │ -│ 'openai', 'gemini'). │ -│ [default: local] │ -│ --tts-provider TEXT The TTS provider to use ('local' for Wyoming, │ -│ 'openai', 'kokoro'). │ -│ [default: local] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ ASR (Audio) Configuration ──────────────────────────────────────────────────╮ -│ --input-device-index INTEGER Index of the PyAudio input device to │ -│ use. │ -│ [default: None] │ -│ --input-device-name TEXT Device name keywords for partial │ -│ matching. │ -│ [default: None] │ -│ --list-devices List available audio input and output │ -│ devices and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ ASR (Audio) Configuration: Wyoming (local) ─────────────────────────────────╮ -│ --asr-wyoming-ip TEXT Wyoming ASR server IP address. │ -│ [default: localhost] │ -│ --asr-wyoming-port INTEGER Wyoming ASR server port. [default: 10300] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ ASR (Audio) Configuration: OpenAI ──────────────────────────────────────────╮ -│ --asr-openai-model TEXT The OpenAI model to use for ASR │ -│ (transcription). │ -│ [default: whisper-1] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration: Ollama (local) ──────────────────────────────────────────╮ -│ --llm-ollama-model TEXT The Ollama model to use. Default is │ -│ qwen3:4b. │ -│ [default: qwen3:4b] │ -│ --llm-ollama-host TEXT The Ollama server host. Default is │ -│ http://localhost:11434. │ -│ [default: http://localhost:11434] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration: OpenAI ──────────────────────────────────────────────────╮ -│ --llm-openai-model TEXT The OpenAI model to use for LLM tasks. │ -│ [default: gpt-4o-mini] │ -│ --openai-api-key TEXT Your OpenAI API key. 
Can also be set with │ -│ the OPENAI_API_KEY environment variable. │ -│ [env var: OPENAI_API_KEY] │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration: Gemini ──────────────────────────────────────────────────╮ -│ --llm-gemini-model TEXT The Gemini model to use for LLM tasks. │ -│ [default: gemini-2.5-flash] │ -│ --gemini-api-key TEXT Your Gemini API key. Can also be set with │ -│ the GEMINI_API_KEY environment variable. │ -│ [env var: GEMINI_API_KEY] │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration ─────────────────────────────────────────╮ -│ --tts --no-tts Enable text-to-speech for │ -│ responses. │ -│ [default: no-tts] │ -│ --output-device-index INTEGER Index of the PyAudio output │ -│ device to use for TTS. │ -│ [default: None] │ -│ --output-device-name TEXT Output device name keywords │ -│ for partial matching. │ -│ [default: None] │ -│ --tts-speed FLOAT Speech speed multiplier (1.0 = │ -│ normal, 2.0 = twice as fast, │ -│ 0.5 = half speed). │ -│ [default: 1.0] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration: Wyoming (local) ────────────────────────╮ -│ --tts-wyoming-ip TEXT Wyoming TTS server IP address. │ -│ [default: localhost] │ -│ --tts-wyoming-port INTEGER Wyoming TTS server port. │ -│ [default: 10200] │ -│ --tts-wyoming-voice TEXT Voice name to use for Wyoming TTS │ -│ (e.g., 'en_US-lessac-medium'). │ -│ [default: None] │ -│ --tts-wyoming-language TEXT Language for Wyoming TTS (e.g., │ -│ 'en_US'). │ -│ [default: None] │ -│ --tts-wyoming-speaker TEXT Speaker name for Wyoming TTS voice. │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration: OpenAI ─────────────────────────────────╮ -│ --tts-openai-model TEXT The OpenAI model to use for TTS. │ -│ [default: tts-1] │ -│ --tts-openai-voice TEXT The voice to use for OpenAI TTS. │ -│ [default: alloy] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration: Kokoro ─────────────────────────────────╮ -│ --tts-kokoro-model TEXT The Kokoro model to use for TTS. │ -│ [default: kokoro] │ -│ --tts-kokoro-voice TEXT The voice to use for Kokoro TTS. │ -│ [default: af_sky] │ -│ --tts-kokoro-host TEXT The base URL for the Kokoro API. │ -│ [default: http://localhost:8880/v1] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Process Management Options ─────────────────────────────────────────────────╮ -│ --stop Stop any running background process. │ -│ --status Check if a background process is running. │ -│ --toggle Toggle the background process on/off. If the process is │ -│ running, it will be stopped. If the process is not │ -│ running, it will be started. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ General Options ────────────────────────────────────────────────────────────╮ -│ --save-file PATH Save TTS response audio to WAV │ -│ file. │ -│ [default: None] │ -│ --clipboard --no-clipboard Copy result to clipboard. │ -│ [default: clipboard] │ -│ --log-level TEXT Set logging level. │ -│ [default: WARNING] │ -│ --log-file TEXT Path to a file to write logs to. │ -│ [default: None] │ -│ --quiet -q Suppress console output from rich. 
│ -│ --config TEXT Path to a TOML configuration file. │ -│ [default: None] │ -│ --print-args Print the command line arguments, │ -│ including variables taken from the │ -│ configuration file. │ -╰──────────────────────────────────────────────────────────────────────────────╯ + Interact with clipboard text via a voice command using local or remote services. + + Usage: - Run in foreground: agent-cli voice-edit --input-device-index 1 - Run in + background: agent-cli voice-edit --input-device-index 1 & - Check status: agent-cli + voice-edit --status - Stop background process: agent-cli voice-edit --stop - List output + devices: agent-cli voice-edit --list-output-devices - Save TTS to file: agent-cli + voice-edit --tts --save-file response.wav + +╭─ Options ──────────────────────────────────────────────────────────────────────────────╮ +│ --help Show this message and exit. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Provider Selection ───────────────────────────────────────────────────────────────────╮ +│ --asr-provider TEXT The ASR provider to use ('local' for Wyoming, 'openai'). │ +│ [default: local] │ +│ --llm-provider TEXT The LLM provider to use ('local' for Ollama, 'openai', │ +│ 'gemini'). │ +│ [default: local] │ +│ --tts-provider TEXT The TTS provider to use ('local' for Wyoming, 'openai', │ +│ 'kokoro'). │ +│ [default: local] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ ASR (Audio) Configuration ────────────────────────────────────────────────────────────╮ +│ --input-device-index INTEGER Index of the PyAudio input device to use. │ +│ [default: None] │ +│ --input-device-name TEXT Device name keywords for partial matching. │ +│ [default: None] │ +│ --list-devices List available audio input and output devices and │ +│ exit. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ ASR (Audio) Configuration: Wyoming (local) ───────────────────────────────────────────╮ +│ --asr-wyoming-ip TEXT Wyoming ASR server IP address. [default: localhost] │ +│ --asr-wyoming-port INTEGER Wyoming ASR server port. [default: 10300] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ ASR (Audio) Configuration: OpenAI ────────────────────────────────────────────────────╮ +│ --asr-openai-model TEXT The OpenAI model to use for ASR (transcription). │ +│ [default: whisper-1] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration: Ollama (local) ────────────────────────────────────────────────────╮ +│ --llm-ollama-model TEXT The Ollama model to use. Default is qwen3:4b. │ +│ [default: qwen3:4b] │ +│ --llm-ollama-host TEXT The Ollama server host. Default is │ +│ http://localhost:11434. │ +│ [default: http://localhost:11434] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration: OpenAI ────────────────────────────────────────────────────────────╮ +│ --llm-openai-model TEXT The OpenAI model to use for LLM tasks. │ +│ [default: gpt-4o-mini] │ +│ --openai-api-key TEXT Your OpenAI API key. Can also be set with the │ +│ OPENAI_API_KEY environment variable. 
│ +│ [env var: OPENAI_API_KEY] │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration: Gemini ────────────────────────────────────────────────────────────╮ +│ --llm-gemini-model TEXT The Gemini model to use for LLM tasks. │ +│ [default: gemini-2.5-flash] │ +│ --gemini-api-key TEXT Your Gemini API key. Can also be set with the │ +│ GEMINI_API_KEY environment variable. │ +│ [env var: GEMINI_API_KEY] │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration ───────────────────────────────────────────────────╮ +│ --tts --no-tts Enable text-to-speech for responses. │ +│ [default: no-tts] │ +│ --output-device-index INTEGER Index of the PyAudio output device to │ +│ use for TTS. │ +│ [default: None] │ +│ --output-device-name TEXT Output device name keywords for partial │ +│ matching. │ +│ [default: None] │ +│ --tts-speed FLOAT Speech speed multiplier (1.0 = normal, │ +│ 2.0 = twice as fast, 0.5 = half speed). │ +│ [default: 1.0] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: Wyoming (local) ──────────────────────────────────╮ +│ --tts-wyoming-ip TEXT Wyoming TTS server IP address. │ +│ [default: localhost] │ +│ --tts-wyoming-port INTEGER Wyoming TTS server port. [default: 10200] │ +│ --tts-wyoming-voice TEXT Voice name to use for Wyoming TTS (e.g., │ +│ 'en_US-lessac-medium'). │ +│ [default: None] │ +│ --tts-wyoming-language TEXT Language for Wyoming TTS (e.g., 'en_US'). │ +│ [default: None] │ +│ --tts-wyoming-speaker TEXT Speaker name for Wyoming TTS voice. │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: OpenAI ───────────────────────────────────────────╮ +│ --tts-openai-model TEXT The OpenAI model to use for TTS. [default: tts-1] │ +│ --tts-openai-voice TEXT The voice to use for OpenAI TTS. [default: alloy] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: Kokoro ───────────────────────────────────────────╮ +│ --tts-kokoro-model TEXT The Kokoro model to use for TTS. [default: kokoro] │ +│ --tts-kokoro-voice TEXT The voice to use for Kokoro TTS. [default: af_sky] │ +│ --tts-kokoro-host TEXT The base URL for the Kokoro API. │ +│ [default: http://localhost:8880/v1] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: Piper ────────────────────────────────────────────╮ +│ --tts-piper-host TEXT The base URL for the Piper HTTP server. │ +│ [default: http://localhost:10200] │ +│ --tts-piper-voice TEXT The voice to use for Piper TTS (optional). │ +│ [default: None] │ +│ --tts-piper-speaker TEXT The speaker to use for multi-speaker voices │ +│ (optional). │ +│ [default: None] │ +│ --tts-piper-speaker-id INTEGER The speaker ID to use for multi-speaker │ +│ voices (optional, overrides speaker). │ +│ [default: None] │ +│ --tts-piper-length-scale FLOAT Speaking speed (1.0 = normal speed). │ +│ [default: 1.0] │ +│ --tts-piper-noise-scale FLOAT Speaking variability (optional). │ +│ [default: None] │ +│ --tts-piper-noise-w-scale FLOAT Phoneme width variability (optional). 
│ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Process Management Options ───────────────────────────────────────────────────────────╮ +│ --stop Stop any running background process. │ +│ --status Check if a background process is running. │ +│ --toggle Toggle the background process on/off. If the process is running, it │ +│ will be stopped. If the process is not running, it will be started. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ General Options ──────────────────────────────────────────────────────────────────────╮ +│ --save-file PATH Save TTS response audio to WAV file. │ +│ [default: None] │ +│ --clipboard --no-clipboard Copy result to clipboard. │ +│ [default: clipboard] │ +│ --log-level TEXT Set logging level. [default: WARNING] │ +│ --log-file TEXT Path to a file to write logs to. │ +│ [default: None] │ +│ --quiet -q Suppress console output from rich. │ +│ --config TEXT Path to a TOML configuration file. │ +│ [default: None] │ +│ --print-args Print the command line arguments, including │ +│ variables taken from the configuration file. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -907,140 +896,139 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett Wake word-based voice assistant using local or remote services. -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Provider Selection ─────────────────────────────────────────────────────────╮ -│ --asr-provider TEXT The ASR provider to use ('local' for Wyoming, │ -│ 'openai'). │ -│ [default: local] │ -│ --llm-provider TEXT The LLM provider to use ('local' for Ollama, │ -│ 'openai', 'gemini'). │ -│ [default: local] │ -│ --tts-provider TEXT The TTS provider to use ('local' for Wyoming, │ -│ 'openai', 'kokoro'). │ -│ [default: local] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Wake Word Options ──────────────────────────────────────────────────────────╮ -│ --wake-server-ip TEXT Wyoming wake word server IP address. │ -│ [default: localhost] │ -│ --wake-server-port INTEGER Wyoming wake word server port. │ -│ [default: 10400] │ -│ --wake-word TEXT Name of wake word to detect (e.g., │ -│ 'ok_nabu', 'hey_jarvis'). │ -│ [default: ok_nabu] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ ASR (Audio) Configuration ──────────────────────────────────────────────────╮ -│ --input-device-index INTEGER Index of the PyAudio input device to │ -│ use. │ -│ [default: None] │ -│ --input-device-name TEXT Device name keywords for partial │ -│ matching. │ -│ [default: None] │ -│ --list-devices List available audio input and output │ -│ devices and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ ASR (Audio) Configuration: Wyoming (local) ─────────────────────────────────╮ -│ --asr-wyoming-ip TEXT Wyoming ASR server IP address. │ -│ [default: localhost] │ -│ --asr-wyoming-port INTEGER Wyoming ASR server port. [default: 10300] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ ASR (Audio) Configuration: OpenAI ──────────────────────────────────────────╮ -│ --asr-openai-model TEXT The OpenAI model to use for ASR │ -│ (transcription). 
│ -│ [default: whisper-1] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration: Ollama (local) ──────────────────────────────────────────╮ -│ --llm-ollama-model TEXT The Ollama model to use. Default is │ -│ qwen3:4b. │ -│ [default: qwen3:4b] │ -│ --llm-ollama-host TEXT The Ollama server host. Default is │ -│ http://localhost:11434. │ -│ [default: http://localhost:11434] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration: OpenAI ──────────────────────────────────────────────────╮ -│ --llm-openai-model TEXT The OpenAI model to use for LLM tasks. │ -│ [default: gpt-4o-mini] │ -│ --openai-api-key TEXT Your OpenAI API key. Can also be set with │ -│ the OPENAI_API_KEY environment variable. │ -│ [env var: OPENAI_API_KEY] │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration: Gemini ──────────────────────────────────────────────────╮ -│ --llm-gemini-model TEXT The Gemini model to use for LLM tasks. │ -│ [default: gemini-2.5-flash] │ -│ --gemini-api-key TEXT Your Gemini API key. Can also be set with │ -│ the GEMINI_API_KEY environment variable. │ -│ [env var: GEMINI_API_KEY] │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration ─────────────────────────────────────────╮ -│ --tts --no-tts Enable text-to-speech for │ -│ responses. │ -│ [default: no-tts] │ -│ --output-device-index INTEGER Index of the PyAudio output │ -│ device to use for TTS. │ -│ [default: None] │ -│ --output-device-name TEXT Output device name keywords │ -│ for partial matching. │ -│ [default: None] │ -│ --tts-speed FLOAT Speech speed multiplier (1.0 = │ -│ normal, 2.0 = twice as fast, │ -│ 0.5 = half speed). │ -│ [default: 1.0] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration: Wyoming (local) ────────────────────────╮ -│ --tts-wyoming-ip TEXT Wyoming TTS server IP address. │ -│ [default: localhost] │ -│ --tts-wyoming-port INTEGER Wyoming TTS server port. │ -│ [default: 10200] │ -│ --tts-wyoming-voice TEXT Voice name to use for Wyoming TTS │ -│ (e.g., 'en_US-lessac-medium'). │ -│ [default: None] │ -│ --tts-wyoming-language TEXT Language for Wyoming TTS (e.g., │ -│ 'en_US'). │ -│ [default: None] │ -│ --tts-wyoming-speaker TEXT Speaker name for Wyoming TTS voice. │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration: OpenAI ─────────────────────────────────╮ -│ --tts-openai-model TEXT The OpenAI model to use for TTS. │ -│ [default: tts-1] │ -│ --tts-openai-voice TEXT The voice to use for OpenAI TTS. │ -│ [default: alloy] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration: Kokoro ─────────────────────────────────╮ -│ --tts-kokoro-model TEXT The Kokoro model to use for TTS. │ -│ [default: kokoro] │ -│ --tts-kokoro-voice TEXT The voice to use for Kokoro TTS. │ -│ [default: af_sky] │ -│ --tts-kokoro-host TEXT The base URL for the Kokoro API. │ -│ [default: http://localhost:8880/v1] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Process Management Options ─────────────────────────────────────────────────╮ -│ --stop Stop any running background process. 
│ -│ --status Check if a background process is running. │ -│ --toggle Toggle the background process on/off. If the process is │ -│ running, it will be stopped. If the process is not │ -│ running, it will be started. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ General Options ────────────────────────────────────────────────────────────╮ -│ --save-file PATH Save TTS response audio to WAV │ -│ file. │ -│ [default: None] │ -│ --clipboard --no-clipboard Copy result to clipboard. │ -│ [default: clipboard] │ -│ --log-level TEXT Set logging level. │ -│ [default: WARNING] │ -│ --log-file TEXT Path to a file to write logs to. │ -│ [default: None] │ -│ --quiet -q Suppress console output from rich. │ -│ --config TEXT Path to a TOML configuration file. │ -│ [default: None] │ -│ --print-args Print the command line arguments, │ -│ including variables taken from the │ -│ configuration file. │ -╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ──────────────────────────────────────────────────────────────────────────────╮ +│ --help Show this message and exit. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Provider Selection ───────────────────────────────────────────────────────────────────╮ +│ --asr-provider TEXT The ASR provider to use ('local' for Wyoming, 'openai'). │ +│ [default: local] │ +│ --llm-provider TEXT The LLM provider to use ('local' for Ollama, 'openai', │ +│ 'gemini'). │ +│ [default: local] │ +│ --tts-provider TEXT The TTS provider to use ('local' for Wyoming, 'openai', │ +│ 'kokoro'). │ +│ [default: local] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Wake Word Options ────────────────────────────────────────────────────────────────────╮ +│ --wake-server-ip TEXT Wyoming wake word server IP address. │ +│ [default: localhost] │ +│ --wake-server-port INTEGER Wyoming wake word server port. [default: 10400] │ +│ --wake-word TEXT Name of wake word to detect (e.g., 'ok_nabu', │ +│ 'hey_jarvis'). │ +│ [default: ok_nabu] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ ASR (Audio) Configuration ────────────────────────────────────────────────────────────╮ +│ --input-device-index INTEGER Index of the PyAudio input device to use. │ +│ [default: None] │ +│ --input-device-name TEXT Device name keywords for partial matching. │ +│ [default: None] │ +│ --list-devices List available audio input and output devices and │ +│ exit. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ ASR (Audio) Configuration: Wyoming (local) ───────────────────────────────────────────╮ +│ --asr-wyoming-ip TEXT Wyoming ASR server IP address. [default: localhost] │ +│ --asr-wyoming-port INTEGER Wyoming ASR server port. [default: 10300] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ ASR (Audio) Configuration: OpenAI ────────────────────────────────────────────────────╮ +│ --asr-openai-model TEXT The OpenAI model to use for ASR (transcription). │ +│ [default: whisper-1] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration: Ollama (local) ────────────────────────────────────────────────────╮ +│ --llm-ollama-model TEXT The Ollama model to use. Default is qwen3:4b. │ +│ [default: qwen3:4b] │ +│ --llm-ollama-host TEXT The Ollama server host. 
Default is │ +│ http://localhost:11434. │ +│ [default: http://localhost:11434] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration: OpenAI ────────────────────────────────────────────────────────────╮ +│ --llm-openai-model TEXT The OpenAI model to use for LLM tasks. │ +│ [default: gpt-4o-mini] │ +│ --openai-api-key TEXT Your OpenAI API key. Can also be set with the │ +│ OPENAI_API_KEY environment variable. │ +│ [env var: OPENAI_API_KEY] │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration: Gemini ────────────────────────────────────────────────────────────╮ +│ --llm-gemini-model TEXT The Gemini model to use for LLM tasks. │ +│ [default: gemini-2.5-flash] │ +│ --gemini-api-key TEXT Your Gemini API key. Can also be set with the │ +│ GEMINI_API_KEY environment variable. │ +│ [env var: GEMINI_API_KEY] │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration ───────────────────────────────────────────────────╮ +│ --tts --no-tts Enable text-to-speech for responses. │ +│ [default: no-tts] │ +│ --output-device-index INTEGER Index of the PyAudio output device to │ +│ use for TTS. │ +│ [default: None] │ +│ --output-device-name TEXT Output device name keywords for partial │ +│ matching. │ +│ [default: None] │ +│ --tts-speed FLOAT Speech speed multiplier (1.0 = normal, │ +│ 2.0 = twice as fast, 0.5 = half speed). │ +│ [default: 1.0] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: Wyoming (local) ──────────────────────────────────╮ +│ --tts-wyoming-ip TEXT Wyoming TTS server IP address. │ +│ [default: localhost] │ +│ --tts-wyoming-port INTEGER Wyoming TTS server port. [default: 10200] │ +│ --tts-wyoming-voice TEXT Voice name to use for Wyoming TTS (e.g., │ +│ 'en_US-lessac-medium'). │ +│ [default: None] │ +│ --tts-wyoming-language TEXT Language for Wyoming TTS (e.g., 'en_US'). │ +│ [default: None] │ +│ --tts-wyoming-speaker TEXT Speaker name for Wyoming TTS voice. │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: OpenAI ───────────────────────────────────────────╮ +│ --tts-openai-model TEXT The OpenAI model to use for TTS. [default: tts-1] │ +│ --tts-openai-voice TEXT The voice to use for OpenAI TTS. [default: alloy] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: Kokoro ───────────────────────────────────────────╮ +│ --tts-kokoro-model TEXT The Kokoro model to use for TTS. [default: kokoro] │ +│ --tts-kokoro-voice TEXT The voice to use for Kokoro TTS. [default: af_sky] │ +│ --tts-kokoro-host TEXT The base URL for the Kokoro API. │ +│ [default: http://localhost:8880/v1] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: Piper ────────────────────────────────────────────╮ +│ --tts-piper-host TEXT The base URL for the Piper HTTP server. │ +│ [default: http://localhost:10200] │ +│ --tts-piper-voice TEXT The voice to use for Piper TTS (optional). │ +│ [default: None] │ +│ --tts-piper-speaker TEXT The speaker to use for multi-speaker voices │ +│ (optional). 
│ +│ [default: None] │ +│ --tts-piper-speaker-id INTEGER The speaker ID to use for multi-speaker │ +│ voices (optional, overrides speaker). │ +│ [default: None] │ +│ --tts-piper-length-scale FLOAT Speaking speed (1.0 = normal speed). │ +│ [default: 1.0] │ +│ --tts-piper-noise-scale FLOAT Speaking variability (optional). │ +│ [default: None] │ +│ --tts-piper-noise-w-scale FLOAT Phoneme width variability (optional). │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Process Management Options ───────────────────────────────────────────────────────────╮ +│ --stop Stop any running background process. │ +│ --status Check if a background process is running. │ +│ --toggle Toggle the background process on/off. If the process is running, it │ +│ will be stopped. If the process is not running, it will be started. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ General Options ──────────────────────────────────────────────────────────────────────╮ +│ --save-file PATH Save TTS response audio to WAV file. │ +│ [default: None] │ +│ --clipboard --no-clipboard Copy result to clipboard. │ +│ [default: clipboard] │ +│ --log-level TEXT Set logging level. [default: WARNING] │ +│ --log-file TEXT Path to a file to write logs to. │ +│ [default: None] │ +│ --quiet -q Suppress console output from rich. │ +│ --config TEXT Path to a TOML configuration file. │ +│ [default: None] │ +│ --print-args Print the command line arguments, including │ +│ variables taken from the configuration file. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ ``` @@ -1094,133 +1082,133 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett An chat agent that you can talk to. -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --help Show this message and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Provider Selection ─────────────────────────────────────────────────────────╮ -│ --asr-provider TEXT The ASR provider to use ('local' for Wyoming, │ -│ 'openai'). │ -│ [default: local] │ -│ --llm-provider TEXT The LLM provider to use ('local' for Ollama, │ -│ 'openai', 'gemini'). │ -│ [default: local] │ -│ --tts-provider TEXT The TTS provider to use ('local' for Wyoming, │ -│ 'openai', 'kokoro'). │ -│ [default: local] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ ASR (Audio) Configuration ──────────────────────────────────────────────────╮ -│ --input-device-index INTEGER Index of the PyAudio input device to │ -│ use. │ -│ [default: None] │ -│ --input-device-name TEXT Device name keywords for partial │ -│ matching. │ -│ [default: None] │ -│ --list-devices List available audio input and output │ -│ devices and exit. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ ASR (Audio) Configuration: Wyoming (local) ─────────────────────────────────╮ -│ --asr-wyoming-ip TEXT Wyoming ASR server IP address. │ -│ [default: localhost] │ -│ --asr-wyoming-port INTEGER Wyoming ASR server port. [default: 10300] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ ASR (Audio) Configuration: OpenAI ──────────────────────────────────────────╮ -│ --asr-openai-model TEXT The OpenAI model to use for ASR │ -│ (transcription). 
│ -│ [default: whisper-1] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration: Ollama (local) ──────────────────────────────────────────╮ -│ --llm-ollama-model TEXT The Ollama model to use. Default is │ -│ qwen3:4b. │ -│ [default: qwen3:4b] │ -│ --llm-ollama-host TEXT The Ollama server host. Default is │ -│ http://localhost:11434. │ -│ [default: http://localhost:11434] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration: OpenAI ──────────────────────────────────────────────────╮ -│ --llm-openai-model TEXT The OpenAI model to use for LLM tasks. │ -│ [default: gpt-4o-mini] │ -│ --openai-api-key TEXT Your OpenAI API key. Can also be set with │ -│ the OPENAI_API_KEY environment variable. │ -│ [env var: OPENAI_API_KEY] │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ LLM Configuration: Gemini ──────────────────────────────────────────────────╮ -│ --llm-gemini-model TEXT The Gemini model to use for LLM tasks. │ -│ [default: gemini-2.5-flash] │ -│ --gemini-api-key TEXT Your Gemini API key. Can also be set with │ -│ the GEMINI_API_KEY environment variable. │ -│ [env var: GEMINI_API_KEY] │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration ─────────────────────────────────────────╮ -│ --tts --no-tts Enable text-to-speech for │ -│ responses. │ -│ [default: no-tts] │ -│ --output-device-index INTEGER Index of the PyAudio output │ -│ device to use for TTS. │ -│ [default: None] │ -│ --output-device-name TEXT Output device name keywords │ -│ for partial matching. │ -│ [default: None] │ -│ --tts-speed FLOAT Speech speed multiplier (1.0 = │ -│ normal, 2.0 = twice as fast, │ -│ 0.5 = half speed). │ -│ [default: 1.0] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration: Wyoming (local) ────────────────────────╮ -│ --tts-wyoming-ip TEXT Wyoming TTS server IP address. │ -│ [default: localhost] │ -│ --tts-wyoming-port INTEGER Wyoming TTS server port. │ -│ [default: 10200] │ -│ --tts-wyoming-voice TEXT Voice name to use for Wyoming TTS │ -│ (e.g., 'en_US-lessac-medium'). │ -│ [default: None] │ -│ --tts-wyoming-language TEXT Language for Wyoming TTS (e.g., │ -│ 'en_US'). │ -│ [default: None] │ -│ --tts-wyoming-speaker TEXT Speaker name for Wyoming TTS voice. │ -│ [default: None] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration: OpenAI ─────────────────────────────────╮ -│ --tts-openai-model TEXT The OpenAI model to use for TTS. │ -│ [default: tts-1] │ -│ --tts-openai-voice TEXT The voice to use for OpenAI TTS. │ -│ [default: alloy] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ TTS (Text-to-Speech) Configuration: Kokoro ─────────────────────────────────╮ -│ --tts-kokoro-model TEXT The Kokoro model to use for TTS. │ -│ [default: kokoro] │ -│ --tts-kokoro-voice TEXT The voice to use for Kokoro TTS. │ -│ [default: af_sky] │ -│ --tts-kokoro-host TEXT The base URL for the Kokoro API. │ -│ [default: http://localhost:8880/v1] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Process Management Options ─────────────────────────────────────────────────╮ -│ --stop Stop any running background process. 
│ -│ --status Check if a background process is running. │ -│ --toggle Toggle the background process on/off. If the process is │ -│ running, it will be stopped. If the process is not │ -│ running, it will be started. │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ History Options ────────────────────────────────────────────────────────────╮ -│ --history-dir PATH Directory to store conversation history. │ -│ [default: ~/.config/agent-cli/history] │ -│ --last-n-messages INTEGER Number of messages to include in the │ -│ conversation history. Set to 0 to disable │ -│ history. │ -│ [default: 50] │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ General Options ────────────────────────────────────────────────────────────╮ -│ --save-file PATH Save TTS response audio to WAV file. │ -│ [default: None] │ -│ --log-level TEXT Set logging level. [default: WARNING] │ -│ --log-file TEXT Path to a file to write logs to. [default: None] │ -│ --quiet -q Suppress console output from rich. │ -│ --config TEXT Path to a TOML configuration file. │ -│ [default: None] │ -│ --print-args Print the command line arguments, including │ -│ variables taken from the configuration file. │ -╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ──────────────────────────────────────────────────────────────────────────────╮ +│ --help Show this message and exit. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Provider Selection ───────────────────────────────────────────────────────────────────╮ +│ --asr-provider TEXT The ASR provider to use ('local' for Wyoming, 'openai'). │ +│ [default: local] │ +│ --llm-provider TEXT The LLM provider to use ('local' for Ollama, 'openai', │ +│ 'gemini'). │ +│ [default: local] │ +│ --tts-provider TEXT The TTS provider to use ('local' for Wyoming, 'openai', │ +│ 'kokoro'). │ +│ [default: local] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ ASR (Audio) Configuration ────────────────────────────────────────────────────────────╮ +│ --input-device-index INTEGER Index of the PyAudio input device to use. │ +│ [default: None] │ +│ --input-device-name TEXT Device name keywords for partial matching. │ +│ [default: None] │ +│ --list-devices List available audio input and output devices and │ +│ exit. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ ASR (Audio) Configuration: Wyoming (local) ───────────────────────────────────────────╮ +│ --asr-wyoming-ip TEXT Wyoming ASR server IP address. [default: localhost] │ +│ --asr-wyoming-port INTEGER Wyoming ASR server port. [default: 10300] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ ASR (Audio) Configuration: OpenAI ────────────────────────────────────────────────────╮ +│ --asr-openai-model TEXT The OpenAI model to use for ASR (transcription). │ +│ [default: whisper-1] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration: Ollama (local) ────────────────────────────────────────────────────╮ +│ --llm-ollama-model TEXT The Ollama model to use. Default is qwen3:4b. │ +│ [default: qwen3:4b] │ +│ --llm-ollama-host TEXT The Ollama server host. Default is │ +│ http://localhost:11434. 
│ +│ [default: http://localhost:11434] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration: OpenAI ────────────────────────────────────────────────────────────╮ +│ --llm-openai-model TEXT The OpenAI model to use for LLM tasks. │ +│ [default: gpt-4o-mini] │ +│ --openai-api-key TEXT Your OpenAI API key. Can also be set with the │ +│ OPENAI_API_KEY environment variable. │ +│ [env var: OPENAI_API_KEY] │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ LLM Configuration: Gemini ────────────────────────────────────────────────────────────╮ +│ --llm-gemini-model TEXT The Gemini model to use for LLM tasks. │ +│ [default: gemini-2.5-flash] │ +│ --gemini-api-key TEXT Your Gemini API key. Can also be set with the │ +│ GEMINI_API_KEY environment variable. │ +│ [env var: GEMINI_API_KEY] │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration ───────────────────────────────────────────────────╮ +│ --tts --no-tts Enable text-to-speech for responses. │ +│ [default: no-tts] │ +│ --output-device-index INTEGER Index of the PyAudio output device to │ +│ use for TTS. │ +│ [default: None] │ +│ --output-device-name TEXT Output device name keywords for partial │ +│ matching. │ +│ [default: None] │ +│ --tts-speed FLOAT Speech speed multiplier (1.0 = normal, │ +│ 2.0 = twice as fast, 0.5 = half speed). │ +│ [default: 1.0] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: Wyoming (local) ──────────────────────────────────╮ +│ --tts-wyoming-ip TEXT Wyoming TTS server IP address. │ +│ [default: localhost] │ +│ --tts-wyoming-port INTEGER Wyoming TTS server port. [default: 10200] │ +│ --tts-wyoming-voice TEXT Voice name to use for Wyoming TTS (e.g., │ +│ 'en_US-lessac-medium'). │ +│ [default: None] │ +│ --tts-wyoming-language TEXT Language for Wyoming TTS (e.g., 'en_US'). │ +│ [default: None] │ +│ --tts-wyoming-speaker TEXT Speaker name for Wyoming TTS voice. │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: OpenAI ───────────────────────────────────────────╮ +│ --tts-openai-model TEXT The OpenAI model to use for TTS. [default: tts-1] │ +│ --tts-openai-voice TEXT The voice to use for OpenAI TTS. [default: alloy] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: Kokoro ───────────────────────────────────────────╮ +│ --tts-kokoro-model TEXT The Kokoro model to use for TTS. [default: kokoro] │ +│ --tts-kokoro-voice TEXT The voice to use for Kokoro TTS. [default: af_sky] │ +│ --tts-kokoro-host TEXT The base URL for the Kokoro API. │ +│ [default: http://localhost:8880/v1] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ TTS (Text-to-Speech) Configuration: Piper ────────────────────────────────────────────╮ +│ --tts-piper-host TEXT The base URL for the Piper HTTP server. │ +│ [default: http://localhost:10200] │ +│ --tts-piper-voice TEXT The voice to use for Piper TTS (optional). │ +│ [default: None] │ +│ --tts-piper-speaker TEXT The speaker to use for multi-speaker voices │ +│ (optional). 
│ +│ [default: None] │ +│ --tts-piper-speaker-id INTEGER The speaker ID to use for multi-speaker │ +│ voices (optional, overrides speaker). │ +│ [default: None] │ +│ --tts-piper-length-scale FLOAT Speaking speed (1.0 = normal speed). │ +│ [default: 1.0] │ +│ --tts-piper-noise-scale FLOAT Speaking variability (optional). │ +│ [default: None] │ +│ --tts-piper-noise-w-scale FLOAT Phoneme width variability (optional). │ +│ [default: None] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Process Management Options ───────────────────────────────────────────────────────────╮ +│ --stop Stop any running background process. │ +│ --status Check if a background process is running. │ +│ --toggle Toggle the background process on/off. If the process is running, it │ +│ will be stopped. If the process is not running, it will be started. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ History Options ──────────────────────────────────────────────────────────────────────╮ +│ --history-dir PATH Directory to store conversation history. │ +│ [default: ~/.config/agent-cli/history] │ +│ --last-n-messages INTEGER Number of messages to include in the conversation │ +│ history. Set to 0 to disable history. │ +│ [default: 50] │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ General Options ──────────────────────────────────────────────────────────────────────╮ +│ --save-file PATH Save TTS response audio to WAV file. [default: None] │ +│ --log-level TEXT Set logging level. [default: WARNING] │ +│ --log-file TEXT Path to a file to write logs to. [default: None] │ +│ --quiet -q Suppress console output from rich. │ +│ --config TEXT Path to a TOML configuration file. [default: None] │ +│ --print-args Print the command line arguments, including variables │ +│ taken from the configuration file. │ +╰────────────────────────────────────────────────────────────────────────────────────────╯ ```
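The Piper-specific flags documented in the panels above are shared by every TTS-capable command (`speak`, `voice-edit`, `assistant`, `chat`). As a rough sketch of how they might be combined on the `speak` command: the voice name `en_US-lessac-medium`, the port in `--tts-piper-host`, the `1.2` length scale, and the output filename are illustrative placeholders rather than project defaults, and a Piper HTTP server is assumed to already be running at the given address.

```
# Illustrative only: host/port, voice, and tuning values are placeholders.
agent-cli speak \
  --tts-provider piper \
  --tts-piper-host http://localhost:5000 \
  --tts-piper-voice en_US-lessac-medium \
  --tts-piper-length-scale 1.2 \
  --save-file piper-demo.wav \
  "Testing the Piper HTTP backend."
```

The same `--tts-piper-*` options (speaker, speaker ID, noise scales) can be passed to the other commands when `--tts` is enabled, since they appear in each command's "TTS (Text-to-Speech) Configuration: Piper" panel.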