Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,6 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett
<!-- CODE:END -->
<!-- OUTPUT:START -->
<!-- ⚠️ This content is auto-generated by `markdown-code-runner`. -->

```yaml


Expand Down Expand Up @@ -370,7 +369,6 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett
<!-- CODE:END -->
<!-- OUTPUT:START -->
<!-- ⚠️ This content is auto-generated by `markdown-code-runner`. -->

```yaml


Expand Down Expand Up @@ -483,7 +481,6 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett
<!-- CODE:END -->
<!-- OUTPUT:START -->
<!-- ⚠️ This content is auto-generated by `markdown-code-runner`. -->

```yaml


Expand Down Expand Up @@ -607,7 +604,6 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett
<!-- CODE:END -->
<!-- OUTPUT:START -->
<!-- ⚠️ This content is auto-generated by `markdown-code-runner`. -->

```yaml


Expand Down Expand Up @@ -773,7 +769,6 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett
<!-- CODE:END -->
<!-- OUTPUT:START -->
<!-- ⚠️ This content is auto-generated by `markdown-code-runner`. -->

```yaml


Expand Down Expand Up @@ -949,7 +944,6 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett
<!-- CODE:END -->
<!-- OUTPUT:START -->
<!-- ⚠️ This content is auto-generated by `markdown-code-runner`. -->

```yaml


Expand Down
99 changes: 79 additions & 20 deletions agent_cli/services/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@
has_audiostretchy = importlib.util.find_spec("audiostretchy") is not None


# Audio format used for the Kokoro TTS stream, which is requested with
# response_format="pcm". The same three values are passed to
# setup_output_stream() for playback and to _create_wav_data() when the
# buffered PCM bytes are wrapped as WAV.
# Passed as `sample_rate` (presumably in Hz -- confirm against the server).
KOKORO_STREAM_RATE = 24000
# Passed as `sample_width`.
KOKORO_STREAM_WIDTH = 2 # Corresponds to pyaudio.paInt16
# Passed as `channels`.
KOKORO_STREAM_CHANNELS = 1


def get_synthesizer(
provider_config: config.ProviderSelection,
audio_output_config: config.AudioOutput,
Expand All @@ -56,11 +61,6 @@ def get_synthesizer(
openai_tts_config=openai_tts_config,
openai_llm_config=openai_llm_config,
)
if provider_config.tts_provider == "kokoro":
return partial(
_synthesize_speech_kokoro,
kokoro_tts_config=kokoro_tts_config,
)
return partial(_synthesize_speech_wyoming, wyoming_tts_config=wyoming_tts_config)


Expand Down Expand Up @@ -224,28 +224,75 @@ async def _synthesize_speech_openai(
)


async def _synthesize_speech_kokoro(
async def _stream_and_play_kokoro(
*,
text: str,
kokoro_tts_config: config.KokoroTTS,
audio_output_config: config.AudioOutput,
logger: logging.Logger,
**_kwargs: object,
play_audio_flag: bool,
quiet: bool = False,
stop_event: InteractiveStopEvent | None = None,
live: Live,
) -> bytes | None:
"""Synthesize speech from text using Kokoro TTS server."""
"""Stream and play audio from Kokoro TTS, returning the buffered WAV data."""
client = AsyncOpenAI(
api_key="not-needed",
base_url=kokoro_tts_config.kokoro_tts_host,
)
audio_buffer = io.BytesIO()

try:
client = AsyncOpenAI(
api_key="not-needed",
base_url=kokoro_tts_config.kokoro_tts_host,
)
response = await client.audio.speech.create(
model=kokoro_tts_config.kokoro_tts_model,
voice=kokoro_tts_config.kokoro_tts_voice,
input=text,
response_format="wav",
async with live_timer(live, "🔊 Synthesizing text", style="blue", quiet=quiet):
async with client.audio.speech.with_streaming_response.create(
model=kokoro_tts_config.kokoro_tts_model,
voice=kokoro_tts_config.kokoro_tts_voice,
input=text,
response_format="pcm",
) as response:
if play_audio_flag:
with pyaudio_context() as p:
stream_config = setup_output_stream(
audio_output_config.output_device_index,
sample_rate=KOKORO_STREAM_RATE,
sample_width=KOKORO_STREAM_WIDTH,
channels=KOKORO_STREAM_CHANNELS,
)
with open_pyaudio_stream(p, **stream_config) as stream:
logger.info("Starting Kokoro TTS stream playback.")
async for chunk in response.aiter_bytes(chunk_size=1024):
if stop_event and stop_event.is_set():
break
stream.write(chunk)
audio_buffer.write(chunk)
await asyncio.sleep(0)
else:
# Just buffer the data without playing
async for chunk in response.aiter_bytes():
audio_buffer.write(chunk)

if stop_event and stop_event.is_set():
logger.info("Audio playback interrupted")
if not quiet:
print_with_style("⏹️ Audio playback interrupted", style="yellow")
elif play_audio_flag and not quiet:
print_with_style("✅ Audio playback finished")

pcm_data = audio_buffer.getvalue()
if not pcm_data:
return None

return _create_wav_data(
pcm_data,
KOKORO_STREAM_RATE,
KOKORO_STREAM_WIDTH,
KOKORO_STREAM_CHANNELS,
)
return await response.aread()
except Exception:
logger.exception("Error during Kokoro speech synthesis")

except Exception as e:
logger.exception("Error during Kokoro speech synthesis or playback")
if not quiet:
print_error_message(f"Kokoro TTS error: {e}")
return None


Expand Down Expand Up @@ -376,6 +423,18 @@ async def _speak_text(
live: Live,
) -> bytes | None:
"""Synthesize and optionally play speech from text."""
if provider_config.tts_provider == "kokoro":
return await _stream_and_play_kokoro(
text=text,
kokoro_tts_config=kokoro_tts_config,
audio_output_config=audio_output_config,
logger=logger,
quiet=quiet,
play_audio_flag=play_audio_flag,
stop_event=stop_event,
live=live,
)

synthesizer = get_synthesizer(
provider_config,
audio_output_config,
Expand Down
33 changes: 0 additions & 33 deletions tests/test_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,39 +133,6 @@ def test_get_synthesizer_wyoming() -> None:
assert synthesizer.func == tts._synthesize_speech_wyoming # type: ignore[attr-defined]


def test_get_synthesizer_kokoro() -> None:
    """Selecting the "kokoro" TTS provider must yield the Kokoro synthesizer."""
    # The configs are built in the positional order get_synthesizer takes them.
    synthesizer_args = (
        config.ProviderSelection(
            asr_provider="local",
            llm_provider="local",
            tts_provider="kokoro",
        ),
        config.AudioOutput(enable_tts=True),
        config.WyomingTTS(
            wyoming_tts_ip="localhost",
            wyoming_tts_port=1234,
        ),
        config.OpenAITTS(openai_tts_model="tts-1", openai_tts_voice="alloy"),
        config.OpenAILLM(
            openai_llm_model="gpt-4o-mini",
            openai_api_key="test_api_key",
        ),
        config.KokoroTTS(
            kokoro_tts_model="tts-1",
            kokoro_tts_voice="alloy",
            kokoro_tts_host="http://localhost:8000/v1",
        ),
    )
    synthesizer = tts.get_synthesizer(*synthesizer_args)
    # The returned object exposes the wrapped provider function as `.func`.
    assert synthesizer.func == tts._synthesize_speech_kokoro  # type: ignore[attr-defined]


@pytest.mark.asyncio
async def test_transcribe_audio_openai_no_key():
"""Test that transcribe_audio_openai fails without an API key."""
Expand Down
Loading