basnijholt · basnijholt · Jul 10, 2025 · Jul 10, 2025 · Jul 10, 2025 · Jul 10, 2025
diff --git a/README.md b/README.md
@@ -289,7 +289,6 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett
 <!-- CODE:END -->
 <!-- OUTPUT:START -->
 <!-- ⚠️ This content is auto-generated by `markdown-code-runner`. -->
-
 ```yaml
 
 
@@ -370,7 +369,6 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett
 <!-- CODE:END -->
 <!-- OUTPUT:START -->
 <!-- ⚠️ This content is auto-generated by `markdown-code-runner`. -->
-
 ```yaml
 
 
@@ -483,7 +481,6 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett
 <!-- CODE:END -->
 <!-- OUTPUT:START -->
 <!-- ⚠️ This content is auto-generated by `markdown-code-runner`. -->
-
 ```yaml
 
 
@@ -607,7 +604,6 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett
 <!-- CODE:END -->
 <!-- OUTPUT:START -->
 <!-- ⚠️ This content is auto-generated by `markdown-code-runner`. -->
-
 ```yaml
 
 
@@ -773,7 +769,6 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett
 <!-- CODE:END -->
 <!-- OUTPUT:START -->
 <!-- ⚠️ This content is auto-generated by `markdown-code-runner`. -->
-
 ```yaml
 
 
@@ -949,7 +944,6 @@ You can choose to use local services (Wyoming/Ollama) or OpenAI services by sett
 <!-- CODE:END -->
 <!-- OUTPUT:START -->
 <!-- ⚠️ This content is auto-generated by `markdown-code-runner`. -->
-
 ```yaml
 
 

diff --git a/agent_cli/services/tts.py b/agent_cli/services/tts.py
@@ -39,6 +39,11 @@
 has_audiostretchy = importlib.util.find_spec("audiostretchy") is not None
 
 
+KOKORO_STREAM_RATE = 24000
+KOKORO_STREAM_WIDTH = 2  # Bytes per sample (16-bit PCM, i.e. pyaudio.paInt16)
+KOKORO_STREAM_CHANNELS = 1
+
+
 def get_synthesizer(
     provider_config: config.ProviderSelection,
     audio_output_config: config.AudioOutput,
@@ -56,11 +61,6 @@ def get_synthesizer(
             openai_tts_config=openai_tts_config,
             openai_llm_config=openai_llm_config,
         )
-    if provider_config.tts_provider == "kokoro":
-        return partial(
-            _synthesize_speech_kokoro,
-            kokoro_tts_config=kokoro_tts_config,
-        )
     return partial(_synthesize_speech_wyoming, wyoming_tts_config=wyoming_tts_config)
 
 
@@ -224,28 +224,75 @@ async def _synthesize_speech_openai(
     )
 
 
-async def _synthesize_speech_kokoro(
+async def _stream_and_play_kokoro(
     *,
     text: str,
     kokoro_tts_config: config.KokoroTTS,
+    audio_output_config: config.AudioOutput,
     logger: logging.Logger,
-    **_kwargs: object,
+    play_audio_flag: bool,
+    quiet: bool = False,
+    stop_event: InteractiveStopEvent | None = None,
+    live: Live,
 ) -> bytes | None:
-    """Synthesize speech from text using Kokoro TTS server."""
+    """Stream Kokoro TTS audio, optionally playing it; return WAV bytes or None on error/no data."""
+    client = AsyncOpenAI(
+        api_key="not-needed",
+        base_url=kokoro_tts_config.kokoro_tts_host,
+    )
+    audio_buffer = io.BytesIO()
+
     try:
-        client = AsyncOpenAI(
-            api_key="not-needed",
-            base_url=kokoro_tts_config.kokoro_tts_host,
-        )
-        response = await client.audio.speech.create(
-            model=kokoro_tts_config.kokoro_tts_model,
-            voice=kokoro_tts_config.kokoro_tts_voice,
-            input=text,
-            response_format="wav",
+        async with live_timer(live, "🔊 Synthesizing text", style="blue", quiet=quiet):
+            async with client.audio.speech.with_streaming_response.create(
+                model=kokoro_tts_config.kokoro_tts_model,
+                voice=kokoro_tts_config.kokoro_tts_voice,
+                input=text,
+                response_format="pcm",
+            ) as response:
+                if play_audio_flag:
+                    with pyaudio_context() as p:
+                        stream_config = setup_output_stream(
+                            audio_output_config.output_device_index,
+                            sample_rate=KOKORO_STREAM_RATE,
+                            sample_width=KOKORO_STREAM_WIDTH,
+                            channels=KOKORO_STREAM_CHANNELS,
+                        )
+                        with open_pyaudio_stream(p, **stream_config) as stream:
+                            logger.info("Starting Kokoro TTS stream playback.")
+                            async for chunk in response.aiter_bytes(chunk_size=1024):
+                                if stop_event and stop_event.is_set():
+                                    break
+                                stream.write(chunk)
+                                audio_buffer.write(chunk)
+                                await asyncio.sleep(0)
+                else:
+                    # Just buffer the data without playing
+                    async for chunk in response.aiter_bytes():
+                        audio_buffer.write(chunk)
+
+        if stop_event and stop_event.is_set():
+            logger.info("Audio playback interrupted")
+            if not quiet:
+                print_with_style("⏹️ Audio playback interrupted", style="yellow")
+        elif play_audio_flag and not quiet:
+            print_with_style("✅ Audio playback finished")
+
+        pcm_data = audio_buffer.getvalue()
+        if not pcm_data:
+            return None
+
+        return _create_wav_data(
+            pcm_data,
+            KOKORO_STREAM_RATE,
+            KOKORO_STREAM_WIDTH,
+            KOKORO_STREAM_CHANNELS,
         )
-        return await response.aread()
-    except Exception:
-        logger.exception("Error during Kokoro speech synthesis")
+
+    except Exception as e:
+        logger.exception("Error during Kokoro speech synthesis or playback")
+        if not quiet:
+            print_error_message(f"Kokoro TTS error: {e}")
         return None
 
 
@@ -376,6 +423,18 @@ async def _speak_text(
     live: Live,
 ) -> bytes | None:
     """Synthesize and optionally play speech from text."""
+    if provider_config.tts_provider == "kokoro":
+        return await _stream_and_play_kokoro(
+            text=text,
+            kokoro_tts_config=kokoro_tts_config,
+            audio_output_config=audio_output_config,
+            logger=logger,
+            quiet=quiet,
+            play_audio_flag=play_audio_flag,
+            stop_event=stop_event,
+            live=live,
+        )
+
     synthesizer = get_synthesizer(
         provider_config,
         audio_output_config,

diff --git a/tests/test_services.py b/tests/test_services.py
@@ -133,39 +133,6 @@ def test_get_synthesizer_wyoming() -> None:
     assert synthesizer.func == tts._synthesize_speech_wyoming  # type: ignore[attr-defined]
 
 
-def test_get_synthesizer_kokoro() -> None:
-    """Test that get_synthesizer returns the Kokoro synthesizer."""
-    provider_config = config.ProviderSelection(
-        asr_provider="local",
-        llm_provider="local",
-        tts_provider="kokoro",
-    )
-    audio_output_config = config.AudioOutput(enable_tts=True)
-    wyoming_tts_config = config.WyomingTTS(
-        wyoming_tts_ip="localhost",
-        wyoming_tts_port=1234,
-    )
-    openai_tts_config = config.OpenAITTS(openai_tts_model="tts-1", openai_tts_voice="alloy")
-    openai_llm_config = config.OpenAILLM(
-        openai_llm_model="gpt-4o-mini",
-        openai_api_key="test_api_key",
-    )
-    kokoro_tts_cfg = config.KokoroTTS(
-        kokoro_tts_model="tts-1",
-        kokoro_tts_voice="alloy",
-        kokoro_tts_host="http://localhost:8000/v1",
-    )
-    synthesizer = tts.get_synthesizer(
-        provider_config,
-        audio_output_config,
-        wyoming_tts_config,
-        openai_tts_config,
-        openai_llm_config,
-        kokoro_tts_cfg,
-    )
-    assert synthesizer.func == tts._synthesize_speech_kokoro  # type: ignore[attr-defined]
-
-
 @pytest.mark.asyncio
 async def test_transcribe_audio_openai_no_key():
     """Test that transcribe_audio_openai fails without an API key."""