Commit ab21429

[RAPTOR-12838] add streaming to chat example (#1517)
1 parent e50595d commit ab21429

File tree

4 files changed: +157 -77 lines changed
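
This commit makes the python3_dummy_chat template honor the OpenAI stream flag, yielding one ChatCompletionChunk per whitespace-delimited token instead of a single ChatCompletion. A minimal client-side sketch of consuming the new behavior, assuming a DRUM prediction server is already listening on localhost:8080 (the address the test fixtures below use) and "<KEY>" stands in for any API key:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080", api_key="<KEY>")
response = client.chat.completions.create(
    model="datarobot-deployed-llm",  # the dummy hook echoes regardless of model
    messages=[{"role": "user", "content": "Tell me a story"}],
    stream=True,  # request ChatCompletionChunk objects instead of one ChatCompletion
)
for chunk in response:
    # the final chunk carries finish_reason="stop" and an empty delta
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end=" ")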

model_templates/python3_dummy_chat/custom.py

Lines changed: 58 additions & 27 deletions
@@ -6,22 +6,19 @@
 """
 import calendar
 import time
-from typing import Iterator
+from typing import Any
+import uuid
 
 from openai.types.chat import ChatCompletion
 from openai.types.chat import ChatCompletionChunk
 from openai.types.chat import ChatCompletionMessage
 from openai.types.chat import CompletionCreateParams
 from openai.types.chat.chat_completion import Choice
+from openai.types.chat.chat_completion_chunk import Choice as ChunkChoice
+from openai.types.chat.chat_completion_chunk import ChoiceDelta
 from openai.types.model import Model
 
-from datarobot_drum import RuntimeParameters
-
-"""
-This example shows how to create a text generation model supporting OpenAI chat
-"""
-
-from typing import Any, Dict
+# This example shows how to create a text generation model supporting OpenAI chat
 
 
 def get_supported_llm_models(model: Any):
@@ -35,10 +32,11 @@ def get_supported_llm_models(model: Any):
     ----------
     model: a model ID to compare against; optional
 
-    Returns: list of openai.types.model.Model
+    Returns
     -------
-
+    List of openai.types.model.Model
     """
+    _ = model
     return [
         Model(
             id="datarobot_llm_id",
@@ -62,29 +60,62 @@ def load_model(code_dir: str) -> Any:
     -------
     If used, this hook must return a non-None value
     """
+    _ = code_dir
     return "dummy"
 
 
-def chat(
-    completion_create_params: CompletionCreateParams, model: Any
-) -> ChatCompletion | Iterator[ChatCompletionChunk]:
+def chat(completion_create_params: CompletionCreateParams, model: Any):
     """
-    This hook supports chat completions; see https://platform.openai.com/docs/api-reference/chat/create.
-    In this non-streaming example, the "LLM" echoes back the user's prompt,
-    acting as the model specified in the chat completion request.
-
-    Parameters
-    ----------
-    completion_create_params: the chat completion request.
-    model: the deserialized model loaded by DRUM or by `load_model`, if supplied
-
-    Returns: a chat completion.
-    -------
-
+    This hook supports chat completions;
+    see https://platform.openai.com/docs/api-reference/chat/create.
+    In this example, the "LLM" echoes back the user's prompt,
+    acting as the model specified in the chat completion request.
+    If streaming is requested, yields ChatCompletionChunk objects
+    for each "token" (word) in the response.
+    Returns ChatCompletion or Iterator[ChatCompletionChunk]
     """
-    model = completion_create_params["model"]
+    _ = model
+    inter_token_latency_seconds = 0.25
+    model_id = completion_create_params["model"]
     message_content = "Echo: " + completion_create_params["messages"][0]["content"]
+    stream = completion_create_params.get("stream", False)
+
+    if stream:
+        # Mimic OpenAI streaming: yield one chunk at a time, split by whitespace
+        def gen_chunks():
+            chunk_id = str(uuid.uuid4())
+            for token in message_content.split():
+                yield ChatCompletionChunk(
+                    id=chunk_id,
+                    object="chat.completion.chunk",
+                    created=calendar.timegm(time.gmtime()),
+                    model=model_id,
+                    choices=[
+                        ChunkChoice(
+                            finish_reason=None,
+                            index=0,
+                            delta=ChoiceDelta(content=token),
+                        )
+                    ],
+                )
+                time.sleep(inter_token_latency_seconds)
+            # Send a final chunk with finish_reason
+            yield ChatCompletionChunk(
+                id=chunk_id,
+                object="chat.completion.chunk",
+                created=calendar.timegm(time.gmtime()),
+                model=model_id,
+                choices=[
+                    ChunkChoice(
+                        finish_reason="stop",
+                        index=0,
+                        delta=ChoiceDelta(),
+                    )
+                ],
+            )
 
+        return gen_chunks()
+    # non-streaming
     return ChatCompletion(
         id="association_id",
         choices=[
@@ -95,6 +126,6 @@ def chat(
             )
         ],
         created=calendar.timegm(time.gmtime()),
-        model=model,
+        model=model_id,
         object="chat.completion",
     )
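
The hook can also be exercised without a server. A quick sketch, assuming a plain dict is an acceptable stand-in for CompletionCreateParams (it is a TypedDict, and the hook only reads the keys shown):

from model_templates.python3_dummy_chat.custom import chat, load_model

model = load_model(".")  # returns the "dummy" placeholder
params = {
    "model": "datarobot-deployed-llm",
    "messages": [{"role": "user", "content": "Tell me a story"}],
    "stream": True,
}
for chunk in chat(params, model):
    print(chunk.choices[0].delta.content)  # None on the final "stop" chunk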

tests/unit/datarobot_drum/drum/conftest.py

Lines changed: 52 additions & 3 deletions
@@ -1,11 +1,17 @@
+import os
 from typing import Optional
-from unittest.mock import patch
+from unittest.mock import Mock, patch
 
+import httpx
 import pytest
-
-from datarobot_drum.drum.enum import CustomHooks
+from httpx import WSGITransport
+from openai import OpenAI
 
 from datarobot_drum.drum.adapters.model_adapters.python_model_adapter import PythonModelAdapter
+from datarobot_drum.drum.enum import CustomHooks, RunLanguage, TargetType
+from datarobot_drum.drum.lazy_loading.lazy_loading_handler import LazyLoadingHandler
+from datarobot_drum.drum.root_predictors.prediction_server import PredictionServer
+from datarobot_drum.drum.server import create_flask_app
 from tests.unit.datarobot_drum.drum.helpers import MODEL_ID_FROM_RUNTIME_PARAMETER
 from tests.unit.datarobot_drum.drum.helpers import inject_runtime_parameter
 from tests.unit.datarobot_drum.drum.helpers import unset_runtime_parameter
@@ -114,3 +120,46 @@ def llm_id_parameter():
     inject_runtime_parameter(parameter_name, MODEL_ID_FROM_RUNTIME_PARAMETER)
     yield
     unset_runtime_parameter(parameter_name)
+
+
+@pytest.fixture
+def test_flask_app():
+    with patch("datarobot_drum.drum.server.create_flask_app") as mock_create_flask_app, patch(
+        "datarobot_drum.drum.root_predictors.prediction_server.PredictionServer._run_flask_app"
+    ):
+        app = create_flask_app()
+        app.config.update(
+            {
+                "TESTING": True,
+            }
+        )
+
+        mock_create_flask_app.return_value = app
+
+        yield app
+
+
+@pytest.fixture
+def openai_client(test_flask_app):
+    return OpenAI(
+        base_url="http://localhost:8080",
+        api_key="<KEY>",
+        http_client=httpx.Client(transport=WSGITransport(app=test_flask_app)),
+    )
+
+
+@pytest.fixture
+def prediction_server(test_flask_app, chat_python_model_adapter):
+    _, _ = test_flask_app, chat_python_model_adapter  # depends on fixture side effects
+    with patch.dict(os.environ, {"TARGET_NAME": "target"}), patch(
+        "datarobot_drum.drum.language_predictors.python_predictor.python_predictor.PythonPredictor._init_mlops"
+    ), patch.object(LazyLoadingHandler, "download_lazy_loading_files"):
+        params = {
+            "run_language": RunLanguage.PYTHON,
+            "target_type": TargetType.TEXT_GENERATION,
+            "deployment_config": None,
+            "__custom_model_path__": "/non-existing-path-to-avoid-loading-unwanted-artifacts",
+        }
+        server = PredictionServer(params)
+        server._predictor._mlops = Mock()
+        server.materialize()
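
Note that the openai_client fixture never opens a socket: httpx's WSGITransport hands each request directly to the Flask app in-process, so the base_url host is never resolved. A standalone sketch of the same pattern, with a hypothetical /ping route for illustration:

import httpx
from flask import Flask

app = Flask(__name__)

@app.get("/ping")  # hypothetical route, not part of DRUM
def ping():
    return {"message": "OK"}

# WSGITransport dispatches requests to the app in-process; no network I/O happens
client = httpx.Client(transport=httpx.WSGITransport(app=app), base_url="http://testserver")
assert client.get("/ping").json() == {"message": "OK"}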
Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+"""
+Copyright 2025 DataRobot, Inc. and its affiliates.
+All rights reserved.
+This is proprietary source code of DataRobot, Inc. and its affiliates.
+Released under the terms of DataRobot Tool and Utility Agreement.
+"""
+
+import pytest
+from openai import Stream
+from openai.types.chat import ChatCompletion
+
+# This module tests score and chat hooks from selected model templates;
+# not by direct function call, but via testing flask app, prediction server, and model adapter.
+# Rename the imported hooks to keep them distinct
+from model_templates.python3_dummy_chat.custom import chat as dummy_chat_chat
+
+# The particular model usually doesn't matter for the example hooks
+CHAT_COMPLETIONS_MODEL = "datarobot-deployed-llm"
+
+
+@pytest.mark.usefixtures("prediction_server")
+@pytest.mark.parametrize("is_streaming", [True, False])
+def test_dummy_chat_chat(openai_client, chat_python_model_adapter, is_streaming):
+    """Test the "python3 dummy chat" hook."""
+    chat_python_model_adapter.chat_hook = dummy_chat_chat
+    prompt = "Tell me a story"
+
+    completion = openai_client.chat.completions.create(
+        model=CHAT_COMPLETIONS_MODEL,
+        messages=[
+            {"role": "user", "content": prompt},
+        ],
+        stream=is_streaming,
+    )
+
+    if is_streaming:
+        assert isinstance(completion, Stream)
+        chunk_messages = [
+            chunk.choices[0].delta.content for chunk in completion if chunk.choices[0].delta.content
+        ]
+        expected_messages = ["Echo:"] + prompt.split()
+        assert chunk_messages == expected_messages
+    else:
+        assert isinstance(completion, ChatCompletion)
+        assert completion.choices[0].message.content == "Echo: " + prompt

tests/unit/datarobot_drum/drum/test_prediction_server.py

Lines changed: 2 additions & 47 deletions
@@ -3,12 +3,10 @@
 from unittest.mock import ANY
 from unittest.mock import Mock, patch
 
-import httpx
 import openai
 import pytest
-from httpx import WSGITransport
 from openai import NotFoundError
-from openai import OpenAI, Stream
+from openai import Stream
 from openai.types.chat import (
     ChatCompletion,
 )
@@ -18,45 +16,11 @@
 from datarobot_drum.drum.enum import RunLanguage, TargetType
 from datarobot_drum.drum.lazy_loading.lazy_loading_handler import LazyLoadingHandler
 from datarobot_drum.drum.root_predictors.prediction_server import PredictionServer
-from datarobot_drum.drum.server import create_flask_app, HEADER_REQUEST_ID
-from datarobot_drum.drum.server import get_flask_app
+from datarobot_drum.drum.server import HEADER_REQUEST_ID
 from tests.unit.datarobot_drum.drum.chat_utils import create_completion, create_completion_chunks
 from tests.unit.datarobot_drum.drum.helpers import MODEL_ID_FROM_RUNTIME_PARAMETER
 
 
-@pytest.fixture
-def test_flask_app():
-    with patch("datarobot_drum.drum.server.create_flask_app") as mock_create_flask_app, patch(
-        "datarobot_drum.drum.root_predictors.prediction_server.PredictionServer._run_flask_app"
-    ):
-        app = create_flask_app()
-        app.config.update(
-            {
-                "TESTING": True,
-            }
-        )
-
-        mock_create_flask_app.return_value = app
-
-        yield app
-
-
-@pytest.fixture
-def prediction_server(test_flask_app, chat_python_model_adapter):
-    with patch.dict(os.environ, {"TARGET_NAME": "target"}), patch(
-        "datarobot_drum.drum.language_predictors.python_predictor.python_predictor.PythonPredictor._init_mlops"
-    ), patch.object(LazyLoadingHandler, "download_lazy_loading_files"):
-        params = {
-            "run_language": RunLanguage.PYTHON,
-            "target_type": TargetType.TEXT_GENERATION,
-            "deployment_config": None,
-            "__custom_model_path__": "/non-existing-path-to-avoid-loading-unwanted-artifacts",
-        }
-        server = PredictionServer(params)
-        server._predictor._mlops = Mock()
-        server.materialize()
-
-
 @pytest.fixture
 def list_models_prediction_server(test_flask_app, list_models_python_model_adapter):
     with patch.dict(os.environ, {"TARGET_NAME": "target"}), patch(
@@ -89,15 +53,6 @@ def non_textgen_prediction_server(test_flask_app, non_chat_python_model_adapter)
         server.materialize()
 
 
-@pytest.fixture
-def openai_client(test_flask_app):
-    return OpenAI(
-        base_url="http://localhost:8080",
-        api_key="<KEY>",
-        http_client=httpx.Client(transport=WSGITransport(app=test_flask_app)),
-    )
-
-
 @pytest.mark.usefixtures("prediction_server")
 def test_prediction_server(openai_client, chat_python_model_adapter):
     def chat_hook(completion_request, model):
