diff --git a/prepare_llm_models.sh b/prepare_llm_models.sh index 022c9a1cc0..6bfb13ee52 100755 --- a/prepare_llm_models.sh +++ b/prepare_llm_models.sh @@ -35,6 +35,7 @@ HERMES3_MODEL="NousResearch/Hermes-3-Llama-3.1-8B" PHI4_MODEL="microsoft/Phi-4-mini-instruct" MISTRAL_MODEL="mistralai/Mistral-7B-Instruct-v0.3" GPT_OSS="openai/gpt-oss-20b" +DEVSTRAL_MODEL="unsloth/Devstral-Small-2507" if [ "$(python3 -c 'import sys; print(sys.version_info[1])')" -le "8" ]; then echo "Prepare models with python > 3.8."; exit 1 ; fi @@ -182,3 +183,14 @@ if [ ! -f "$1/$GPT_OSS/$TOKENIZER_FILE" ]; then echo "[ERROR] Models file $1/$GPT_OSS/$TOKENIZER_FILE does not exist." exit 1 fi + +if [ -f "$1/$DEVSTRAL_MODEL/$TOKENIZER_FILE" ]; then + echo "Models file $1/$DEVSTRAL_MODEL/$TOKENIZER_FILE exists. Skipping downloading models." +else + mkdir -p $1/$DEVSTRAL_MODEL + convert_tokenizer $DEVSTRAL_MODEL --with_detokenizer -o $1/$DEVSTRAL_MODEL +fi +if [ ! -f "$1/$DEVSTRAL_MODEL/$TOKENIZER_FILE" ]; then + echo "[ERROR] Models file $1/$DEVSTRAL_MODEL/$TOKENIZER_FILE does not exist." + exit 1 +fi diff --git a/src/llm/BUILD b/src/llm/BUILD index bfe45b3036..ae37d936ca 100644 --- a/src/llm/BUILD +++ b/src/llm/BUILD @@ -137,6 +137,7 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w "io_processing/hermes3/tool_parser.hpp", "io_processing/llama3/tool_parser.hpp", "io_processing/phi4/tool_parser.hpp", + "io_processing/devstral/tool_parser.hpp", "io_processing/mistral/tool_parser.hpp", "io_processing/qwen3/reasoning_parser.hpp", "io_processing/gptoss/reasoning_parser.hpp", @@ -148,6 +149,7 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w "io_processing/hermes3/tool_parser.cpp", "io_processing/llama3/tool_parser.cpp", "io_processing/phi4/tool_parser.cpp", + "io_processing/devstral/tool_parser.cpp", "io_processing/mistral/tool_parser.cpp", "io_processing/qwen3/reasoning_parser.cpp", "io_processing/gptoss/reasoning_parser.cpp", @@ -176,11 +178,13 @@ ovms_cc_library( "io_processing/phi4/generation_config_builder.hpp", "io_processing/llama3/generation_config_builder.hpp", "io_processing/hermes3/generation_config_builder.hpp", + "io_processing/devstral/generation_config_builder.hpp", "io_processing/generation_config_builder.hpp"], srcs = ["io_processing/base_generation_config_builder.cpp", "io_processing/phi4/generation_config_builder.cpp", "io_processing/llama3/generation_config_builder.cpp", - "io_processing/hermes3/generation_config_builder.cpp"], + "io_processing/hermes3/generation_config_builder.cpp", + "io_processing/devstral/generation_config_builder.cpp"], deps = [ ":openai_request", "//src:libovmslogging", diff --git a/src/llm/io_processing/devstral/generation_config_builder.cpp b/src/llm/io_processing/devstral/generation_config_builder.cpp new file mode 100644 index 0000000000..f6dced3673 --- /dev/null +++ b/src/llm/io_processing/devstral/generation_config_builder.cpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "generation_config_builder.hpp"
+
+namespace ovms {
+
+void DevstralGenerationConfigBuilder::parseConfigFromRequest(const OpenAIChatCompletionsRequest& request) {
+    // Call the base class method to fill in common configuration
+    BaseGenerationConfigBuilder::parseConfigFromRequest(request);
+
+    // For now the only model-specific part is related to tools, so if no tools are provided in the request
+    // we can exit early
+    if (request.toolNameSchemaMap.empty()) {
+        return;
+    }
+
+    if (enableToolGuidedGeneration || request.toolChoice == "required") {
+        // Set tool guided generation config specific to the Devstral model
+        auto triggeredTags = std::make_shared<ov::genai::StructuredOutputConfig::TriggeredTags>();
+        triggeredTags->triggers.push_back("[TOOL_CALLS]");
+
+        for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) {
+            const auto& toolSchema = toolSchemaWrapper.stringRepr;
+            ov::genai::StructuredOutputConfig::Tag tagItem;
+            tagItem.begin = "[TOOL_CALLS]" + toolName + "[ARGS]";
+            tagItem.end = "</s>";
+            tagItem.content = ov::genai::StructuredOutputConfig::JSONSchema(toolSchema);
+            triggeredTags->tags.push_back(tagItem);
+        }
+        if (request.toolChoice == "required") {
+            triggeredTags->at_least_one = true;
+        }
+        ov::genai::StructuredOutputConfig::StructuralTag structuralTag = triggeredTags;
+        setStructuralTagsConfig(structuralTag);
+    }
+}
+
+}  // namespace ovms
diff --git a/src/llm/io_processing/devstral/generation_config_builder.hpp b/src/llm/io_processing/devstral/generation_config_builder.hpp
new file mode 100644
index 0000000000..97666f17d9
--- /dev/null
+++ b/src/llm/io_processing/devstral/generation_config_builder.hpp
@@ -0,0 +1,33 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#pragma once
+#include "../base_generation_config_builder.hpp"
+
+namespace ovms {
+
+/*
+ * DevstralGenerationConfigBuilder extends BaseGenerationConfigBuilder to provide configuration specific to the Devstral model.
+ * It overrides the parseConfigFromRequest method to set up tool-guided generation.
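+ * For illustration, assuming a request that provides a single tool named get_weather (a hypothetical
+ * tool name used only as an example), the structural tag built above constrains a triggered tool call
+ * to the form:
+ *   [TOOL_CALLS]get_weather[ARGS]{...JSON matching the tool's schema...}</s>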
+ */ +class DevstralGenerationConfigBuilder : public BaseGenerationConfigBuilder { +public: + DevstralGenerationConfigBuilder() = delete; + explicit DevstralGenerationConfigBuilder(const ov::genai::GenerationConfig& baseConfig, bool enableToolGuidedGeneration, DecodingMethod decodingMethod) : + BaseGenerationConfigBuilder(baseConfig, enableToolGuidedGeneration, decodingMethod) {} + + void parseConfigFromRequest(const OpenAIChatCompletionsRequest& request) override; +}; +} // namespace ovms diff --git a/src/llm/io_processing/devstral/tool_parser.cpp b/src/llm/io_processing/devstral/tool_parser.cpp new file mode 100644 index 0000000000..2274d2e2b0 --- /dev/null +++ b/src/llm/io_processing/devstral/tool_parser.cpp @@ -0,0 +1,230 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#include +#include +#include +#include + +#include "src/port/rapidjson_document.hpp" +#include "src/logging.hpp" +#include "src/llm/io_processing/utils.hpp" +#include "src/stringutils.hpp" +#include "tool_parser.hpp" + +namespace ovms { + +void DevstralToolParser::parse(ParsedOutput& parsedOutput, const std::vector& generatedTokens) { + // expected format: [TOOL_CALLS]tool_name[ARGS]{"arg1": "value1", ...} + if (parsedOutput.content.empty() || generatedTokens.size() <= 0) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "No content to parse for tool calls"); + return; + } + size_t firstToolTokenIndex; + auto it = std::find(generatedTokens.begin(), generatedTokens.end(), this->botTokenId); + if (it != generatedTokens.end()) { + firstToolTokenIndex = std::distance(generatedTokens.begin(), it); + } else { + return; + } + + size_t firstArgsTokenIndex; + auto itArgs = std::find(generatedTokens.begin() + firstToolTokenIndex, generatedTokens.end(), this->argsTokenId); + if (itArgs != generatedTokens.end()) { + firstArgsTokenIndex = std::distance(generatedTokens.begin(), itArgs); + } else { + return; + } + if (firstToolTokenIndex > firstArgsTokenIndex) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "First tool token index is greater than first args token index."); + return; + } + std::vector toolNameTokens(generatedTokens.begin() + (firstToolTokenIndex + 1), generatedTokens.begin() + (firstArgsTokenIndex)); + std::vector argumentsTokens(generatedTokens.begin() + (firstArgsTokenIndex + 1), generatedTokens.end()); + + ToolCall toolCall; + std::string toolName = tokenizer.decode(toolNameTokens, ov::AnyMap{ov::genai::skip_special_tokens(true)}); + std::string arguments = tokenizer.decode(argumentsTokens, ov::AnyMap{ov::genai::skip_special_tokens(true)}); + ovms::trim(toolName); // trim in case of extra spaces/newlines + toolCall.name = toolName; + if (arguments.empty()) { + arguments = "{}"; // set empty arguments to {} + } + toolCall.arguments = arguments; + toolCall.id = generateRandomId(); // Generate a random ID for the 
+    parsedOutput.toolCalls.push_back(toolCall);
+
+    // get the subset of generatedTokens from begin() up to firstToolTokenIndex
+    std::vector<int64_t> contentTokens;
+    if (firstToolTokenIndex > 0) {
+        contentTokens = std::vector<int64_t>(generatedTokens.begin(), generatedTokens.begin() + firstToolTokenIndex);
+        parsedOutput.content = tokenizer.decode(contentTokens, ov::AnyMap{ov::genai::skip_special_tokens(true)});  // Return only the content up to the tool call
+    } else {
+        parsedOutput.content = tokenizer.decode(contentTokens, ov::AnyMap{ov::genai::skip_special_tokens(true)});
+    }
+    return;
+}
+
+std::optional<rapidjson::Document> DevstralToolParser::sendFullDelta(ToolCall& toolCall) {
+    rapidjson::Document argsDelta;
+    argsDelta.Parse(toolCall.arguments.c_str());
+    rapidjson::Document argumentsWrapper;
+    argumentsWrapper.SetObject();
+    rapidjson::Document::AllocatorType& allocator = argumentsWrapper.GetAllocator();
+    // add the raw toolCall.arguments string to argumentsWrapper under the "arguments" key
+    rapidjson::Value toolCallsString(rapidjson::kStringType);
+    toolCallsString.SetString(toolCall.arguments.c_str(), allocator);
+    argumentsWrapper.AddMember("arguments", toolCallsString, allocator);
+    auto currentDelta = wrapDelta(argumentsWrapper, this->toolCallIndex);
+    return currentDelta;
+}
+
+rapidjson::Document DevstralToolParser::wrapCombinedDelta(ToolCall& toolCall) {
+    rapidjson::Document wrappedDelta;
+    wrappedDelta.SetObject();
+    rapidjson::Value toolCalls(rapidjson::kArrayType);
+    rapidjson::Value toolCallObj(rapidjson::kObjectType);
+    rapidjson::Value idValue(generateRandomId().c_str(), wrappedDelta.GetAllocator());
+    rapidjson::Value toolCallsString(rapidjson::kStringType);
+
+    toolCallObj.AddMember("id", idValue, wrappedDelta.GetAllocator());
+    toolCallObj.AddMember("type", "function", wrappedDelta.GetAllocator());
+    toolCallObj.AddMember("index", toolCallIndex, wrappedDelta.GetAllocator());
+    rapidjson::Value functionObj(rapidjson::kObjectType);
+    rapidjson::Value nameValue(toolCall.name.c_str(), wrappedDelta.GetAllocator());
+    functionObj.AddMember("name", nameValue, wrappedDelta.GetAllocator());
+    // add the raw toolCall.arguments string to the function object under the "arguments" key
+
+    toolCallsString.SetString(toolCall.arguments.c_str(), wrappedDelta.GetAllocator());
+    functionObj.AddMember("arguments", toolCallsString, wrappedDelta.GetAllocator());
+    toolCallObj.AddMember("function", functionObj, wrappedDelta.GetAllocator());
+    toolCalls.PushBack(toolCallObj, wrappedDelta.GetAllocator());
+    rapidjson::Value deltaWrapper(rapidjson::kObjectType);
+    deltaWrapper.AddMember("tool_calls", toolCalls, wrappedDelta.GetAllocator());
+    wrappedDelta.AddMember("delta", deltaWrapper, wrappedDelta.GetAllocator());
+    return wrappedDelta;
+}
+
+rapidjson::Document DevstralToolParser::parseContentChunk() {
+    rapidjson::StringBuffer buffer;
+    rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+    writer.StartObject();
+    writer.String("delta");
+    writer.StartObject();
+    writer.String("content");
+    writer.String(streamContent.c_str());
+    writer.EndObject();
+    writer.EndObject();
+    rapidjson::Document doc;
+    doc.Parse(buffer.GetString());
+    streamContent.clear();
+    return doc;
+}
+
+std::optional<rapidjson::Document> DevstralToolParser::parseChunk(const std::string& chunk, ov::genai::GenerationFinishReason finishReason) {
+    /*
+    Devstral tool call format: [TOOL_CALLS]tool_name[ARGS]arguments</s>
+    The model does not support parallel tool calls, so tool calls always appear in sequence.
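+
+    Example (hypothetical chunk stream, matching the streaming unit tests):
+      "Reasoning" -> content delta; "[TOOL_CALLS]" -> state change, nothing emitted; "get_", "weather" -> buffered;
+      "[ARGS]" -> first tool call delta carrying the tool name; later chunks -> argument deltas until </s>.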
+
+    We have three processing states:
+    AWAITING_START_TAG,
+    AWAITING_ARGS_TAG,
+    PROCESSING_ARGS
+
+    We store the chunk history in the streamContent string. When a state change is detected, streamContent is cleared so that only the unprocessed part is kept.
+    */
+
+    this->streamContent += chunk;
+    SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Chunk content: '{}', StreamContent: '{}', State: {}", chunk, this->streamContent, std::to_string(this->internalState));
+    if (this->internalState == AWAITING_START_TAG) {
+        // if chunk ends with </s> we need to remove it and return parsed content immediately
+        if (chunk.size() >= this->parsingEndTag.size() &&
+            chunk.substr(chunk.size() - this->parsingEndTag.size()) == this->parsingEndTag) {
+            // remove </s> from streamContent
+            this->streamContent = this->streamContent.substr(0, this->streamContent.size() - this->parsingEndTag.size());
+            SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Found end tag in chunk while awaiting start tag. Returning content chunk.");
+            return parseContentChunk();
+        }
+        size_t pos = chunk.find(this->parsingToolCallsStartTag);
+        if (pos != std::string::npos) {
+            this->internalState = AWAITING_ARGS_TAG;
+            this->toolCallIndex++;
+            if (pos == 0) {
+                this->streamContent.clear();
+                return std::nullopt;
+            } else {
+                this->streamContent = this->streamContent.substr(pos + this->parsingToolCallsStartTag.length());  // skip past the "[TOOL_CALLS]" tag
+                return parseContentChunk();
+            }
+        } else {
+            return parseContentChunk();
+        }
+    }
+    if (this->internalState == AWAITING_ARGS_TAG) {
+        size_t pos = this->streamContent.find(this->parsingArgsStartTag);
+        if (pos != std::string::npos) {
+            this->internalState = PROCESSING_ARGS;
+            this->toolName = this->streamContent.substr(0, pos);
+            ovms::trim(this->toolName);  // trim in case of extra spaces/newlines
+            this->streamContent = this->streamContent.substr(pos + this->parsingArgsStartTag.length());
+            // check if streamContent ends with </s>; if so, we need to return the full tool call delta
+            if (this->streamContent.size() >= this->parsingEndTag.size() &&
+                this->streamContent.substr(this->streamContent.size() - this->parsingEndTag.size()) == this->parsingEndTag) {
+                // remove </s> from streamContent
+                ToolCall toolCall;
+                toolCall.name = this->toolName;
+                this->streamContent = this->streamContent.substr(0, this->streamContent.size() - this->parsingEndTag.size());
+                if (!this->streamContent.empty()) {
+                    toolCall.arguments = this->streamContent;
+                } else {
+                    toolCall.arguments = "{}";
+                }
+                this->streamContent = "";
+                return wrapCombinedDelta(toolCall);
+            } else {
+                return wrapFirstDelta(this->toolName, this->toolCallIndex);
+            }
+        } else {
+            return std::nullopt;
+        }
+    }
+    if (this->internalState == PROCESSING_ARGS) {
+        size_t endPos = this->streamContent.find(this->parsingEndTag);
+        std::string arguments;
+        if (endPos != std::string::npos) {
+            arguments = this->streamContent.substr(0, endPos);
+        } else {
+            arguments = this->streamContent;
+        }
+
+        ToolCall toolCall;
+        if (!arguments.empty())
+            toolCall.arguments = arguments;
+        else
+            toolCall.arguments = "{}";
+        toolCall.name = this->toolName;
+        this->streamContent = "";
+        return sendFullDelta(toolCall);
+    }
+    return std::nullopt;
+}
+// Static member definitions
+const std::string DevstralToolParser::parsingArgsStartTag = "[ARGS]";
+const std::string DevstralToolParser::parsingToolCallsStartTag = "[TOOL_CALLS]";
+const std::string DevstralToolParser::parsingEndTag = "</s>";
+const int64_t DevstralToolParser::argsTokenId = 32;  // [ARGS]
+const int64_t DevstralToolParser::botTokenId = 9;    // [TOOL_CALLS]
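+// NOTE: the hardcoded token ids above are assumed to match the special tokens of the Devstral
+// (Mistral Tekken) tokenizer shipped with unsloth/Devstral-Small-2507; other checkpoints may
+// assign different ids to [ARGS] and [TOOL_CALLS].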
+}  // namespace ovms
diff --git a/src/llm/io_processing/devstral/tool_parser.hpp b/src/llm/io_processing/devstral/tool_parser.hpp
new file mode 100644
index 0000000000..c07b38b34e
--- /dev/null
+++ b/src/llm/io_processing/devstral/tool_parser.hpp
@@ -0,0 +1,80 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#pragma once
+
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "src/port/rapidjson_document.hpp"
+#include "src/llm/io_processing/base_output_parser.hpp"
+#include "src/llm/io_processing/partial_json_builder.hpp"
+#include "src/llm/apis/tool_schema_wrapper.hpp"
+
+namespace ovms {
+class DevstralToolParser : public BaseOutputParser {
+    static const int64_t argsTokenId;  // [ARGS]
+    static const int64_t botTokenId;   // [TOOL_CALLS]
+
+    // in streaming mode we can rely only on tags in string form as tokens are not available
+    static const std::string parsingArgsStartTag;
+    static const std::string parsingToolCallsStartTag;
+    static const std::string parsingEndTag;
+
+    enum InternalState {
+        AWAITING_START_TAG,
+        AWAITING_ARGS_TAG,
+        PROCESSING_ARGS
+    };
+
+    InternalState internalState = AWAITING_START_TAG;
+    const ToolsSchemas_t& toolSchemas;
+    // Index to track the current tool call being processed (-1 means no tool call has been started yet)
+    int toolCallIndex = -1;
+    std::string streamContent = "";  // content accumulated from stream chunks
+    std::string toolName = "";
+    std::optional<rapidjson::Document> sendFullDelta(ToolCall& toolCall);
+
+public:
+    DevstralToolParser() = delete;
+    DevstralToolParser(ov::genai::Tokenizer& tokenizer, const ToolsSchemas_t& toolSchemas) :
+        BaseOutputParser(tokenizer),
+        toolSchemas(toolSchemas) {}
+
+    void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) override;
+    std::optional<rapidjson::Document> parseChunk(const std::string& chunk, ov::genai::GenerationFinishReason finishReason) override;
+    rapidjson::Document parseContentChunk();
+    rapidjson::Document wrapCombinedDelta(ToolCall& toolCall);
+    const std::vector<std::string>& getParsingStartTags() const override {
+        static const std::vector<std::string> toolCallStartTags{parsingToolCallsStartTag};
+        return toolCallStartTags;
+    }
+    const std::vector<std::string>& getSpecialParsingStartTags() const override {
+        static const std::vector<std::string> specialParsingStartTags{};
+        return specialParsingStartTags;
+    }
+    // Tool calls are expected to be the last part of the output; the end tag is the model's end-of-sequence token.
+    const std::string& getParsingEndTag() const override {
+        return this->parsingEndTag;
+    }
+
+    // [TOOL_CALLS] and [ARGS] are special tokens, so the parser needs them present in the streamed text
+    bool requiresStreamingWithSpecialTokens() const override {
+        return true;
+    }
+};
+
+}  // namespace ovms
diff --git a/src/llm/io_processing/generation_config_builder.hpp b/src/llm/io_processing/generation_config_builder.hpp
index 663d4a9b1a..2423cd074d 100644
--- a/src/llm/io_processing/generation_config_builder.hpp
+++ b/src/llm/io_processing/generation_config_builder.hpp
@@ -24,6 +24,7 @@
 #include "phi4/generation_config_builder.hpp"
 #include "llama3/generation_config_builder.hpp"
 #include "hermes3/generation_config_builder.hpp"
+#include "devstral/generation_config_builder.hpp"
 #include "../apis/openai_request.hpp"
 #include "../../logging.hpp"
 
@@ -44,6 +45,8 @@ class GenerationConfigBuilder {
             builder_impl = std::make_unique(baseConfig, enableToolGuidedGeneration, decodingMethod);
         } else if (toolParserName == "phi4") {
             builder_impl = std::make_unique<Phi4GenerationConfigBuilder>(baseConfig, enableToolGuidedGeneration, decodingMethod);
+        } else if (toolParserName == "devstral") {
+            builder_impl = std::make_unique<DevstralGenerationConfigBuilder>(baseConfig, enableToolGuidedGeneration, decodingMethod);
         } else {
             if (enableToolGuidedGeneration) {
                 SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Option enable_tool_guided_generation is set, but will not be effective since no valid tool parser has been provided.");
diff --git a/src/llm/io_processing/output_parser.cpp b/src/llm/io_processing/output_parser.cpp
index cf0a805f59..1c060375df 100644
--- a/src/llm/io_processing/output_parser.cpp
+++ b/src/llm/io_processing/output_parser.cpp
@@ -27,6 +27,7 @@
 #include "gptoss/tool_parser.hpp"
 #include "qwen3/reasoning_parser.hpp"
 #include "qwen3coder/qwen3coder_tool_parser.hpp"
+#include "devstral/tool_parser.hpp"
 #include "gptoss/reasoning_parser.hpp"
 
 namespace ovms {
@@ -168,6 +169,8 @@ OutputParser::OutputParser(ov::genai::Tokenizer& tokenizer, const std::string toolParserName
         toolParser = std::make_unique(tokenizer);
     } else if (toolParserName == "qwen3coder") {
         toolParser = std::make_unique<Qwen3CoderToolParser>(tokenizer, toolNameSchemaMap);
+    } else if (toolParserName == "devstral") {
+        toolParser = std::make_unique<DevstralToolParser>(tokenizer, toolNameSchemaMap);
     } else if (!toolParserName.empty()) {
         throw std::runtime_error("Unsupported tool parser: " + toolParserName);
     }
diff --git a/src/llm/io_processing/output_parser.hpp b/src/llm/io_processing/output_parser.hpp
index 613e0a993e..4b5d1c0420 100644
--- a/src/llm/io_processing/output_parser.hpp
+++ b/src/llm/io_processing/output_parser.hpp
@@ -87,8 +87,13 @@ class OutputParser {
     std::optional<rapidjson::Document> parseChunk(const std::string& chunkResponse, const bool toolsAvailable, ov::genai::GenerationFinishReason finishReason);
 
     bool requiresStreamingWithSpecialTokens() const {
-        return (reasoningParser && reasoningParser->requiresStreamingWithSpecialTokens()) &&
-               (toolParser && toolParser->requiresStreamingWithSpecialTokens());
+        if (!reasoningParser) {
+            return toolParser && toolParser->requiresStreamingWithSpecialTokens();
+        } else if (!toolParser) {
+            return reasoningParser->requiresStreamingWithSpecialTokens();
+        } else {
+            return reasoningParser->requiresStreamingWithSpecialTokens() && toolParser->requiresStreamingWithSpecialTokens();
+        }
     }
 };
 }  // namespace ovms
diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp
index 345d1c362b..428d28762e 100644
--- a/src/llm/servable.cpp
+++ b/src/llm/servable.cpp
@@ -103,12 +103,17 @@ absl::Status GenAiServable::processTokenizeRequest(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
-    executionContext->apiHandler = std::make_shared<OpenAIChatCompletionsHandler>(*executionContext->payload.parsedJson,
-        executionContext->endpoint,
-        std::chrono::system_clock::now(),
-        getProperties()->tokenizer,
-        getProperties()->toolParserName,
-        getProperties()->reasoningParserName);
+    try {
+        executionContext->apiHandler = std::make_shared<OpenAIChatCompletionsHandler>(*executionContext->payload.parsedJson,
+            executionContext->endpoint,
+            std::chrono::system_clock::now(),
+            getProperties()->tokenizer,
+            getProperties()->toolParserName,
+            getProperties()->reasoningParserName);
+    } catch (const std::exception& e) {
+        SPDLOG_LOGGER_ERROR(llm_calculator_logger, "Failed to create API handler: {}", e.what());
+        return absl::InvalidArgumentError(std::string("Failed to create API handler: ") + e.what());
+    }
     auto& config = ovms::Config::instance();
     auto status = executionContext->apiHandler->parseRequest(getProperties()->maxTokensLimit, getProperties()->bestOfLimit, getProperties()->maxModelLength, config.getServerSettings().allowedLocalMediaPath);
diff --git a/src/test/llm/output_parsers/devstral_output_parser_test.cpp b/src/test/llm/output_parsers/devstral_output_parser_test.cpp
new file mode 100644
index 0000000000..ca61b5c2cc
--- /dev/null
+++ b/src/test/llm/output_parsers/devstral_output_parser_test.cpp
@@ -0,0 +1,372 @@
+//*****************************************************************************
+// Copyright 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#include <gtest/gtest.h>
+#include <memory>
+#include <optional>
+#include <string>
+
+#include "src/llm/io_processing/base_output_parser.hpp"
+#include "src/llm/io_processing/output_parser.hpp"
+#include "test/platform_utils.hpp"
+
+using namespace ovms;
+
+#ifdef _WIN32
+const std::string tokenizerPath = getWindowsRepoRootPath() + "\\src\\test\\llm_testing\\unsloth\\Devstral-Small-2507";
+#else
+// Hardcoded for usage in docker container
+const std::string tokenizerPath = "/ovms/src/test/llm_testing/unsloth/Devstral-Small-2507/";
+#endif
+
+static ovms::ToolsSchemas_t EMPTY_TOOLS_SCHEMA = {};  // not used for devstral
+static std::unique_ptr<ov::genai::Tokenizer> devstralTokenizer;
+
+class DevstralOutputParserTest : public ::testing::Test {
+protected:
+    std::unique_ptr<OutputParser> outputParserWithRegularToolParsing;
+
+    static void SetUpTestSuite() {
+        try {
+            devstralTokenizer = std::make_unique<ov::genai::Tokenizer>(tokenizerPath);
+        } catch (const std::exception& e) {
+            FAIL() << "Failed to initialize devstral tokenizer: " << e.what();
+        } catch (...) {
+            FAIL() << "Failed to initialize devstral tokenizer due to unknown error.";
+        }
+    }
+
+    static void TearDownTestSuite() {
+        devstralTokenizer.reset();
+    }
+
+    void SetUp() override {
+        // declare tools_schema
+        static std::map<std::string, std::string> toolSchemasInput = {
+            {"example_tool", R"({"properties": {"arg1": {"type": "string", "description": "A string argument."}}, "required": ["arg1"]})"},
+        };
+
+        static std::vector<std::unique_ptr<rapidjson::Document>> schemaDocsStorage;
+
+        auto convertStringToolSchemasStringToToolsSchemas = [](
+                                                                const std::map<std::string, std::string>& input) -> ToolsSchemas_t {
+            ToolsSchemas_t result;
+            schemaDocsStorage.clear();
+            for (const auto& [name, schemaStr] : input) {
+                auto schemaDoc = std::make_unique<rapidjson::Document>();
+                if (schemaDoc->Parse(schemaStr.c_str()).HasParseError()) {
+                    throw std::runtime_error("Failed to parse schema for tool: " + name);
+                }
+                result[name] = {schemaDoc.get(), schemaStr};
+                schemaDocsStorage.push_back(std::move(schemaDoc));
+            }
+            return result;
+        };
+
+        static ovms::ToolsSchemas_t toolsSchemas = convertStringToolSchemasStringToToolsSchemas(toolSchemasInput);
+        outputParserWithRegularToolParsing = std::make_unique<OutputParser>(*devstralTokenizer, "devstral", "", toolsSchemas);
+    }
+};
+
+TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithSingleToolCall) {
+    std::string input = "[TOOL_CALLS]example_tool[ARGS]{\"arg1\":\"value1 with new line \\n and \"quote\" and slash \\ \",\"arg2\":42}";
+    std::string testInput = input;
+    auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids;
+    std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
+    ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true);
+    EXPECT_EQ(parsedOutput.content, "");
+    EXPECT_EQ(parsedOutput.reasoning, "");
+    ASSERT_EQ(parsedOutput.toolCalls.size(), 1);
+    EXPECT_EQ(parsedOutput.toolCalls[0].name, "example_tool");
+    EXPECT_EQ(parsedOutput.toolCalls[0].arguments, "{\"arg1\":\"value1 with new line \\n and \"quote\" and slash \\ \",\"arg2\":42}");
+    EXPECT_EQ(parsedOutput.toolCalls[0].id.empty(), false);
+}
+
+TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithSingleToolCall_MissingEndTag) {
+    std::string testInput = "Reasoning before tool call [TOOL_CALLS] example_tool [ARGS]{\"arg1\":\"value1\",\"arg2\":42}";
+    auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids;
+    std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
+    ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true);
+    EXPECT_EQ(parsedOutput.content, "Reasoning before tool call ");
+    EXPECT_EQ(parsedOutput.reasoning, "");
+    ASSERT_EQ(parsedOutput.toolCalls.size(), 1);
+    EXPECT_EQ(parsedOutput.toolCalls[0].name, "example_tool");
+    EXPECT_EQ(parsedOutput.toolCalls[0].arguments, "{\"arg1\":\"value1\",\"arg2\":42}");
+    EXPECT_EQ(parsedOutput.toolCalls[0].id.empty(), false);
+}
+
+TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithSingleToolCall_EmptyArguments) {
+    std::string testInput = "Reasoning before tool call [TOOL_CALLS]example_tool[ARGS]";
+    auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids;
+    std::vector<int64_t> generatedTokens(generatedTensor.data<int64_t>(), generatedTensor.data<int64_t>() + generatedTensor.get_size());
+    ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true);
+    EXPECT_EQ(parsedOutput.content, "Reasoning before tool call ");
"); + EXPECT_EQ(parsedOutput.reasoning, ""); + ASSERT_EQ(parsedOutput.toolCalls.size(), 1); + EXPECT_EQ(parsedOutput.toolCalls[0].name, "example_tool"); + EXPECT_EQ(parsedOutput.toolCalls[0].arguments, "{}"); + EXPECT_EQ(parsedOutput.toolCalls[0].id.empty(), false); +} + +TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithContentAndNoToolCalls) { + std::string input = "This is a regular model response without tool calls."; + auto generatedTensor = devstralTokenizer->encode(input, ov::genai::add_special_tokens(false)).input_ids; + std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); + ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true); + EXPECT_EQ(parsedOutput.content, "This is a regular model response without tool calls."); + ASSERT_EQ(parsedOutput.toolCalls.size(), 0); + EXPECT_EQ(parsedOutput.reasoning, ""); +} + +TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithContentAndSingleToolCall) { + std::string testInput = "Reasoning before tool call [TOOL_CALLS]example_tool[ARGS]{\"arg1\":\"value1\",\"arg2\":42}"; + auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids; + std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); + ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true); + EXPECT_EQ(parsedOutput.content, "Reasoning before tool call "); + EXPECT_EQ(parsedOutput.reasoning, ""); + ASSERT_EQ(parsedOutput.toolCalls.size(), 1); + EXPECT_EQ(parsedOutput.toolCalls[0].name, "example_tool"); + EXPECT_EQ(parsedOutput.toolCalls[0].arguments, "{\"arg1\":\"value1\",\"arg2\":42}"); + EXPECT_EQ(parsedOutput.toolCalls[0].id.empty(), false); +} + +TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithInvalidOrder) { + std::string testInput = "Reasoning before tool call [ARGS]example_tool[TOOL_CALLS]{\"arg1\":\"value1\",\"arg2\":42}"; + auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids; + std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); + ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true); + EXPECT_EQ(parsedOutput.content, "Reasoning before tool call example_tool{\"arg1\":\"value1\",\"arg2\":42}"); + EXPECT_EQ(parsedOutput.reasoning, ""); + ASSERT_EQ(parsedOutput.toolCalls.size(), 0); +} + +TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithMissingArgsTag) { + std::string input = "Some content [TOOL_CALLS]example_tool{\"arg1\":\"value1\",\"arg2\":42}"; + std::string testInput = input; + auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids; + std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); + ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true); + // Same expected content as tokenizer does not add special tokens + EXPECT_EQ(parsedOutput.content, "Some content example_tool{\"arg1\":\"value1\",\"arg2\":42}"); + EXPECT_EQ(parsedOutput.reasoning, ""); + ASSERT_EQ(parsedOutput.toolCalls.size(), 0); +} + +TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithArrayArguments) { + std::string input = 
"[TOOL_CALLS]example_tool[ARGS]{\"filepath\":\"/var/log/db.log\",\"status\":[\"completed\",\"failed\"],\"encoding\":\"utf-8\",\"processFunction\":\"processFunction\"}"; + std::string testInput = input; + auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids; + std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); + ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true); + EXPECT_EQ(parsedOutput.content, ""); + EXPECT_EQ(parsedOutput.reasoning, ""); + ASSERT_EQ(parsedOutput.toolCalls.size(), 1); + EXPECT_EQ(parsedOutput.toolCalls[0].name, "example_tool"); + EXPECT_EQ(parsedOutput.toolCalls[0].arguments, "{\"filepath\":\"/var/log/db.log\",\"status\":[\"completed\",\"failed\"],\"encoding\":\"utf-8\",\"processFunction\":\"processFunction\"}"); + EXPECT_EQ(parsedOutput.toolCalls[0].id.empty(), false); +} + +TEST_F(DevstralOutputParserTest, ParseToolCallOutputWithInvalidArguments) { + std::string input = "[TOOL_CALLS]example_tool[ARGS]{ \"filepath\": \"/var/log/db.log\", \"status\": "; + std::string testInput = input; + auto generatedTensor = devstralTokenizer->encode(testInput, ov::genai::add_special_tokens(false)).input_ids; + std::vector generatedTokens(generatedTensor.data(), generatedTensor.data() + generatedTensor.get_size()); + ParsedOutput parsedOutput = outputParserWithRegularToolParsing->parse(generatedTokens, true); + EXPECT_EQ(parsedOutput.content, ""); + EXPECT_EQ(parsedOutput.reasoning, ""); + ASSERT_EQ(parsedOutput.toolCalls.size(), 1); + EXPECT_EQ(parsedOutput.toolCalls[0].name, "example_tool"); + EXPECT_EQ(parsedOutput.toolCalls[0].arguments, "{ \"filepath\": \"/var/log/db.log\", \"status\": "); + EXPECT_EQ(parsedOutput.toolCalls[0].id.empty(), false); +} + +TEST_F(DevstralOutputParserTest, HolisticStreaming) { + std::vector>> chunkToDeltaVec{ + // Tool call phase + // Starting first tool. Collecting chunk until full name is received. Don't return until then. 
+        {"Reasoning", ov::genai::GenerationFinishReason::NONE, R"({"delta":{"content":"Reasoning"}})"},
+        {"example", ov::genai::GenerationFinishReason::NONE, R"({"delta":{"content":"example"}})"},
+        {"[TOOL_CALLS]", ov::genai::GenerationFinishReason::NONE, std::nullopt},
+        {" get", ov::genai::GenerationFinishReason::NONE, std::nullopt},
+        {"_", ov::genai::GenerationFinishReason::NONE, std::nullopt},
+        {"weather", ov::genai::GenerationFinishReason::NONE, std::nullopt},
+        {" [ARGS]", ov::genai::GenerationFinishReason::NONE, R"({"delta":{"tool_calls":[{"id":"XXXXXXXXX","type":"function","index":0,"function":{"name":"get_weather"}}]}})"},
+        {"{\"", ov::genai::GenerationFinishReason::NONE, R"({"delta":{"tool_calls":[{"index":0,"function":{"arguments":"{\""}}]}})"},
+        {"city\":", ov::genai::GenerationFinishReason::NONE, R"({"delta":{"tool_calls":[{"index":0,"function":{"arguments":"city\":"}}]}})"},
+        {" \"Paris", ov::genai::GenerationFinishReason::NONE, R"({"delta":{"tool_calls":[{"index":0,"function":{"arguments":" \"Paris"}}]}})"},
+        {" \"capital of ", ov::genai::GenerationFinishReason::NONE, R"({"delta":{"tool_calls":[{"index":0,"function":{"arguments":" \"capital of "}}]}})"},
+        {"art\\vine \\n", ov::genai::GenerationFinishReason::NONE, R"({"delta":{"tool_calls":[{"index":0,"function":{"arguments":"art\\vine \\n"}}]}})"},
+        // Last chunk is added in the for loop below
+    };
+    ToolsSchemas_t tools_schemas = {
+        {"get_weather", ToolSchemaWrapper{}}};
+    for (auto lastFinishReason : {ov::genai::GenerationFinishReason::STOP, ov::genai::GenerationFinishReason::LENGTH}) {
+        // Need a new output parser per case to simulate separate request processing
+        outputParserWithRegularToolParsing = std::make_unique<OutputParser>(*devstralTokenizer, "devstral", "", tools_schemas);
+        auto chunkToDeltaVecCopy = chunkToDeltaVec;
+        if (lastFinishReason == ov::genai::GenerationFinishReason::STOP) {
+            chunkToDeltaVecCopy.push_back({"\"}", ov::genai::GenerationFinishReason::STOP, R"({"delta":{"tool_calls":[{"index":0,"function":{"arguments":"\"}"}}]}})"});
+        } else {
+            chunkToDeltaVecCopy.push_back({"\"}", ov::genai::GenerationFinishReason::LENGTH, R"({"delta":{"tool_calls":[{"index":0,"function":{"arguments":"\"}"}}]}})"});
+        }
+        int64_t chunkIteration = -1;
+        for (const auto& [chunk, finishReason, expectedDelta] : chunkToDeltaVecCopy) {
+            chunkIteration++;
+            std::optional<rapidjson::Document> doc = outputParserWithRegularToolParsing->parseChunk(chunk, true, finishReason);
+            if (!expectedDelta.has_value() && !doc.has_value()) {
+                continue;  // Both are nullopt, OK
+            }
+            if (expectedDelta.has_value() && doc.has_value()) {
+                rapidjson::StringBuffer buffer;
+                rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+                doc->Accept(writer);
+                std::string docStr = buffer.GetString();
+                // If both strings contain "id":"...", compare id values by length and alphanumeric, else compare whole strings
+                std::string expected = expectedDelta.value();
+                std::string idKey = "\"id\":\"";
+                auto docIdPos = docStr.find(idKey);
+                auto expectedIdPos = expected.find(idKey);
+                if (docIdPos != std::string::npos && expectedIdPos != std::string::npos) {
+                    auto docIdStart = docIdPos + idKey.size();
+                    auto docIdEnd = docStr.find("\"", docIdStart);
+                    auto expectedIdStart = expectedIdPos + idKey.size();
+                    auto expectedIdEnd = expected.find("\"", expectedIdStart);
+                    ASSERT_NE(docIdEnd, std::string::npos);
+                    ASSERT_NE(expectedIdEnd, std::string::npos);
+                    std::string docId = docStr.substr(docIdStart, docIdEnd - docIdStart);
+                    std::string expectedId = expected.substr(expectedIdStart, expectedIdEnd - expectedIdStart);
+                    EXPECT_EQ(docId.size(), expectedId.size()) << "ID length mismatch for chunk: " << chunk;
+                    EXPECT_TRUE(std::all_of(docId.begin(), docId.end(), ::isalnum)) << "ID not alphanumeric for chunk: " << chunk;
+                    // Compare everything except the id value
+                    std::string docStrNoId = docStr;
+                    std::string expectedNoId = expected;
+                    docStrNoId.replace(docIdStart, docId.size(), std::string(docId.size(), '*'));
+                    expectedNoId.replace(expectedIdStart, expectedId.size(), std::string(expectedId.size(), '*'));
+                    EXPECT_EQ(docStrNoId, expectedNoId) << "Mismatch for chunk (ignoring id value): " << chunk;
+                } else {
+                    EXPECT_EQ(docStr, expected) << "Mismatch for chunk: [" << chunk << "] got [" << docStr << "] but expected [" << expected << "]" << chunkIteration;
+                }
+            } else if (expectedDelta.has_value()) {
+                FAIL() << "Mismatch for chunk: [" << chunk << "] got nothing but expected [" << expectedDelta.value() << "]" << chunkIteration;
+            } else if (doc.has_value()) {
+                rapidjson::StringBuffer buffer;
+                rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+                doc->Accept(writer);
+                std::string docStr = buffer.GetString();
+                FAIL() << "Mismatch for chunk: [" << chunk << "] expected nothing but got [" << docStr << "]" << chunkIteration;
+            } else {
+                FAIL() << "Mismatch for chunk: [" << chunk << "] " << chunkIteration;
+            }
+        }
+    }
+}
+
+TEST_F(DevstralOutputParserTest, EmptyArgumentsStreaming) {
+    std::vector<std::tuple<std::string, ov::genai::GenerationFinishReason, std::optional<std::string>>> chunkToDeltaVec{
+        // Tool call phase
+        // Starting the first tool. Collect chunks until the full name is received; don't return anything until then.
+        {"[TOOL_CALLS]", ov::genai::GenerationFinishReason::NONE, std::nullopt},
+        {"list", ov::genai::GenerationFinishReason::NONE, std::nullopt},
+        {"_", ov::genai::GenerationFinishReason::NONE, std::nullopt},
+        {"tools", ov::genai::GenerationFinishReason::NONE, std::nullopt},
+        {"[ARGS]", ov::genai::GenerationFinishReason::NONE, R"({"delta":{"tool_calls":[{"id":"XXXXXXXXX","type":"function","index":0,"function":{"name":"list_tools"}}]}})"},
+        {"</s>", ov::genai::GenerationFinishReason::NONE, R"({"delta":{"tool_calls":[{"index":0,"function":{"arguments":"{}"}}]}})"},
+    };
+    ToolsSchemas_t tools_schemas = {
+        {"list_tools", ToolSchemaWrapper{}}};
+
+    int64_t chunkIteration = 0;
+    for (const auto& [chunk, finishReason, expectedDelta] : chunkToDeltaVec) {
+        chunkIteration++;
+        std::optional<rapidjson::Document> doc = outputParserWithRegularToolParsing->parseChunk(chunk, true, finishReason);
+        if (!expectedDelta.has_value() && !doc.has_value()) {
+            continue;  // Both are nullopt, OK
+        }
+        if (expectedDelta.has_value() && doc.has_value()) {
+            rapidjson::StringBuffer buffer;
+            rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+            doc->Accept(writer);
+            std::string docStr = buffer.GetString();
+            // If both strings contain "id":"...", compare id values by length and alphanumeric, else compare whole strings
+            std::string expected = expectedDelta.value();
+            std::string idKey = "\"id\":\"";
+            auto docIdPos = docStr.find(idKey);
+            auto expectedIdPos = expected.find(idKey);
+            if (docIdPos != std::string::npos && expectedIdPos != std::string::npos) {
+                auto docIdStart = docIdPos + idKey.size();
+                auto docIdEnd = docStr.find("\"", docIdStart);
+                auto expectedIdStart = expectedIdPos + idKey.size();
+                auto expectedIdEnd = expected.find("\"", expectedIdStart);
+                ASSERT_NE(docIdEnd, std::string::npos);
+                ASSERT_NE(expectedIdEnd, std::string::npos);
+                std::string docId = docStr.substr(docIdStart, docIdEnd - docIdStart);
+                std::string expectedId = expected.substr(expectedIdStart, expectedIdEnd - expectedIdStart);
+                EXPECT_EQ(docId.size(), expectedId.size()) << "ID length mismatch for chunk: " << chunk;
+                EXPECT_TRUE(std::all_of(docId.begin(), docId.end(), ::isalnum)) << "ID not alphanumeric for chunk: " << chunk;
+                // Compare everything except the id value
+                std::string docStrNoId = docStr;
+                std::string expectedNoId = expected;
+                docStrNoId.replace(docIdStart, docId.size(), std::string(docId.size(), '*'));
+                expectedNoId.replace(expectedIdStart, expectedId.size(), std::string(expectedId.size(), '*'));
+                EXPECT_EQ(docStrNoId, expectedNoId) << "Mismatch for chunk (ignoring id value): " << chunk;
+            } else {
+                EXPECT_EQ(docStr, expected) << "Mismatch for chunk: [" << chunk << "] got [" << docStr << "] but expected [" << expected << "]" << chunkIteration;
+            }
+        } else if (expectedDelta.has_value()) {
+            FAIL() << "Mismatch for chunk: [" << chunk << "] got nothing but expected [" << expectedDelta.value() << "]" << chunkIteration;
+        } else if (doc.has_value()) {
+            rapidjson::StringBuffer buffer;
+            rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+            doc->Accept(writer);
+            std::string docStr = buffer.GetString();
+            FAIL() << "Mismatch for chunk: [" << chunk << "] expected nothing but got [" << docStr << "]" << chunkIteration;
+        } else {
+            FAIL() << "Mismatch for chunk: [" << chunk << "] " << chunkIteration;
+        }
+    }
+}
+
+TEST_F(DevstralOutputParserTest, ToolCallsWithoutToolsInTheRequestStreaming) {
+    std::vector<std::pair<std::string, std::optional<std::string>>> chunkToDeltaVec{
+        // Tool parser is available, but tools are not in the request so every chunk is just regular content
+        {"[TOOL_CALLS]", "{\"delta\":{\"content\":\"[TOOL_CALLS]\"}}"},
+        {"get_", "{\"delta\":{\"content\":\"get_\"}}"},
+        {"weather", "{\"delta\":{\"content\":\"weather\"}}"},
+        {"[ARGS]", "{\"delta\":{\"content\":\"[ARGS]\"}}"},
+        {"{\"", "{\"delta\":{\"content\":\"{\\\"\"}}"},
+        {"city\":", "{\"delta\":{\"content\":\"city\\\":\"}}"},
+        {"\"Paris\"", "{\"delta\":{\"content\":\"\\\"Paris\\\"\"}}"},
+        {"}", "{\"delta\":{\"content\":\"}\"}}"},
+    };
+
+    for (const auto& [chunk, expectedDelta] : chunkToDeltaVec) {
+        // Second argument is false as we simulate the case where tools have not been provided in the request
+        std::optional<rapidjson::Document> doc = outputParserWithRegularToolParsing->parseChunk(chunk, false, ov::genai::GenerationFinishReason::NONE);
+        if (!expectedDelta.has_value() && !doc.has_value()) {
+            continue;  // Both are nullopt, OK
+        }
+        if (expectedDelta.has_value() && doc.has_value()) {
+            rapidjson::StringBuffer buffer;
+            rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
+            doc->Accept(writer);
+            std::string docStr = buffer.GetString();
+            std::string expected = expectedDelta.value();
+            EXPECT_EQ(docStr, expected) << "Mismatch for chunk: " << chunk;
+        } else {
+            FAIL() << "Mismatch between expectedDelta and doc for chunk: " << chunk;
+        }
+    }
+}
diff --git a/windows_prepare_llm_models.bat b/windows_prepare_llm_models.bat
index 88c6d04a17..de2512fe1a 100644
--- a/windows_prepare_llm_models.bat
+++ b/windows_prepare_llm_models.bat
@@ -33,8 +33,6 @@ set "RERANK_MODEL=BAAI/bge-reranker-base"
 set "TEXT_GENERATION_MODEL=HuggingFaceTB/SmolLM2-360M-Instruct"
 set "FACEBOOK_MODEL=facebook/opt-125m"
 set "VLM_MODEL=OpenGVLab/InternVL2-1B"
-set "TOKENIZER_FILE=openvino_tokenizer.bin"
-set "LEGACY_MODEL_FILE=1\model.bin"
 
 :: Models for tools testing. Only tokenizers are downloaded.
set "QWEN3_MODEL=Qwen/Qwen3-8B" @@ -43,6 +41,7 @@ set "HERMES3_MODEL=NousResearch/Hermes-3-Llama-3.1-8B" set "PHI4_MODEL=microsoft/Phi-4-mini-instruct" set "MISTRAL_MODEL=mistralai/Mistral-7B-Instruct-v0.3" set "GPTOSS_MODEL=openai/gpt-oss-20b" +set "DEVSTRAL_MODEL=unsloth/Devstral-Small-2507" echo Downloading LLM testing models to directory %~1 set "PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly" @@ -51,7 +50,6 @@ C:\opt\Python312\python.exe -m venv .venv if !errorlevel! neq 0 exit /b !errorlevel! call .\.venv\Scripts\Activate.bat if !errorlevel! neq 0 exit /b !errorlevel! -set python -m pip install --upgrade pip if !errorlevel! neq 0 exit /b !errorlevel! pip install -U -r demos\common\export_models\requirements.txt @@ -59,160 +57,63 @@ if !errorlevel! neq 0 exit /b !errorlevel! if not exist "%~1" mkdir "%~1" -if exist "%~1\%TEXT_GENERATION_MODEL%\%TOKENIZER_FILE%" ( - echo Models file %~1\%TEXT_GENERATION_MODEL%\%TOKENIZER_FILE% exists. Skipping downloading models. -) else ( - echo Downloading text generation model to %~1\%TEXT_GENERATION_MODEL% directory. - python demos\common\export_models\export_model.py text_generation --source_model "%TEXT_GENERATION_MODEL%" --weight-format int8 --model_repository_path %~1 - if !errorlevel! neq 0 exit /b !errorlevel! -) -if not exist "%~1\%TEXT_GENERATION_MODEL%\%TOKENIZER_FILE%" ( - echo Models file %~1\%TEXT_GENERATION_MODEL%\%TOKENIZER_FILE% does not exists. - exit /b 1 -) -if exist "%~1\%FACEBOOK_MODEL%\%TOKENIZER_FILE%" ( - echo Models file %~1\%FACEBOOK_MODEL%\%TOKENIZER_FILE% exists. Skipping downloading models. -) else ( - echo Downloading text generation model to %~1\%FACEBOOK_MODEL% directory. - python demos\common\export_models\export_model.py text_generation --source_model "%FACEBOOK_MODEL%" --weight-format int8 --model_repository_path %~1 - if !errorlevel! neq 0 exit /b !errorlevel! -) -if not exist "%~1\%FACEBOOK_MODEL%\%TOKENIZER_FILE%" ( - echo Models file %~1\%FACEBOOK_MODEL%\%TOKENIZER_FILE% does not exist. - exit /b 1 -) +:: Export models +call :download_export_model "%VLM_MODEL%" "text_generation" "--weight-format int4" "%~1" +call :download_export_model "%TEXT_GENERATION_MODEL%" "text_generation" "--weight-format int8" "%~1" +call :download_export_model "%FACEBOOK_MODEL%" "text_generation" "--weight-format int8" "%~1" +call :download_export_model "%RERANK_MODEL%" "rerank_ov" "--weight-format int8 --model_name %RERANK_MODEL%\ov" "%~1" +call :download_export_model "%EMBEDDING_MODEL%" "embeddings_ov" "--weight-format int8 --model_name %EMBEDDING_MODEL%\ov" "%~1" -if not exist "%~1\%TEXT_GENERATION_MODEL%\chat_template.jinja" ( - echo Copying dummy chat template to %TEXT_GENERATION_MODEL% model directory. - copy /Y "src\test\llm\dummy_facebook_template.jinja" "%~1\%TEXT_GENERATION_MODEL%\chat_template.jinja" +if not exist "%~1\%FACEBOOK_MODEL%\chat_template.jinja" ( + echo Copying dummy chat template to %FACEBOOK_MODEL% model directory. + copy /Y "src\test\llm\dummy_facebook_template.jinja" "%~1\%FACEBOOK_MODEL%\chat_template.jinja" if !errorlevel! neq 0 exit /b !errorlevel! ) -if exist "%~1\%EMBEDDING_MODEL%\ov\%TOKENIZER_FILE%" ( - echo Models file %~1\%EMBEDDING_MODEL%\ov\%TOKENIZER_FILE% exists. Skipping downloading models. -) else ( - echo Downloading embeddings model to %~1\%EMBEDDING_MODEL%\ov directory. 
- python demos\common\export_models\export_model.py embeddings_ov --source_model "%EMBEDDING_MODEL%" --weight-format int8 --model_repository_path %~1 --model_name "%EMBEDDING_MODEL%\ov" - if !errorlevel! neq 0 exit /b !errorlevel! -) -if not exist "%~1\%EMBEDDING_MODEL%\ov\%TOKENIZER_FILE%" ( - echo Models file %~1\%EMBEDDING_MODEL%\ov\%TOKENIZER_FILE% does not exists. - exit /b 1 -) - -if exist "%~1\%RERANK_MODEL%\rerank\%LEGACY_MODEL_FILE%" ( - echo Models file %~1\%RERANK_MODEL%\rerank\%LEGACY_MODEL_FILE% exists. Skipping downloading models. -) else ( - echo Downloading rerank model to %~1\%RERANK_MODEL% directory. - python demos\common\export_models\export_model.py rerank --source_model "%RERANK_MODEL%" --weight-format int8 --model_repository_path %~1 - if !errorlevel! neq 0 exit /b !errorlevel! -) -if not exist "%~1\%RERANK_MODEL%\rerank\%LEGACY_MODEL_FILE%" ( - echo Models file %~1\%RERANK_MODEL%\rerank\%LEGACY_MODEL_FILE% does not exists. - exit /b 1 -) - -if exist "%~1\%RERANK_MODEL%\ov\%TOKENIZER_FILE%" ( - echo Models file %~1\%RERANK_MODEL%\ov\%TOKENIZER_FILE% exists. Skipping downloading models. -) else ( - echo Downloading rerank model to %~1\%RERANK_MODEL%\ov directory. - python demos\common\export_models\export_model.py rerank_ov --source_model "%RERANK_MODEL%" --weight-format int8 --model_repository_path %~1 --model_name "%RERANK_MODEL%\ov" - if !errorlevel! neq 0 exit /b !errorlevel! -) -if not exist "%~1\%RERANK_MODEL%\ov\%TOKENIZER_FILE%" ( - echo Models file %~1\%RERANK_MODEL%\ov\%TOKENIZER_FILE% does not exists. - exit /b 1 -) - -if exist "%~1\%VLM_MODEL%\%TOKENIZER_FILE%" ( - echo Models file %~1\%VLM_MODEL%\%TOKENIZER_FILE% exists. Skipping downloading models. -) else ( - echo Downloading visual language model to %~1\%VLM_MODEL% directory. - python demos\common\export_models\export_model.py text_generation --source_model "%VLM_MODEL%" --weight-format int4 --kv_cache_precision u8 --model_repository_path %~1 - if !errorlevel! neq 0 exit /b !errorlevel! -) -if not exist "%~1\%VLM_MODEL%\%TOKENIZER_FILE%" ( - echo Models file %~1\%VLM_MODEL%\%TOKENIZER_FILE% does not exists. - exit /b 1 -) - -if exist "%~1\%QWEN3_MODEL%\%TOKENIZER_FILE%" ( - echo Models file %~1\%QWEN3_MODEL%\%TOKENIZER_FILE% exists. Skipping downloading models. -) else ( - echo Downloading tokenizer and detokenizer for Qwen3 model to %~1\%QWEN3_MODEL% directory. - mkdir "%~1\%QWEN3_MODEL%" - convert_tokenizer "%QWEN3_MODEL%" --with_detokenizer -o "%~1\%QWEN3_MODEL%" - if !errorlevel! neq 0 exit /b !errorlevel! -) -if not exist "%~1\%QWEN3_MODEL%\%TOKENIZER_FILE%" ( - echo Models file %~1\%QWEN3_MODEL%\%TOKENIZER_FILE% does not exists. - exit /b 1 -) +:: Download tokenizers for tools testing +call :download_tokenizer "%QWEN3_MODEL%" "%~1\%QWEN3_MODEL%" +call :download_tokenizer "%LLAMA3_MODEL%" "%~1\%LLAMA3_MODEL%" +call :download_tokenizer "%HERMES3_MODEL%" "%~1\%HERMES3_MODEL%" +call :download_tokenizer "%PHI4_MODEL%" "%~1\%PHI4_MODEL%" +call :download_tokenizer "%MISTRAL_MODEL%" "%~1\%MISTRAL_MODEL%" +call :download_tokenizer "%GPTOSS_MODEL%" "%~1\%GPTOSS_MODEL%" +call :download_tokenizer "%DEVSTRAL_MODEL%" "%~1\%DEVSTRAL_MODEL%" -if exist "%~1\%LLAMA3_MODEL%\%TOKENIZER_FILE%" ( - echo Models file %~1\%LLAMA3_MODEL%\%TOKENIZER_FILE% exists. Skipping downloading models. -) else ( - echo Downloading tokenizer and detokenizer for Llama3.1 model to %~1\%LLAMA3_MODEL% directory. - mkdir "%~1\%LLAMA3_MODEL%" - convert_tokenizer "%LLAMA3_MODEL%" --with_detokenizer -o "%~1\%LLAMA3_MODEL%" - if !errorlevel! 
neq 0 exit /b !errorlevel!
-)
-if not exist "%~1\%LLAMA3_MODEL%\%TOKENIZER_FILE%" (
-    echo Models file %~1\%LLAMA3_MODEL%\%TOKENIZER_FILE% does not exists.
-    exit /b 1
-)
+exit /b 0
 
-if exist "%~1\%HERMES3_MODEL%\%TOKENIZER_FILE%" (
-    echo Models file %~1\%HERMES3_MODEL%\%TOKENIZER_FILE% exists. Skipping downloading models.
-) else (
-    echo Downloading tokenizer and detokenizer for Hermes3 model to %~1\%HERMES3_MODEL% directory.
-    mkdir "%~1\%HERMES3_MODEL%"
-    convert_tokenizer "%HERMES3_MODEL%" --with_detokenizer -o "%~1\%HERMES3_MODEL%"
-    if !errorlevel! neq 0 exit /b !errorlevel!
-)
+:: Helper subroutine to download export models
+:download_export_model
+set "model=%~1"
+set "model_type=%~2"
+set "export_args=%~3"
+set "repository=%~4"
-if not exist "%~1\%HERMES3_MODEL%\%TOKENIZER_FILE%" (
-    echo Models file %~1\%HERMES3_MODEL%\%TOKENIZER_FILE% does not exists.
-    exit /b 1
-)
 
-if exist "%~1\%PHI4_MODEL%\%TOKENIZER_FILE%" (
-    echo Models file %~1\%PHI4_MODEL%\%TOKENIZER_FILE% exists. Skipping downloading models.
+if not exist "%repository%\%model%\openvino_tokenizer.bin" (
+    echo Downloading %model_type% model to %repository%\%model% directory.
+    python demos\common\export_models\export_model.py %model_type% --source_model "%model%" %export_args% --model_repository_path %repository%
+    if !errorlevel! neq 0 exit /b !errorlevel!
 ) else (
-    echo Downloading tokenizer and detokenizer for Phi-4 model to %~1\%PHI4_MODEL% directory.
-    mkdir "%~1\%PHI4_MODEL%"
-    convert_tokenizer "%PHI4_MODEL%" --with_detokenizer -o "%~1\%PHI4_MODEL%"
-    if !errorlevel! neq 0 exit /b !errorlevel!
+    echo Models file %repository%\%model%\openvino_tokenizer.bin exists. Skipping downloading models.
 )
-if not exist "%~1\%PHI4_MODEL%\%TOKENIZER_FILE%" (
-    echo Models file %~1\%PHI4_MODEL%\%TOKENIZER_FILE% does not exists.
-    exit /b 1
-)
+exit /b 0
 
-if exist "%~1\%MISTRAL_MODEL%\%TOKENIZER_FILE%" (
-    echo Models file %~1\%MISTRAL_MODEL%\%TOKENIZER_FILE% exists. Skipping downloading models.
-) else (
-    echo Downloading tokenizer and detokenizer for Mistral model to %~1\%MISTRAL_MODEL% directory.
-    mkdir "%~1\%MISTRAL_MODEL%"
-    convert_tokenizer "%MISTRAL_MODEL%" --with_detokenizer -o "%~1\%MISTRAL_MODEL%"
-    if !errorlevel! neq 0 exit /b !errorlevel!
-)
-if not exist "%~1\%MISTRAL_MODEL%\%TOKENIZER_FILE%" (
-    echo Models file %~1\%MISTRAL_MODEL%\%TOKENIZER_FILE% does not exists.
-    exit /b 1
-)
+:: Helper subroutine to download tokenizers
+:download_tokenizer
+set "model=%~1"
+set "check_path=%~2"
 
-if exist "%~1\%GPTOSS_MODEL%\%TOKENIZER_FILE%" (
-    echo Models file %~1\%GPTOSS_MODEL%\%TOKENIZER_FILE% exists. Skipping downloading models.
+if exist "%check_path%\openvino_tokenizer.bin" (
+    echo Models file %check_path%\openvino_tokenizer.bin exists. Skipping downloading models.
 ) else (
-    echo Downloading tokenizer and detokenizer for GPT-OSS model to %~1\%GPTOSS_MODEL% directory.
-    mkdir "%~1\%GPTOSS_MODEL%"
-    convert_tokenizer "%GPTOSS_MODEL%" --with_detokenizer -o "%~1\%GPTOSS_MODEL%"
+    echo Downloading tokenizer and detokenizer for %model% model to %check_path% directory.
+    mkdir "%check_path%"
+    convert_tokenizer "%model%" --with_detokenizer -o "%check_path%"
     if !errorlevel! neq 0 exit /b !errorlevel!
 )
-if not exist "%~1\%GPTOSS_MODEL%\%TOKENIZER_FILE%" (
-    echo Models file %~1\%GPTOSS_MODEL%\%TOKENIZER_FILE% does not exists.
+if not exist "%check_path%\openvino_tokenizer.bin" (
+    echo Models file %check_path%\openvino_tokenizer.bin does not exist.
     exit /b 1
 )
+exit /b 0
 
 endlocal