diff --git a/astrbot/core/agent/runners/tool_loop_agent_runner.py b/astrbot/core/agent/runners/tool_loop_agent_runner.py index 7eb90f3fc..88e302ad7 100644 --- a/astrbot/core/agent/runners/tool_loop_agent_runner.py +++ b/astrbot/core/agent/runners/tool_loop_agent_runner.py @@ -77,10 +77,11 @@ async def reset( async def _iter_llm_responses(self) -> T.AsyncGenerator[LLMResponse, None]: """Yields chunks *and* a final LLMResponse.""" payload = { - "contexts": self.run_context.messages, + "contexts": self.run_context.messages, # list[Message] "func_tool": self.req.func_tool, "model": self.req.model, # NOTE: in fact, this arg is None in most cases "session_id": self.req.session_id, + "extra_user_content_parts": self.req.extra_user_content_parts, # list[ContentPart] } if self.streaming: diff --git a/astrbot/core/provider/entities.py b/astrbot/core/provider/entities.py index d13e9b56a..8f1bc442e 100644 --- a/astrbot/core/provider/entities.py +++ b/astrbot/core/provider/entities.py @@ -14,6 +14,7 @@ from astrbot import logger from astrbot.core.agent.message import ( AssistantMessageSegment, + ContentPart, ToolCall, ToolCallMessageSegment, ) @@ -92,6 +93,8 @@ class ProviderRequest: """会话 ID""" image_urls: list[str] = field(default_factory=list) """图片 URL 列表""" + extra_user_content_parts: list[ContentPart] = field(default_factory=list) + """额外的用户消息内容部分列表,用于在用户消息后添加额外的内容块(如系统提醒、指令等)。支持 dict 或 ContentPart 对象""" func_tool: ToolSet | None = None """可用的函数工具""" contexts: list[dict] = field(default_factory=list) @@ -166,13 +169,23 @@ def _print_friendly_context(self): async def assemble_context(self) -> dict: """将请求(prompt 和 image_urls)包装成 OpenAI 的消息格式。""" + # 构建内容块列表 + content_blocks = [] + + # 1. 用户原始发言(OpenAI 建议:用户发言在前) + if self.prompt and self.prompt.strip(): + content_blocks.append({"type": "text", "text": self.prompt}) + elif self.image_urls: + # 如果没有文本但有图片,添加占位文本 + content_blocks.append({"type": "text", "text": "[图片]"}) + + # 2. 额外的内容块(系统提醒、指令等) + if self.extra_user_content_parts: + for part in self.extra_user_content_parts: + content_blocks.append(part.model_dump()) + + # 3. 图片内容 if self.image_urls: - user_content = { - "role": "user", - "content": [ - {"type": "text", "text": self.prompt if self.prompt else "[图片]"}, - ], - } for image_url in self.image_urls: if image_url.startswith("http"): image_path = await download_image_by_url(image_url) @@ -185,11 +198,21 @@ async def assemble_context(self) -> dict: if not image_data: logger.warning(f"图片 {image_url} 得到的结果为空,将忽略。") continue - user_content["content"].append( + content_blocks.append( {"type": "image_url", "image_url": {"url": image_data}}, ) - return user_content - return {"role": "user", "content": self.prompt} + + # 只有当只有一个来自 prompt 的文本块且没有额外内容块时,才降级为简单格式以保持向后兼容 + if ( + len(content_blocks) == 1 + and content_blocks[0]["type"] == "text" + and not self.extra_user_content_parts + and not self.image_urls + ): + return {"role": "user", "content": content_blocks[0]["text"]} + + # 否则返回多模态格式 + return {"role": "user", "content": content_blocks} async def _encode_image_bs64(self, image_url: str) -> str: """将图片转换为 base64""" diff --git a/astrbot/core/provider/provider.py b/astrbot/core/provider/provider.py index 7f21a2ee1..6fb6d8953 100644 --- a/astrbot/core/provider/provider.py +++ b/astrbot/core/provider/provider.py @@ -4,7 +4,7 @@ from collections.abc import AsyncGenerator from typing import TypeAlias, Union -from astrbot.core.agent.message import Message +from astrbot.core.agent.message import ContentPart, Message from astrbot.core.agent.tool import ToolSet from astrbot.core.provider.entities import ( LLMResponse, @@ -103,6 +103,7 @@ async def text_chat( system_prompt: str | None = None, tool_calls_result: ToolCallsResult | list[ToolCallsResult] | None = None, model: str | None = None, + extra_user_content_parts: list[ContentPart] | None = None, **kwargs, ) -> LLMResponse: """获得 LLM 的文本对话结果。会使用当前的模型进行对话。 @@ -114,6 +115,7 @@ async def text_chat( tools: tool set contexts: 上下文,和 prompt 二选一使用 tool_calls_result: 回传给 LLM 的工具调用结果。参考: https://platform.openai.com/docs/guides/function-calling + extra_user_content_parts: 额外的内容块列表,用于在用户消息后添加额外的文本块(如系统提醒、指令等) kwargs: 其他参数 Notes: diff --git a/astrbot/core/provider/sources/anthropic_source.py b/astrbot/core/provider/sources/anthropic_source.py index 0ff61e393..d86b8393e 100644 --- a/astrbot/core/provider/sources/anthropic_source.py +++ b/astrbot/core/provider/sources/anthropic_source.py @@ -11,6 +11,7 @@ from astrbot import logger from astrbot.api.provider import Provider +from astrbot.core.agent.message import ContentPart, ImageURLPart, TextPart from astrbot.core.provider.entities import LLMResponse, TokenUsage from astrbot.core.provider.func_tool_manager import ToolSet from astrbot.core.utils.io import download_image_by_url @@ -296,13 +297,16 @@ async def text_chat( system_prompt=None, tool_calls_result=None, model=None, + extra_user_content_parts=None, **kwargs, ) -> LLMResponse: if contexts is None: contexts = [] new_record = None if prompt is not None: - new_record = await self.assemble_context(prompt, image_urls) + new_record = await self.assemble_context( + prompt, image_urls, extra_user_content_parts + ) context_query = self._ensure_message_to_dicts(contexts) if new_record: context_query.append(new_record) @@ -350,13 +354,16 @@ async def text_chat_stream( system_prompt=None, tool_calls_result=None, model=None, + extra_user_content_parts=None, **kwargs, ): if contexts is None: contexts = [] new_record = None if prompt is not None: - new_record = await self.assemble_context(prompt, image_urls) + new_record = await self.assemble_context( + prompt, image_urls, extra_user_content_parts + ) context_query = self._ensure_message_to_dicts(contexts) if new_record: context_query.append(new_record) @@ -388,15 +395,15 @@ async def text_chat_stream( async for llm_response in self._query_stream(payloads, func_tool): yield llm_response - async def assemble_context(self, text: str, image_urls: list[str] | None = None): + async def assemble_context( + self, + text: str, + image_urls: list[str] | None = None, + extra_user_content_parts: list[ContentPart] | None = None, + ): """组装上下文,支持文本和图片""" - if not image_urls: - return {"role": "user", "content": text} - - content = [] - content.append({"type": "text", "text": text}) - for image_url in image_urls: + async def resolve_image_url(image_url: str) -> dict | None: if image_url.startswith("http"): image_path = await download_image_by_url(image_url) image_data = await self.encode_image_bs64(image_path) @@ -408,28 +415,68 @@ async def assemble_context(self, text: str, image_urls: list[str] | None = None) if not image_data: logger.warning(f"图片 {image_url} 得到的结果为空,将忽略。") - continue + return None # Get mime type for the image mime_type, _ = guess_type(image_url) if not mime_type: mime_type = "image/jpeg" # Default to JPEG if can't determine - content.append( - { - "type": "image", - "source": { - "type": "base64", - "media_type": mime_type, - "data": ( - image_data.split("base64,")[1] - if "base64," in image_data - else image_data - ), - }, + return { + "type": "image", + "source": { + "type": "base64", + "media_type": mime_type, + "data": ( + image_data.split("base64,")[1] + if "base64," in image_data + else image_data + ), }, - ) + } + + content = [] + # 1. 用户原始发言(OpenAI 建议:用户发言在前) + if text: + content.append({"type": "text", "text": text}) + elif image_urls: + # 如果没有文本但有图片,添加占位文本 + content.append({"type": "text", "text": "[图片]"}) + elif extra_user_content_parts: + # 如果只有额外内容块,也需要添加占位文本 + content.append({"type": "text", "text": " "}) + + # 2. 额外的内容块(系统提醒、指令等) + if extra_user_content_parts: + for block in extra_user_content_parts: + if isinstance(block, TextPart): + content.append({"type": "text", "text": block.text}) + elif isinstance(block, ImageURLPart): + image_dict = await resolve_image_url(block.image_url.url) + if image_dict: + content.append(image_dict) + else: + raise ValueError(f"不支持的额外内容块类型: {type(block)}") + + # 3. 图片内容 + if image_urls: + for image_url in image_urls: + image_dict = await resolve_image_url(image_url) + if image_dict: + content.append(image_dict) + + # 如果只有主文本且没有额外内容块和图片,返回简单格式以保持向后兼容 + if ( + text + and not extra_user_content_parts + and not image_urls + and len(content) == 1 + and content[0]["type"] == "text" + ): + return {"role": "user", "content": content[0]["text"]} + + # 否则返回多模态格式 return {"role": "user", "content": content} async def encode_image_bs64(self, image_url: str) -> str: diff --git a/astrbot/core/provider/sources/gemini_source.py b/astrbot/core/provider/sources/gemini_source.py index 7f3700643..46358ac26 100644 --- a/astrbot/core/provider/sources/gemini_source.py +++ b/astrbot/core/provider/sources/gemini_source.py @@ -13,6 +13,7 @@ import astrbot.core.message.components as Comp from astrbot import logger from astrbot.api.provider import Provider +from astrbot.core.agent.message import ContentPart, ImageURLPart, TextPart from astrbot.core.message.message_event_result import MessageChain from astrbot.core.provider.entities import LLMResponse, TokenUsage from astrbot.core.provider.func_tool_manager import ToolSet @@ -680,13 +681,16 @@ async def text_chat( system_prompt=None, tool_calls_result=None, model=None, + extra_user_content_parts=None, **kwargs, ) -> LLMResponse: if contexts is None: contexts = [] new_record = None if prompt is not None: - new_record = await self.assemble_context(prompt, image_urls) + new_record = await self.assemble_context( + prompt, image_urls, extra_user_content_parts + ) context_query = self._ensure_message_to_dicts(contexts) if new_record: context_query.append(new_record) @@ -732,13 +736,16 @@ async def text_chat_stream( system_prompt=None, tool_calls_result=None, model=None, + extra_user_content_parts=None, **kwargs, ) -> AsyncGenerator[LLMResponse, None]: if contexts is None: contexts = [] new_record = None if prompt is not None: - new_record = await self.assemble_context(prompt, image_urls) + new_record = await self.assemble_context( + prompt, image_urls, extra_user_content_parts + ) context_query = self._ensure_message_to_dicts(contexts) if new_record: context_query.append(new_record) @@ -797,33 +804,75 @@ def set_key(self, key): self.chosen_api_key = key self._init_client() - async def assemble_context(self, text: str, image_urls: list[str] | None = None): + async def assemble_context( + self, + text: str, + image_urls: list[str] | None = None, + extra_user_content_parts: list[ContentPart] | None = None, + ): """组装上下文。""" - if image_urls: - user_content = { - "role": "user", - "content": [{"type": "text", "text": text if text else "[图片]"}], + + async def resolve_image_part(image_url: str) -> dict | None: + if image_url.startswith("http"): + image_path = await download_image_by_url(image_url) + image_data = await self.encode_image_bs64(image_path) + elif image_url.startswith("file:///"): + image_path = image_url.replace("file:///", "") + image_data = await self.encode_image_bs64(image_path) + else: + image_data = await self.encode_image_bs64(image_url) + if not image_data: + logger.warning(f"图片 {image_url} 得到的结果为空,将忽略。") + return None + return { + "type": "image_url", + "image_url": {"url": image_data}, } - for image_url in image_urls: - if image_url.startswith("http"): - image_path = await download_image_by_url(image_url) - image_data = await self.encode_image_bs64(image_path) - elif image_url.startswith("file:///"): - image_path = image_url.replace("file:///", "") - image_data = await self.encode_image_bs64(image_path) + + # 构建内容块列表 + content_blocks = [] + + # 1. 用户原始发言(OpenAI 建议:用户发言在前) + if text: + content_blocks.append({"type": "text", "text": text}) + elif image_urls: + # 如果没有文本但有图片,添加占位文本 + content_blocks.append({"type": "text", "text": "[图片]"}) + elif extra_user_content_parts: + # 如果只有额外内容块,也需要添加占位文本 + content_blocks.append({"type": "text", "text": " "}) + + # 2. 额外的内容块(系统提醒、指令等) + if extra_user_content_parts: + for part in extra_user_content_parts: + if isinstance(part, TextPart): + content_blocks.append({"type": "text", "text": part.text}) + elif isinstance(part, ImageURLPart): + image_part = await resolve_image_part(part.image_url.url) + if image_part: + content_blocks.append(image_part) else: - image_data = await self.encode_image_bs64(image_url) - if not image_data: - logger.warning(f"图片 {image_url} 得到的结果为空,将忽略。") - continue - user_content["content"].append( - { - "type": "image_url", - "image_url": {"url": image_data}, - }, - ) - return user_content - return {"role": "user", "content": text} + raise ValueError(f"不支持的额外内容块类型: {type(part)}") + + # 3. 图片内容 + if image_urls: + for image_url in image_urls: + image_part = await resolve_image_part(image_url) + if image_part: + content_blocks.append(image_part) + + # 如果只有主文本且没有额外内容块和图片,返回简单格式以保持向后兼容 + if ( + text + and not extra_user_content_parts + and not image_urls + and len(content_blocks) == 1 + and content_blocks[0]["type"] == "text" + ): + return {"role": "user", "content": content_blocks[0]["text"]} + + # 否则返回多模态格式 + return {"role": "user", "content": content_blocks} async def encode_image_bs64(self, image_url: str) -> str: """将图片转换为 base64""" diff --git a/astrbot/core/provider/sources/openai_source.py b/astrbot/core/provider/sources/openai_source.py index a716d0a5a..1212e8b00 100644 --- a/astrbot/core/provider/sources/openai_source.py +++ b/astrbot/core/provider/sources/openai_source.py @@ -17,7 +17,7 @@ import astrbot.core.message.components as Comp from astrbot import logger from astrbot.api.provider import Provider -from astrbot.core.agent.message import Message +from astrbot.core.agent.message import ContentPart, ImageURLPart, Message, TextPart from astrbot.core.agent.tool import ToolSet from astrbot.core.message.message_event_result import MessageChain from astrbot.core.provider.entities import LLMResponse, TokenUsage, ToolCallsResult @@ -348,6 +348,7 @@ async def _prepare_chat_payload( system_prompt: str | None = None, tool_calls_result: ToolCallsResult | list[ToolCallsResult] | None = None, model: str | None = None, + extra_user_content_parts: list[ContentPart] | None = None, **kwargs, ) -> tuple: """准备聊天所需的有效载荷和上下文""" @@ -355,7 +356,9 @@ async def _prepare_chat_payload( contexts = [] new_record = None if prompt is not None: - new_record = await self.assemble_context(prompt, image_urls) + new_record = await self.assemble_context( + prompt, image_urls, extra_user_content_parts + ) context_query = self._ensure_message_to_dicts(contexts) if new_record: context_query.append(new_record) @@ -476,6 +479,7 @@ async def text_chat( system_prompt=None, tool_calls_result=None, model=None, + extra_user_content_parts=None, **kwargs, ) -> LLMResponse: payloads, context_query = await self._prepare_chat_payload( @@ -485,6 +489,7 @@ async def text_chat( system_prompt, tool_calls_result, model=model, + extra_user_content_parts=extra_user_content_parts, **kwargs, ) @@ -624,33 +629,71 @@ async def assemble_context( self, text: str, image_urls: list[str] | None = None, + extra_user_content_parts: list[ContentPart] | None = None, ) -> dict: """组装成符合 OpenAI 格式的 role 为 user 的消息段""" - if image_urls: - user_content = { - "role": "user", - "content": [{"type": "text", "text": text if text else "[图片]"}], + + async def resolve_image_part(image_url: str) -> dict | None: + if image_url.startswith("http"): + image_path = await download_image_by_url(image_url) + image_data = await self.encode_image_bs64(image_path) + elif image_url.startswith("file:///"): + image_path = image_url.replace("file:///", "") + image_data = await self.encode_image_bs64(image_path) + else: + image_data = await self.encode_image_bs64(image_url) + if not image_data: + logger.warning(f"图片 {image_url} 得到的结果为空,将忽略。") + return None + return { + "type": "image_url", + "image_url": {"url": image_data}, } - for image_url in image_urls: - if image_url.startswith("http"): - image_path = await download_image_by_url(image_url) - image_data = await self.encode_image_bs64(image_path) - elif image_url.startswith("file:///"): - image_path = image_url.replace("file:///", "") - image_data = await self.encode_image_bs64(image_path) + + # 构建内容块列表 + content_blocks = [] + + # 1. 用户原始发言(OpenAI 建议:用户发言在前) + if text: + content_blocks.append({"type": "text", "text": text}) + elif image_urls: + # 如果没有文本但有图片,添加占位文本 + content_blocks.append({"type": "text", "text": "[图片]"}) + elif extra_user_content_parts: + # 如果只有额外内容块,也需要添加占位文本 + content_blocks.append({"type": "text", "text": " "}) + + # 2. 额外的内容块(系统提醒、指令等) + if extra_user_content_parts: + for part in extra_user_content_parts: + if isinstance(part, TextPart): + content_blocks.append({"type": "text", "text": part.text}) + elif isinstance(part, ImageURLPart): + image_part = await resolve_image_part(part.image_url.url) + if image_part: + content_blocks.append(image_part) else: - image_data = await self.encode_image_bs64(image_url) - if not image_data: - logger.warning(f"图片 {image_url} 得到的结果为空,将忽略。") - continue - user_content["content"].append( - { - "type": "image_url", - "image_url": {"url": image_data}, - }, - ) - return user_content - return {"role": "user", "content": text} + raise ValueError(f"不支持的额外内容块类型: {type(part)}") + + # 3. 图片内容 + if image_urls: + for image_url in image_urls: + image_part = await resolve_image_part(image_url) + if image_part: + content_blocks.append(image_part) + + # 如果只有主文本且没有额外内容块和图片,返回简单格式以保持向后兼容 + if ( + text + and not extra_user_content_parts + and not image_urls + and len(content_blocks) == 1 + and content_blocks[0]["type"] == "text" + ): + return {"role": "user", "content": content_blocks[0]["text"]} + + # 否则返回多模态格式 + return {"role": "user", "content": content_blocks} async def encode_image_bs64(self, image_url: str) -> str: """将图片转换为 base64""" diff --git a/packages/astrbot/process_llm_request.py b/packages/astrbot/process_llm_request.py index 89a4df3a2..28d0a34f4 100644 --- a/packages/astrbot/process_llm_request.py +++ b/packages/astrbot/process_llm_request.py @@ -7,6 +7,7 @@ from astrbot.api.event import AstrMessageEvent from astrbot.api.message_components import Image, Reply from astrbot.api.provider import Provider, ProviderRequest +from astrbot.core.agent.message import TextPart from astrbot.core.provider.func_tool_manager import ToolSet @@ -85,7 +86,9 @@ async def _ensure_img_caption( req.image_urls, ) if caption: - req.prompt = f"(Image Caption: {caption})\n\n{req.prompt}" + req.extra_user_content_parts.append( + TextPart(text=f"{caption}") + ) req.image_urls = [] except Exception as e: logger.error(f"处理图片描述失败: {e}") @@ -129,13 +132,14 @@ async def process_llm_request(self, event: AstrMessageEvent, req: ProviderReques else: req.prompt = prefix + req.prompt + # 收集系统提醒信息 + system_parts = [] + # user identifier if cfg.get("identifier"): user_id = event.message_obj.sender.user_id user_nickname = event.message_obj.sender.nickname - req.prompt = ( - f"\n[User ID: {user_id}, Nickname: {user_nickname}]\n{req.prompt}" - ) + system_parts.append(f"User ID: {user_id}, Nickname: {user_nickname}") # group name identifier if cfg.get("group_name_display") and event.message_obj.group_id: @@ -146,7 +150,7 @@ async def process_llm_request(self, event: AstrMessageEvent, req: ProviderReques return group_name = event.message_obj.group.group_name if group_name: - req.system_prompt += f"\nGroup name: {group_name}\n" + system_parts.append(f"Group name: {group_name}") # time info if cfg.get("datetime_system_prompt"): @@ -162,7 +166,7 @@ async def process_llm_request(self, event: AstrMessageEvent, req: ProviderReques current_time = ( datetime.datetime.now().astimezone().strftime("%Y-%m-%d %H:%M (%Z)") ) - req.system_prompt += f"\nCurrent datetime: {current_time}\n" + system_parts.append(f"Current datetime: {current_time}") img_cap_prov_id: str = cfg.get("default_image_caption_provider_id") or "" if req.conversation: @@ -225,10 +229,17 @@ async def process_llm_request(self, event: AstrMessageEvent, req: ProviderReques except BaseException as e: logger.error(f"处理引用图片失败: {e}") - # 3. 将所有部分组合成文本并直接注入到当前消息中 + # 3. 将所有部分组合成文本并添加到 extra_user_content_parts 中 # 确保引用内容被正确的标签包裹 quoted_content = "\n".join(content_parts) # 确保所有内容都在标签内 quoted_text = f"\n{quoted_content}\n" - req.prompt = f"{quoted_text}\n\n{req.prompt}" + req.extra_user_content_parts.append(TextPart(text=quoted_text)) + + # 统一包裹所有系统提醒 + if system_parts: + system_content = ( + "" + "\n".join(system_parts) + "" + ) + req.extra_user_content_parts.append(TextPart(text=system_content))