Commit 236eff5

feat: implement GLM41VChatHandler for GLM-4.1V-9B-Thinking Model
- Patch stop tokens in __call__ to handle </answer> and EOS truncation.

Signed-off-by: JamePeng <jame_peng@sina.com>
1 parent: 2789d34

File tree

2 files changed: 78 additions & 0 deletions

README.md

Lines changed: 1 addition & 0 deletions
@@ -496,6 +496,7 @@ Below are the supported multi-modal models and their respective chat handlers (P
 | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` |
 | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6`, `minicpm-v-4.0` |
 | [gemma3](https://huggingface.co/unsloth/gemma-3-27b-it-GGUF) | `Gemma3ChatHandler` | `gemma3` |
+| [glm4.1v](https://huggingface.co/unsloth/GLM-4.1V-9B-Thinking-GGUF) | `GLM41VChatHandler` | `glm4.1v` |
 | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` |
 | [qwen3-vl](https://huggingface.co/unsloth/Qwen3-VL-8B-Thinking-GGUF) | `Qwen3VLChatHandler` | `qwen3-vl` |
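
For orientation, here is a minimal usage sketch in the style of the README's other multi-modal examples. The GGUF and mmproj filenames are hypothetical placeholders; `clip_model_path` is the standard mmproj argument for `Llava15ChatHandler`-derived handlers.

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import GLM41VChatHandler

# Hypothetical local filenames; convert or download F16/F32 GGUF files first.
chat_handler = GLM41VChatHandler(clip_model_path="mmproj-GLM-4.1V-9B-Thinking-F16.gguf")
llm = Llama(
    model_path="GLM-4.1V-9B-Thinking-F16.gguf",
    chat_handler=chat_handler,
    n_ctx=4096,  # leave room for the image embeddings in the prompt
)
response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
)
print(response["choices"][0]["message"]["content"])
```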

llama_cpp/llama_chat_format.py

Lines changed: 77 additions & 0 deletions
@@ -3717,6 +3717,83 @@ class Gemma3ChatHandler(Llava15ChatHandler):
         )
 
 
+class GLM41VChatHandler(Llava15ChatHandler):
+    # Note: Make sure the GGUF files of your converted model and mmproj are F16 or F32.
+
+    GLM41V_EOS_TOKEN = "<|endoftext|>"
+    GLM41V_PAD_TOKEN = "<|endoftext|>"
+    GLM41V_IMAGE_START_TOKEN = "<|begin_of_image|>"
+    GLM41V_IMAGE_END_TOKEN = "<|end_of_image|>"
+
+    CHAT_FORMAT = (
+        "[gMASK]<sop>\n"
+        "{%- for msg in messages -%}"
+        "{%- if msg.role == 'system' -%}"
+        "<|system|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}"
+        "{%- elif msg.role == 'user' -%}"
+        "<|user|>\n"
+        "{%- if msg.content is string -%}"
+        "{{ msg.content }}"
+        "{%- else -%}"
+        "{%- for item in msg.content -%}"
+        "{%- if item.type == 'image_url' or 'image_url' in item -%}"
+        "<|begin_of_image|>"
+        "{%- if item.image_url is string -%}"
+        "{{- item.image_url -}}"
+        "{%- else -%}"
+        "{{- item.image_url.url -}}"
+        "{%- endif -%}"
+        "<|end_of_image|>"
+        "{%- elif item.type == 'text' -%}"
+        "{{ item.text }}"
+        "{%- endif -%}"
+        "{%- endfor -%}"
+        "{%- endif -%}{{ GLM41V_EOS_TOKEN }}"
+        "{%- elif msg.role == 'assistant' -%}"
+        "{%- if msg.metadata -%}"
+        "<|assistant|>{{ msg.metadata }}\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}"
+        "{%- else -%}"
+        "<|assistant|>\n{{ msg.content }}{{ GLM41V_EOS_TOKEN }}"
+        "{%- endif -%}"
+        "{%- endif -%}"
+        "{%- endfor -%}"
+        "{%- if add_generation_prompt -%}"
+        "<|assistant|>\n"
+        "{%- endif -%}"
+    )
+
+    def __call__(self, **kwargs):
+        self.extra_template_arguments["GLM41V_EOS_TOKEN"] = self.GLM41V_EOS_TOKEN
+        stop_tokens = [self.GLM41V_EOS_TOKEN, "</answer>"]  # Stop token patch
+        kwargs['stop'] = stop_tokens
+
+        llama = kwargs['llama']
+
+        # Clear state for multiple runs
+        llama.reset()
+        llama._ctx.memory_clear(True)
+        llama.n_tokens = 0
+
+        if hasattr(llama, 'input_ids'):
+            llama.input_ids.fill(0)
+
+        # Clear any handler state
+        if hasattr(self, '_last_image_embed'):
+            self._last_image_embed = None
+            self._last_image_hash = None
+
+        if self.verbose:
+            messages = kwargs.get('messages', [])
+            try:
+                image_count = len(self.get_image_urls(messages))
+                print(f"GLM41VChatHandler - Processing {image_count} images", file=sys.stderr)
+            except Exception:
+                print("GLM41VChatHandler - State reset", file=sys.stderr)
+
+        # Use parent implementation
+        return super().__call__(**kwargs)
+
+
 class Qwen25VLChatHandler(Llava15ChatHandler):
     DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant."
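
As a sanity check on the template above, the following sketch (not part of the commit) renders `CHAT_FORMAT` with plain `jinja2`; passing `GLM41V_EOS_TOKEN` explicitly mirrors what `__call__` injects via `extra_template_arguments`:

```python
import jinja2

from llama_cpp.llama_chat_format import GLM41VChatHandler

template = jinja2.Template(GLM41VChatHandler.CHAT_FORMAT)
prompt = template.render(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ],
    add_generation_prompt=True,
    GLM41V_EOS_TOKEN=GLM41VChatHandler.GLM41V_EOS_TOKEN,
)
print(prompt)
# [gMASK]<sop>
# <|system|>
# You are a helpful assistant.<|endoftext|><|user|>
# Hello!<|endoftext|><|assistant|>
```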

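Because generation now stops at "</answer>", a completion from the Thinking model typically arrives as "<think>...</think><answer>..." with the closing tag trimmed off. The handler does not split these parts for you; below is a hedged post-processing sketch, assuming that tag layout:

```python
import re

def split_thinking(text: str) -> tuple[str, str]:
    """Split a GLM-4.1V-Thinking completion into (reasoning, answer)."""
    match = re.search(r"<think>(.*?)</think>", text, flags=re.DOTALL)
    reasoning = match.group(1).strip() if match else ""
    # Generation halts on the "</answer>" stop token, so only the opening tag remains.
    answer = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    answer = answer.replace("<answer>", "").strip()
    return reasoning, answer

reasoning, answer = split_thinking(
    "<think>The user wants a description; the image shows a cat.</think>"
    "<answer>A cat sitting on a windowsill."
)
print(answer)  # A cat sitting on a windowsill.
```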