vllm-project · yiz-liu · Mar 28, 2026 · Mar 26, 2026 · Mar 27, 2026
@@ -0,0 +1,314 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse,
+)
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+from vllm.entrypoints.openai.engine.protocol import (
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+)
+from vllm.tool_parsers.glm4_moe_tool_parser import Glm4MoeModelToolParser
+from vllm.tool_parsers.glm47_moe_tool_parser import Glm47MoeModelToolParser
+
+
+class FakeTokenizer:
+    def get_vocab(self):
+        return {
+            "<tool_call>": 1,
+            "</tool_call>": 2,
+            "<arg_key>": 3,
+            "</arg_key>": 4,
+            "<arg_value>": 5,
+            "</arg_value>": 6,
+        }
+
+
+def _reset_streaming_state(parser):
+    parser._buffer = ""
+    parser._in_tool_call = False
+    parser.current_tool_name_sent = False
+    parser._current_tool_name = None
+    parser._pending_key = None
+    parser._streaming_string_value = False
+    parser.prev_tool_call_arr = []
+    parser.current_tool_id = -1
+    parser.streamed_args_for_tool = []
+    parser._tool_call_ids = []
+    parser._args_started = []
+    parser._args_closed = []
+    parser._seen_keys = []
+
+
+def test_create_remaining_args_delta_uses_fallback_metadata_for_args_only_delta():
+    original_delta = DeltaMessage(
+        tool_calls=[
+            DeltaToolCall(
+                index=0,
+                function=DeltaFunctionCall(arguments='{"files":['),
+            )
+        ]
+    )
+
+    result = OpenAIServingChat._create_remaining_args_delta(
+        original_delta,
+        '{"files":[{"filepath":"HumanEval-X/README.md"}]}',
+        0,
+        fallback_tool_call_id="call_files",
+        fallback_tool_call_type="function",
+        fallback_tool_call_name="builtin_read_many_files",
+    )
+
+    tc = result.tool_calls[0]
+    assert tc.index == 0
+    assert tc.id == "call_files"
+    assert tc.type == "function"
+    assert tc.function.name == "builtin_read_many_files"
+    assert tc.function.arguments == ('{"files":[{"filepath":"HumanEval-X/README.md"}]}')
+
+
+def test_create_remaining_args_delta_prefers_current_metadata_over_fallback():
+    original_delta = DeltaMessage(
+        tool_calls=[
+            DeltaToolCall(
+                index=0,
+                id="call_current",
+                type="function",
+                function=DeltaFunctionCall(
+                    name="current_name",
+                    arguments='{"files":[',
+                ),
+            )
+        ]
+    )
+
+    result = OpenAIServingChat._create_remaining_args_delta(
+        original_delta,
+        "]}",
+        0,
+        fallback_tool_call_id="call_fallback",
+        fallback_tool_call_type="function",
+        fallback_tool_call_name="fallback_name",
+    )
+
+    tc = result.tool_calls[0]
+    assert tc.id == "call_current"
+    assert tc.type == "function"
+    assert tc.function.name == "current_name"
+    assert tc.function.arguments == "]}"
+
+
+def test_record_streamed_tool_args_tracks_emitted_bytes():
+    streamed_tool_args = {0: '{"files":['}
+    delta_message = DeltaMessage(
+        tool_calls=[
+            DeltaToolCall(
+                index=0,
+                function=DeltaFunctionCall(arguments='{"filepath":"HumanEval-X/README.md"}]}'),
+            )
+        ]
+    )
+
+    OpenAIServingChat._record_streamed_tool_args(delta_message, streamed_tool_args)
+
+    assert streamed_tool_args[0] == ('{"files":[{"filepath":"HumanEval-X/README.md"}]}')
+
+
+def test_compute_remaining_tool_args_handles_compact_prefix():
+    remaining = OpenAIServingChat._compute_remaining_tool_args(
+        expected_args={"a": 1},
+        streamed_args='{"a":1',
+    )
+
+    assert remaining == "}"
+
+
+def test_compute_remaining_tool_args_handles_stringified_expected_args():
+    remaining = OpenAIServingChat._compute_remaining_tool_args(
+        expected_args='{"a":1}',
+        streamed_args='{"a":1',
+    )
+
+    assert remaining == "}"
+
+
+def test_compute_remaining_tool_args_handles_glm_mixed_whitespace_prefix():
+    expected_args = {
+        "todos": [
+            {
+                "content": "A",
+                "activeForm": "B",
+                "status": "in_progress",
+            }
+        ]
+    }
+
+    remaining = OpenAIServingChat._compute_remaining_tool_args(
+        expected_args=expected_args,
+        streamed_args=('{"todos":[{"content": "A", "activeForm": "B", "status": "in_progress"}]'),
+    )
+
+    assert remaining == "}"
+
+
+def test_compute_remaining_tool_args_backfills_missing_suffix_for_glm_partial_prefix():
+    expected_args = {
+        "todos": [
+            {
+                "content": "A",
+                "activeForm": "B",
+                "status": "in_progress",
+            }
+        ]
+    }
+
+    remaining = OpenAIServingChat._compute_remaining_tool_args(
+        expected_args=expected_args,
+        streamed_args='{"todos":[{"content": "A"',
+    )
+
+    assert remaining == ',"activeForm":"B","status":"in_progress"}]}'
+
+
+def test_compute_remaining_tool_args_returns_empty_for_non_matching_prefix():
+    remaining = OpenAIServingChat._compute_remaining_tool_args(
+        expected_args={"a": 1},
+        streamed_args="not-json",
+    )
+
+    assert remaining == ""
+
+
+def test_compute_remaining_tool_args_returns_full_call_when_no_args_were_sent():
+    remaining = OpenAIServingChat._compute_remaining_tool_args(
+        expected_args={
+            "todos": "- [x] 分析项目结构和代码\n- [ ] 添加单元测试框架",
+        },
+        streamed_args="",
+    )
+
+    assert remaining == ('{"todos": "- [x] 分析项目结构和代码\\n- [ ] 添加单元测试框架"}')
+
+
+def test_glm_streaming_final_chunk_emits_inline_string_value():
+    parser = Glm4MoeModelToolParser(FakeTokenizer())
+    _reset_streaming_state(parser)
+
+    request = ChatCompletionRequest(
+        model="zai-org/GLM-4.7",
+        messages=[],
+        tools=[
+            {
+                "type": "function",
+                "function": {
+                    "name": "builtin_get_problems",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "filepath": {"type": "string"},
+                        },
+                    },
+                },
+            }
+        ],
+    )
+
+    chunks = [
+        "<tool_call>",
+        "builtin_get_problems\n",
+        "<arg_key>filepath</arg_key>",
+        "<arg_value>pong.py</arg_value></tool_call>",
+    ]
+
+    last_tool_delta = None
+    for chunk in chunks:
+        result = parser.extract_tool_calls_streaming(
+            previous_text="",
+            current_text="",
+            delta_text=chunk,
+            previous_token_ids=[],
+            current_token_ids=[],
+            delta_token_ids=[],
+            request=request,
+        )
+        if result is not None and result.tool_calls:
+            last_tool_delta = result
+
+    assert last_tool_delta is not None
+    assert last_tool_delta.tool_calls[0].function.arguments == '{"filepath":"pong.py"}'
+    assert parser.streamed_args_for_tool == ['{"filepath":"pong.py"}']
+    assert parser.prev_tool_call_arr == [
+        {
+            "name": "builtin_get_problems",
+            "arguments": {"filepath": "pong.py"},
+        }
+    ]
+
+
+def test_glm47_streaming_delta_serializes_tool_call_fields():
+    parser = Glm47MoeModelToolParser(FakeTokenizer())
+    _reset_streaming_state(parser)
+
+    request = ChatCompletionRequest(
+        model="GLM-5",
+        messages=[],
+        tools=[
+            {
+                "type": "function",
+                "function": {
+                    "name": "builtin_get_problems",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "filepath": {"type": "string"},
+                        },
+                    },
+                },
+            }
+        ],
+    )
+
+    chunks = [
+        "<tool_call>",
+        "builtin_get_problems\n",
+        "<arg_key>filepath</arg_key>",
+        "<arg_value>pong.py</arg_value></tool_call>",
+    ]
+
+    serialized_deltas = []
+    for chunk in chunks:
+        result = parser.extract_tool_calls_streaming(
+            previous_text="",
+            current_text="",
+            delta_text=chunk,
+            previous_token_ids=[],
+            current_token_ids=[],
+            delta_token_ids=[],
+            request=request,
+        )
+        if result is None:
+            continue
+
+        choice = ChatCompletionResponseStreamChoice(
+            index=0,
+            delta=result,
+            logprobs=None,
+            finish_reason=None,
+        )
+        response = ChatCompletionStreamResponse(
+            id="chatcmpl-test",
+            created=0,
+            model="GLM-5",
+            choices=[choice],
+        )
+        serialized_deltas.append(response.model_dump(exclude_unset=True)["choices"][0]["delta"])
+
+    assert len(serialized_deltas) == 2
+    assert serialized_deltas[0]["tool_calls"][0]["type"] == "function"
+    assert serialized_deltas[0]["tool_calls"][0]["function"]["name"] == "builtin_get_problems"
+    assert serialized_deltas[-1] != {}
+    assert serialized_deltas[-1]["tool_calls"][0]["index"] == 0
+    assert serialized_deltas[-1]["tool_calls"][0]["function"]["arguments"] == '{"filepath":"pong.py"}'
@@ -179,6 +179,33 @@
 #       Remove this patch once the upstream MiniMax usage-accounting fix is in
 #       the runtime vLLM version used by vllm-ascend.
 #
+# ** 10. File: platform/patch_glm_tool_call_parser.py**
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#   1. `vllm.entrypoints.openai.chat_completion.serving.OpenAIServingChat`
+#      `vllm.tool_parsers.glm4_moe_tool_parser.Glm4MoeModelToolParser`
+#    Why:
+#       GLM-4.7 / GLM-4.5 tool-call streaming on the release runtime still has
+#       two independent finish-path bugs:
+#       1. the parser can leave a terminal `<arg_value>... </tool_call>` chunk
+#          partially undrained, and
+#       2. finish backfill trusts the parser's internal accumulated arguments
+#          instead of the argument bytes actually sent to the client.
+#       Together these can drop a full string value or emit only a suffix like
+#       `"}` in the final SSE chunk even when non-stream output is correct.
+#    How：
+#       Monkey-patch the GLM parser to keep draining a single chunk through
+#       terminal state transitions, and monkey-patch chat streaming to track
+#       per-tool arguments actually emitted to the client before computing the
+#       finish-chunk suffix. The suffix logic still tolerates mixed JSON
+#       whitespace styles from GLM tool parsers.
+#    Related PR (if no, explain why):
+#       https://github.com/vllm-project/vllm/pull/37845
+#       https://github.com/vllm-project/vllm/pull/33218
+#    Future Plan:
+#       Remove this patch once both the GLM parser drain fix and the serving
+#       finish-backfill fix are present in the runtime vLLM version used by
+#       vllm-ascend.
+#
 # * Worker Patch:
 # ===============
 #

@@ -30,6 +30,7 @@
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
 import vllm_ascend.patch.platform.patch_torch_accelerator  # noqa
 import vllm_ascend.patch.platform.patch_minimax_usage_accounting  # noqa
+import vllm_ascend.patch.platform.patch_glm_tool_call_parser  # noqa
 
 if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
     import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa