update (#5625)

ZhangYulongg · web-flow · commit f45c131ddf3a · 2025-12-17T21:38:14.000+08:00
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -46,6 +46,7 @@ python -m pip install -r requirements.txt
 --shuffle：是否打乱数据集，默认False不打乱
 --seed：打乱数据集时的随机种子，默认0
 --pd-metrics：开启PD分离metrics指标收集，会添加请求参数collect_metrics=True，默认False
+--ip-list：支持多个ip:port，将总请求数以及总并发数均分到每个IP，按整除取余分配。例：0.0.0.0:1211,0.0.0.0:1222，默认为空
 ```
 
 ##### /v1/chat/completions接口压测单条数据调试
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
@@ -74,6 +74,8 @@ class RequestFuncOutput:
     tpot: float = 0.0  # avg next-token latencies
     prompt_len: int = 0
     prompt_tokens: int = 0  # 推理侧返回输入token数
+    reasoning_tokens: int = 0  # 思考长度
+    res_ttft: int = 0  # 包含思考首token时延
     error: str = ""
     metrics: dict = field(default_factory=dict)
 
@@ -198,11 +200,14 @@ async def async_request_eb_openai_chat_completions(
         request_id = "None"
 
         ttft = 0.0
+        res_ttft = 0.0
         st = time.perf_counter()
         most_recent_timestamp = st
         token_timestamps = []
         try:
-            async with session.post(url=api_url, json=payload, headers=headers) as response:
+            async with session.post(
+                url=api_url, json=payload, headers=headers, read_bufsize=10 * 1024 * 1024
+            ) as response:
                 data = {}
                 if response.status == 200:
                     async for chunk_bytes in response.content:
@@ -242,6 +247,14 @@ async def async_request_eb_openai_chat_completions(
                                 else:
                                     output.itl.append(timestamp - most_recent_timestamp)
 
+                                # response首token
+                                if res_ttft == 0.0:
+                                    if content:
+                                        res_ttft = choices[0]["arrival_time"]
+                                        output.res_ttft = res_ttft
+                                        usage = data.get("usage", {})
+                                        output.reasoning_tokens = max(usage.get("completion_tokens", 0) - 1, 0)
+
                                 output.generated_text += content or ""
                                 output.reasoning_content += reason_content or ""
                                 # print(f"####content:{data}")
@@ -262,6 +275,7 @@ async def async_request_eb_openai_chat_completions(
 
                     if output.generated_text.strip() == "":
                         output.success = False
+                        output.reasoning_tokens = output.output_tokens
                         output.error = "No generated text found!"
                     else:
                         output.success = True
@@ -284,7 +298,7 @@ async def async_request_eb_openai_chat_completions(
         output.request_id = request_id
 
         # 保存失败请求结果
-        if not output.success:
+        if not output.success or output.output_tokens == 0:
             with open("error_output.txt", "a") as f:
                 f.write(str(output) + "\n")
     if pbar:
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py