Skip to content

Commit fcb0124

Browse files
korotaav48 (Aleksei Korota) and authored
chore: response_format_integration_test (#46)
Co-authored-by: Aleksei Korota <Aleksei_Korota@epam.com>
1 parent 1c54c58 commit fcb0124

File tree

8 files changed

+310
-59
lines changed

8 files changed

+310
-59
lines changed

src/tests/integration_tests/test_e2e.py

Lines changed: 4 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -47,30 +47,30 @@
4747
# Individual test for GPT-5
4848
@pytest.mark.requires_session
4949
@pytest.mark.e2e
50-
@e2e_test(test_case=E2E_TEST_CASE, models=["gpt-5-2025-08-07"], runs=1)
50+
@e2e_test(test_case=E2E_TEST_CASE, model="gpt-5-2025-08-07", runs=1)
5151
def test_e2e_set_gpt5(client):
5252
pass
5353

5454

5555
# Individual test for GPT-4 Turbo
5656
@pytest.mark.requires_session
5757
@pytest.mark.e2e
58-
@e2e_test(test_case=E2E_TEST_CASE, models=["gpt-4.1-2025-04-14"], runs=1)
58+
@e2e_test(test_case=E2E_TEST_CASE, model="gpt-4.1-2025-04-14", runs=1)
5959
def test_e2e_set_gpt4_1(client):
6060
pass
6161

6262

6363
# Individual test for Claude 4.5
6464
# @pytest.mark.requires_session
6565
# @pytest.mark.e2e
66-
# @e2e_test(test_case=E2E_TEST_CASE, models=["anthropic.claude-v4-5-sonnet-v1"], runs=1)
66+
# @e2e_test(test_case=E2E_TEST_CASE, model="anthropic.claude-v4-5-sonnet-v1", runs=1)
6767
# def test_e2e_set_claude45(client):
6868
# pass
6969

7070

7171
# Individual test for Gemini
7272
@pytest.mark.requires_session
7373
@pytest.mark.e2e
74-
@e2e_test(test_case=E2E_TEST_CASE, models=["gemini-2.5-pro"], runs=1)
74+
@e2e_test(test_case=E2E_TEST_CASE, model="gemini-2.5-pro", runs=1)
7575
def test_e2e_set_gemini(client):
7676
pass

src/tests/integration_tests/test_runner/cache/cache_middleware.py

Lines changed: 22 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -3,7 +3,6 @@
33
import json
44
import logging
55
import warnings
6-
from dataclasses import dataclass
76
from pathlib import Path
87
from typing import List
98
from urllib.parse import urlparse
@@ -28,7 +27,7 @@
2827
logger = logging.getLogger("__name__")
2928

3029

31-
AGENT_MODELS = ["gpt-4.1-2025-04-14", "gpt-5-2025-08-07", "claude-opus-4@20250514", "us.anthropic.claude-3-7-sonnet-20250219-v1", "gemini-2.5-pro", "anthropic.claude-v4-5-sonnet-v1"] # Specify all models that should be not cached.
30+
AGENT_MODELS = ["gpt-4.1-2025-04-14", "gpt-5-2025-08-07", "gpt-5-mini-2025-08-07", "claude-opus-4@20250514", "us.anthropic.claude-3-7-sonnet-20250219-v1", "anthropic.claude-v4-5-sonnet-v1", "gemini-2.5-pro"] # Specify all models that should be not cached.
3231

3332

3433
@dataclass
@@ -54,7 +53,6 @@ def _extract_host_port(url: str) -> str:
5453

5554
class CacheMiddlewareApp(FastAPI):
5655
llm_cache: LlmCache
57-
_background_tasks: List[asyncio.Task] = []
5856

5957
def __init__(self, app_config: CacheMiddlewareConfig):
6058
self.target_url = app_config.dial_core_url
@@ -70,7 +68,7 @@ def __init__(self, app_config: CacheMiddlewareConfig):
7068
enable_cache=True,
7169
)
7270
self.used_cache_responses = set()
73-
self._background_tasks = []
71+
self._background_tasks: List[asyncio.Task] = []
7472

7573
super().__init__()
7674
self.router = APIRouter()
@@ -115,8 +113,26 @@ async def close_resources(self):
115113
logger.warning(f"Error waiting for tasks: {e}")
116114

117115
if not self.http_client.is_closed:
118-
await self.http_client.aclose()
119-
logger.debug("HTTP client closed")
116+
try:
117+
# Check if event loop is still running before attempting to close
118+
loop = asyncio.get_event_loop()
119+
if loop.is_closed():
120+
logger.debug("HTTP client close skipped - event loop is already closed")
121+
else:
122+
await self.http_client.aclose()
123+
logger.debug("HTTP client closed")
124+
125+
# Wait a bit for any cleanup tasks to complete
126+
await asyncio.sleep(0.1)
127+
128+
except RuntimeError as e:
129+
# Event loop may already be closing
130+
if "Event loop is closed" in str(e):
131+
logger.debug("HTTP client close skipped - event loop is closing")
132+
else:
133+
logger.warning(f"Error closing HTTP client: {e}")
134+
except Exception as e:
135+
logger.warning(f"Unexpected error closing HTTP client: {e}")
120136

121137
logger.debug("CacheMiddlewareApp resources closed")
122138

src/tests/integration_tests/test_runner/config.py

Lines changed: 2 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -19,6 +19,7 @@
1919

2020
file_sets = {
2121
"integration": ["test_tool_set_chat_hub", "test_tool_set_py_interpreter", "test_mcp_tool"],
22+
"integration_simple": ["test_tool_set_chat_hub"],
2223
"e2e": ["test_tool_set_chat_hub", "test_tool_set_py_interpreter"]
2324
}
2425

@@ -47,7 +48,7 @@ class TestConfig:
4748
DEFAULT_MODEL = os.getenv("MODEL", "gpt4_1") # "gpt4o", "claude35", "claude37"
4849
REMOTE_DIAL_API_KEY = SecretStr(os.getenv("REMOTE_DIAL_API_KEY", "dial_api_key"))
4950

50-
PY_INTERPRETER_URL = "https://dev-dial-core.staging.deltixhub.io"
51+
PY_INTERPRETER_URL = os.getenv("PY_INTERPRETER_URL")
5152
PY_INTERPRETER_API_KEY = SecretStr(os.getenv("PY_INTERPRETER_API_KEY", REMOTE_DIAL_API_KEY))
5253

5354
WARNING_MESSAGE = "No cached value found, this means that something was changed in the logic"

src/tests/integration_tests/test_runner/e2e_runner.py

Lines changed: 86 additions & 46 deletions
Original file line number · Diff line number · Diff line change
@@ -91,12 +91,21 @@ async def start_server(refresh: bool, test_name: str, port: int, model: str, no_
9191

9292
@staticmethod
9393
async def stop_server(server, cache_middleware_app):
94+
# Signal server to shut down first
95+
server.should_exit = True
96+
logger.debug("Signaling server shutdown...")
9497

95-
await cache_middleware_app.close_resources()
98+
# Give server a moment to start shutting down
99+
await asyncio.sleep(0.5)
96100

97-
server.should_exit = True
98-
logger.debug("Shutting down server...")
99-
await asyncio.sleep(1)
101+
# Then close app resources
102+
try:
103+
await cache_middleware_app.close_resources()
104+
except Exception as e:
105+
logger.warning(f"Error during resource cleanup: {e}")
106+
107+
# Give more time for cleanup to complete
108+
await asyncio.sleep(1.5)
100109

101110
@staticmethod
102111
async def get_attachment_url(dial_url: str, headers, attachment: Path):
@@ -140,13 +149,22 @@ async def execute_test_case(
140149
message["custom_content"] = {"attachments": attachment_objects}
141150
messages.append(message)
142151
logger.debug(f"send {message} to {client.base_url}")
152+
153+
# Prepare request payload
154+
request_payload = {
155+
"model": TestDialCoreConfig.APP_DEPLOYMENT_V2_NAME,
156+
"messages": messages,
157+
}
158+
159+
# Add response_format if specified in test case
160+
if test_case.response_format:
161+
request_payload["response_format"] = test_case.response_format
162+
logger.debug(f"Using response_format: {test_case.response_format}")
163+
143164
response = client.post(
144165
TestConfig.API_ENDPOINTS['CHAT_COMPLETIONS'],
145166
headers=headers,
146-
json={
147-
"model": TestDialCoreConfig.APP_DEPLOYMENT_V2_NAME,
148-
"messages": messages,
149-
},
167+
json=request_payload,
150168
timeout=100.0,
151169
)
152170

@@ -176,6 +194,16 @@ async def execute_test_case(
176194
break
177195

178196
logger.info(f"content:{response_message.content}")
197+
198+
# Validate response format if specified
199+
if test_case.response_format:
200+
format_failures = ResponseValidator.validate_json_schema_response(
201+
response_message.content, test_case.response_format, ts
202+
)
203+
if format_failures:
204+
ts.increment_failure(FailureReason.ANSWER)
205+
all_failures.extend(format_failures)
206+
179207
# Check message answer if expected
180208
if test_message_data.answer:
181209
failures = check_multiple_alternatives(
@@ -256,13 +284,18 @@ def e2e_test(
256284
test_case: TstCase = None,
257285
app_config_path: Path = None,
258286
model: str = None,
259-
models: List[str] = None,
287+
models_applicable_for_test: List[str] = None,
260288
refresh: bool = None,
261289
config_file_set: str = "e2e",
262290
runs: int = 3,
291+
no_cache: bool = False,
263292
):
264293
"""
265294
Decorator for end-to-end tests.
295+
296+
Args:
297+
no_cache: If True, bypass cache for this test. Can also be set globally via --no-cache CLI flag.
298+
CLI flag takes precedence over decorator parameter.
266299
"""
267300

268301
if refresh is None:
@@ -281,35 +314,44 @@ async def wrapper(request, recwarn, unique_port, *args, **kwargs):
281314
f"{test_case.name if test_case else request.node.name}"
282315
)
283316

284-
execution_model_list = models if models else []
285-
286-
if len(execution_model_list) == 0:
287-
if execution_model_list:
288-
execution_model_list.append(model)
289-
elif request.config.getoption("--model"):
290-
execution_model_list.append(request.config.getoption("--model"))
317+
model_to_use: str
318+
if model:
319+
model_to_use = model
320+
logger.debug(f"Using model from parameter defined in test: {model_to_use}")
321+
elif request.config.getoption("--model"):
322+
cli_model = request.config.getoption("--model")
323+
if models_applicable_for_test is None or len(
324+
models_applicable_for_test) == 0 or cli_model in models_applicable_for_test:
325+
model_to_use = cli_model
326+
logger.debug(f"Using model from CLI option: {model_to_use}")
291327
else:
292-
execution_model_list.append(TestConfig.DEFAULT_MODEL)
293-
294-
for m in execution_model_list:
295-
# Run the test multiple times according to the runs parameter
296-
ts = TestStats(f"{test_name}[{m}]", 0, 0)
297-
for run_index in range(runs):
298-
logger.info(f"Running test iteration {run_index + 1}/{runs}")
299-
failures = await prepare_and_execute_test(
300-
args,
301-
kwargs,
302-
recwarn,
303-
request,
304-
unique_port,
305-
execution_model=m,
306-
test_name=test_name,
307-
test_stats=ts,
308-
run_index=run_index,
309-
)
310-
all_runs_failures.extend(failures)
311-
logger.info(ts)
312-
report_test_stats(request.config, ts)
328+
logger.debug(
329+
f"Model '{cli_model}' is not in the applicable models list: {models_applicable_for_test}")
330+
pytest.skip(f"Model '{cli_model}' is not applicable for this test")
331+
else:
332+
logger.debug("No model specified")
333+
pytest.fail("No model specified for test")
334+
335+
336+
337+
# Run the test multiple times according to the runs parameter
338+
ts = TestStats(f"{test_name}[{model_to_use}]", 0, 0)
339+
for run_index in range(runs):
340+
logger.info(f"Running test iteration {run_index + 1}/{runs}")
341+
failures = await prepare_and_execute_test(
342+
args,
343+
kwargs,
344+
recwarn,
345+
request,
346+
unique_port,
347+
execution_model=model_to_use,
348+
test_name=test_name,
349+
test_stats=ts,
350+
run_index=run_index,
351+
)
352+
all_runs_failures.extend(failures)
353+
logger.info(ts)
354+
report_test_stats(request.config, ts)
313355

314356
# After all runs/models are complete, check if any failures occurred
315357
TestRunner.check_test_outcome(all_runs_failures)
@@ -334,13 +376,16 @@ async def prepare_and_execute_test(
334376

335377
client = TestClient(app)
336378

337-
no_cache = bool(request.config.getoption("--no-cache", default=False))
379+
# Combine CLI flag with decorator parameter - CLI takes precedence
380+
cli_no_cache = bool(request.config.getoption("--no-cache", default=False))
381+
effective_no_cache = cli_no_cache or no_cache
382+
338383
task, server, middleware = await TestRunner.start_server(
339384
model=execution_model,
340385
test_name=test_name,
341386
refresh=refresh,
342387
port=unique_port,
343-
no_cache=no_cache
388+
no_cache=effective_no_cache
344389
)
345390
try:
346391
run_failures, test_result = await execute_single_test_run(
@@ -362,13 +407,8 @@ async def prepare_and_execute_test(
362407

363408
finally:
364409
await TestRunner.stop_server(server, middleware)
365-
# Properly close the client
366-
if hasattr(client, "aclose"):
367-
await client.aclose()
368-
# Shutdown async generators
369-
loop = asyncio.get_event_loop()
370-
if loop.is_running():
371-
await loop.shutdown_asyncgens()
410+
# TestClient is synchronous and doesn't need async close
411+
# Don't shutdown async generators while loop is running
372412

373413
return wrapper
374414

src/tests/integration_tests/test_runner/models.py

Lines changed: 2 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -177,13 +177,15 @@ def __init__(
177177
name: str,
178178
description: str,
179179
similarity_threshold: float = SimilarityThreshold.DEFAULT.value,
180+
response_format: Dict[str, Any] | None = None,
180181
):
181182
self.name = name
182183
self.description = description
183184
self.messages: List[UserMessage] = []
184185
self.mock_date = datetime.date.today()
185186
self.similarity_threshold = similarity_threshold
186187
self.py_interpreter_session_flow = False
188+
self.response_format = response_format
187189

188190
def add_user_message(
189191
self,

src/tests/integration_tests/test_runner/utils/tool_names_with_hash.py

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -10,7 +10,7 @@ class ToolNames(Enum):
1010
ADD_SHAPE_TO_BOX = "add_shape_to_box_8d57"
1111
REMOVE_SHAPES_FROM_BOX = "remove_shapes_from_box_1feb"
1212
GET_SHAPES_FROM_BOX = "get_shapes_from_box_bdc8"
13-
INVERT_STRING = "InvertString_a160"
13+
INVERT_STRING = "InvertString_4cfe"
1414
LIST_FROM_WORD = "list_from_word_2309"
1515

1616

0 commit comments

Comments (0)