Skip to content

Commit fcb0124

Browse files
korotaav48 (Aleksei Korota) and authored
chore: response_format_integration_test (#46)
Co-authored-by: Aleksei Korota <Aleksei_Korota@epam.com>
1 parent 1c54c58 commit fcb0124

File tree

8 files changed

+310
-59
lines changed

8 files changed

+310
-59
lines changed

src/tests/integration_tests/test_e2e.py

Lines changed: 4 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -47,30 +47,30 @@
4747
# Individual test for GPT-5
4848
@pytest.mark.requires_session
4949
@pytest.mark.e2e
50-
@e2e_test(test_case=E2E_TEST_CASE, models=["gpt-5-2025-08-07"], runs=1)
50+
@e2e_test(test_case=E2E_TEST_CASE, model="gpt-5-2025-08-07", runs=1)
5151
def test_e2e_set_gpt5(client):
5252
pass
5353

5454

5555
# Individual test for GPT-4 Turbo
5656
@pytest.mark.requires_session
5757
@pytest.mark.e2e
58-
@e2e_test(test_case=E2E_TEST_CASE, models=["gpt-4.1-2025-04-14"], runs=1)
58+
@e2e_test(test_case=E2E_TEST_CASE, model="gpt-4.1-2025-04-14", runs=1)
5959
def test_e2e_set_gpt4_1(client):
6060
pass
6161

6262

6363
# Individual test for Claude 4.5
6464
# @pytest.mark.requires_session
6565
# @pytest.mark.e2e
66-
# @e2e_test(test_case=E2E_TEST_CASE, models=["anthropic.claude-v4-5-sonnet-v1"], runs=1)
66+
# @e2e_test(test_case=E2E_TEST_CASE, model="anthropic.claude-v4-5-sonnet-v1", runs=1)
6767
# def test_e2e_set_claude45(client):
6868
# pass
6969

7070

7171
# Individual test for Gemini
7272
@pytest.mark.requires_session
7373
@pytest.mark.e2e
74-
@e2e_test(test_case=E2E_TEST_CASE, models=["gemini-2.5-pro"], runs=1)
74+
@e2e_test(test_case=E2E_TEST_CASE, model="gemini-2.5-pro", runs=1)
7575
def test_e2e_set_gemini(client):
7676
pass

src/tests/integration_tests/test_runner/cache/cache_middleware.py

Lines changed: 22 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -3,7 +3,6 @@
33
import json
44
import logging
55
import warnings
6-
from dataclasses import dataclass
76
from pathlib import Path
87
from typing import List
98
from urllib.parse import urlparse
@@ -28,7 +27,7 @@
2827
logger = logging.getLogger("__name__")
2928

3029

31-
AGENT_MODELS = ["gpt-4.1-2025-04-14", "gpt-5-2025-08-07", "claude-opus-4@20250514", "us.anthropic.claude-3-7-sonnet-20250219-v1", "gemini-2.5-pro", "anthropic.claude-v4-5-sonnet-v1"] # Specify all models that should be not cached.
30+
AGENT_MODELS = ["gpt-4.1-2025-04-14", "gpt-5-2025-08-07", "gpt-5-mini-2025-08-07", "claude-opus-4@20250514", "us.anthropic.claude-3-7-sonnet-20250219-v1", "anthropic.claude-v4-5-sonnet-v1", "gemini-2.5-pro"] # Specify all models that should be not cached.
3231

3332

3433
@dataclass
@@ -54,7 +53,6 @@ def _extract_host_port(url: str) -> str:
5453

5554
class CacheMiddlewareApp(FastAPI):
5655
llm_cache: LlmCache
57-
_background_tasks: List[asyncio.Task] = []
5856

5957
def __init__(self, app_config: CacheMiddlewareConfig):
6058
self.target_url = app_config.dial_core_url
@@ -70,7 +68,7 @@ def __init__(self, app_config: CacheMiddlewareConfig):
7068
enable_cache=True,
7169
)
7270
self.used_cache_responses = set()
73-
self._background_tasks = []
71+
self._background_tasks: List[asyncio.Task] = []
7472

7573
super().__init__()
7674
self.router = APIRouter()
@@ -115,8 +113,26 @@ async def close_resources(self):
115113
logger.warning(f"Error waiting for tasks: {e}")
116114

117115
if not self.http_client.is_closed:
118-
await self.http_client.aclose()
119-
logger.debug("HTTP client closed")
116+
try:
117+
# Check if event loop is still running before attempting to close
118+
loop = asyncio.get_event_loop()
119+
if loop.is_closed():
120+
logger.debug("HTTP client close skipped - event loop is already closed")
121+
else:
122+
await self.http_client.aclose()
123+
logger.debug("HTTP client closed")
124+
125+
# Wait a bit for any cleanup tasks to complete
126+
await asyncio.sleep(0.1)
127+
128+
except RuntimeError as e:
129+
# Event loop may already be closing
130+
if "Event loop is closed" in str(e):
131+
logger.debug("HTTP client close skipped - event loop is closing")
132+
else:
133+
logger.warning(f"Error closing HTTP client: {e}")
134+
except Exception as e:
135+
logger.warning(f"Unexpected error closing HTTP client: {e}")
120136

121137
logger.debug("CacheMiddlewareApp resources closed")
122138

src/tests/integration_tests/test_runner/config.py

Lines changed: 2 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -19,6 +19,7 @@
1919

2020
file_sets = {
2121
"integration": ["test_tool_set_chat_hub", "test_tool_set_py_interpreter", "test_mcp_tool"],
22+
"integration_simple": ["test_tool_set_chat_hub"],
2223
"e2e": ["test_tool_set_chat_hub", "test_tool_set_py_interpreter"]
2324
}
2425

@@ -47,7 +48,7 @@ class TestConfig:
4748
DEFAULT_MODEL = os.getenv("MODEL", "gpt4_1") # "gpt4o", "claude35", "claude37"
4849
REMOTE_DIAL_API_KEY = SecretStr(os.getenv("REMOTE_DIAL_API_KEY", "dial_api_key"))
4950

50-
PY_INTERPRETER_URL = "https://dev-dial-core.staging.deltixhub.io"
51+
PY_INTERPRETER_URL = os.getenv("PY_INTERPRETER_URL")
5152
PY_INTERPRETER_API_KEY = SecretStr(os.getenv("PY_INTERPRETER_API_KEY", REMOTE_DIAL_API_KEY))
5253

5354
WARNING_MESSAGE = "No cached value found, this means that something was changed in the logic"

src/tests/integration_tests/test_runner/e2e_runner.py

Lines changed: 86 additions & 46 deletions
Original file line number · Diff line number · Diff line change
@@ -91,12 +91,21 @@ async def start_server(refresh: bool, test_name: str, port: int, model: str, no_
9191

9292
@staticmethod
9393
async def stop_server(server, cache_middleware_app):
94+
# Signal server to shut down first
95+
server.should_exit = True
96+
logger.debug("Signaling server shutdown...")
9497

95-
await cache_middleware_app.close_resources()
98+
# Give server a moment to start shutting down
99+
await asyncio.sleep(0.5)
96100

97-
server.should_exit = True
98-
logger.debug("Shutting down server...")
99-
await asyncio.sleep(1)
101+
# Then close app resources
102+
try:
103+
await cache_middleware_app.close_resources()
104+
except Exception as e:
105+
logger.warning(f"Error during resource cleanup: {e}")
106+
107+
# Give more time for cleanup to complete
108+
await asyncio.sleep(1.5)
100109

101110
@staticmethod
102111
async def get_attachment_url(dial_url: str, headers, attachment: Path):
@@ -140,13 +149,22 @@ async def execute_test_case(
140149
message["custom_content"] = {"attachments": attachment_objects}
141150
messages.append(message)
142151
logger.debug(f"send {message} to {client.base_url}")
152+
153+
# Prepare request payload
154+
request_payload = {
155+
"model": TestDialCoreConfig.APP_DEPLOYMENT_V2_NAME,
156+
"messages": messages,
157+
}
158+
159+
# Add response_format if specified in test case
160+
if test_case.response_format:
161+
request_payload["response_format"] = test_case.response_format
162+
logger.debug(f"Using response_format: {test_case.response_format}")
163+
143164
response = client.post(
144165
TestConfig.API_ENDPOINTS['CHAT_COMPLETIONS'],
145166
headers=headers,
146-
json={
147-
"model": TestDialCoreConfig.APP_DEPLOYMENT_V2_NAME,
148-
"messages": messages,
149-
},
167+
json=request_payload,
150168
timeout=100.0,
151169
)
152170

@@ -176,6 +194,16 @@ async def execute_test_case(
176194
break
177195

178196
logger.info(f"content:{response_message.content}")
197+
198+
# Validate response format if specified
199+
if test_case.response_format:
200+
format_failures = ResponseValidator.validate_json_schema_response(
201+
response_message.content, test_case.response_format, ts
202+
)
203+
if format_failures:
204+
ts.increment_failure(FailureReason.ANSWER)
205+
all_failures.extend(format_failures)
206+
179207
# Check message answer if expected
180208
if test_message_data.answer:
181209
failures = check_multiple_alternatives(
@@ -256,13 +284,18 @@ def e2e_test(
256284
test_case: TstCase = None,
257285
app_config_path: Path = None,
258286
model: str = None,
259-
models: List[str] = None,
287+
models_applicable_for_test: List[str] = None,
260288
refresh: bool = None,
261289
config_file_set: str = "e2e",
262290
runs: int = 3,
291+
no_cache: bool = False,
263292
):
264293
"""
265294
Decorator for end-to-end tests.
295+
296+
Args:
297+
no_cache: If True, bypass cache for this test. Can also be set globally via --no-cache CLI flag.
298+
CLI flag takes precedence over decorator parameter.
266299
"""
267300

268301
if refresh is None:
@@ -281,35 +314,44 @@ async def wrapper(request, recwarn, unique_port, *args, **kwargs):
281314
f"{test_case.name if test_case else request.node.name}"
282315
)
283316

284-
execution_model_list = models if models else []
285-
286-
if len(execution_model_list) == 0:
287-
if execution_model_list:
288-
execution_model_list.append(model)
289-
elif request.config.getoption("--model"):
290-
execution_model_list.append(request.config.getoption("--model"))
317+
model_to_use: str
318+
if model:
319+
model_to_use = model
320+
logger.debug(f"Using model from parameter defined in test: {model_to_use}")
321+
elif request.config.getoption("--model"):
322+
cli_model = request.config.getoption("--model")
323+
if models_applicable_for_test is None or len(
324+
models_applicable_for_test) == 0 or cli_model in models_applicable_for_test:
325+
model_to_use = cli_model
326+
logger.debug(f"Using model from CLI option: {model_to_use}")
291327
else:
292-
execution_model_list.append(TestConfig.DEFAULT_MODEL)
293-
294-
for m in execution_model_list:
295-
# Run the test multiple times according to the runs parameter
296-
ts = TestStats(f"{test_name}[{m}]", 0, 0)
297-
for run_index in range(runs):
298-
logger.info(f"Running test iteration {run_index + 1}/{runs}")
299-
failures = await prepare_and_execute_test(
300-
args,
301-
kwargs,
302-
recwarn,
303-
request,
304-
unique_port,
305-
execution_model=m,
306-
test_name=test_name,
307-
test_stats=ts,
308-
run_index=run_index,
309-
)
310-
all_runs_failures.extend(failures)
311-
logger.info(ts)
312-
report_test_stats(request.config, ts)
328+
logger.debug(
329+
f"Model '{cli_model}' is not in the applicable models list: {models_applicable_for_test}")
330+
pytest.skip(f"Model '{cli_model}' is not applicable for this test")
331+
else:
332+
logger.debug("No model specified")
333+
pytest.fail("No model specified for test")
334+
335+
336+
337+
# Run the test multiple times according to the runs parameter
338+
ts = TestStats(f"{test_name}[{model_to_use}]", 0, 0)
339+
for run_index in range(runs):
340+
logger.info(f"Running test iteration {run_index + 1}/{runs}")
341+
failures = await prepare_and_execute_test(
342+
args,
343+
kwargs,
344+
recwarn,
345+
request,
346+
unique_port,
347+
execution_model=model_to_use,
348+
test_name=test_name,
349+
test_stats=ts,
350+
run_index=run_index,
351+
)
352+
all_runs_failures.extend(failures)
353+
logger.info(ts)
354+
report_test_stats(request.config, ts)
313355

314356
# After all runs/models are complete, check if any failures occurred
315357
TestRunner.check_test_outcome(all_runs_failures)
@@ -334,13 +376,16 @@ async def prepare_and_execute_test(
334376

335377
client = TestClient(app)
336378

337-
no_cache = bool(request.config.getoption("--no-cache", default=False))
379+
# Combine CLI flag with decorator parameter - CLI takes precedence
380+
cli_no_cache = bool(request.config.getoption("--no-cache", default=False))
381+
effective_no_cache = cli_no_cache or no_cache
382+
338383
task, server, middleware = await TestRunner.start_server(
339384
model=execution_model,
340385
test_name=test_name,
341386
refresh=refresh,
342387
port=unique_port,
343-
no_cache=no_cache
388+
no_cache=effective_no_cache
344389
)
345390
try:
346391
run_failures, test_result = await execute_single_test_run(
@@ -362,13 +407,8 @@ async def prepare_and_execute_test(
362407

363408
finally:
364409
await TestRunner.stop_server(server, middleware)
365-
# Properly close the client
366-
if hasattr(client, "aclose"):
367-
await client.aclose()
368-
# Shutdown async generators
369-
loop = asyncio.get_event_loop()
370-
if loop.is_running():
371-
await loop.shutdown_asyncgens()
410+
# TestClient is synchronous and doesn't need async close
411+
# Don't shutdown async generators while loop is running
372412

373413
return wrapper
374414

src/tests/integration_tests/test_runner/models.py

Lines changed: 2 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -177,13 +177,15 @@ def __init__(
177177
name: str,
178178
description: str,
179179
similarity_threshold: float = SimilarityThreshold.DEFAULT.value,
180+
response_format: Dict[str, Any] | None = None,
180181
):
181182
self.name = name
182183
self.description = description
183184
self.messages: List[UserMessage] = []
184185
self.mock_date = datetime.date.today()
185186
self.similarity_threshold = similarity_threshold
186187
self.py_interpreter_session_flow = False
188+
self.response_format = response_format
187189

188190
def add_user_message(
189191
self,

src/tests/integration_tests/test_runner/utils/tool_names_with_hash.py

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -10,7 +10,7 @@ class ToolNames(Enum):
1010
ADD_SHAPE_TO_BOX = "add_shape_to_box_8d57"
1111
REMOVE_SHAPES_FROM_BOX = "remove_shapes_from_box_1feb"
1212
GET_SHAPES_FROM_BOX = "get_shapes_from_box_bdc8"
13-
INVERT_STRING = "InvertString_a160"
13+
INVERT_STRING = "InvertString_4cfe"
1414
LIST_FROM_WORD = "list_from_word_2309"
1515

1616

0 commit comments

Comments (0)