6 changes: 6 additions & 0 deletions documentation/docs/get-started/setup.mdx
@@ -45,6 +45,8 @@ Restart your Khoj server after the first run to ensure all settings are applied
- Set `KHOJ_ADMIN_PASSWORD`, `KHOJ_DJANGO_SECRET_KEY` (and optionally the `KHOJ_ADMIN_EMAIL`) to something secure. This allows you to customize Khoj later via the admin panel.
- Set `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, or `GEMINI_API_KEY` to your API key if you want to use OpenAI, Anthropic or Gemini commercial chat models respectively.
- Uncomment `OPENAI_BASE_URL` to use [Ollama](/advanced/ollama?type=first-run&server=docker#setup) running on your host machine. Or set it to the URL of your OpenAI compatible API like vLLM or [LMStudio](/advanced/lmstudio).
- (Optional) Set `KHOJ_LLM_TIMEOUT_READ` to configure the read timeout for LLM API calls (default: `60` seconds). Useful for slower connections or larger models that take longer to respond.
- (Optional) Set `KHOJ_LLM_TIMEOUT_CONNECT` to configure the connection timeout for LLM API calls (default: `30` seconds).
3. Start Khoj by running the following command in the same directory as your docker-compose.yml file.
```shell
cd ~/.khoj
@@ -71,6 +73,8 @@ Restart your Khoj server after the first run to ensure all settings are applied
- Set `KHOJ_ADMIN_PASSWORD`, `KHOJ_DJANGO_SECRET_KEY` (and optionally the `KHOJ_ADMIN_EMAIL`) to something secure. This allows you to customize Khoj later via the admin panel.
- Set `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, or `GEMINI_API_KEY` to your API key if you want to use OpenAI, Anthropic or Gemini commercial chat models respectively.
- Uncomment `OPENAI_BASE_URL` to use [Ollama](/advanced/ollama) running on your host machine. Or set it to the URL of your OpenAI compatible API like vLLM or [LMStudio](/advanced/lmstudio).
- (Optional) Set `KHOJ_LLM_TIMEOUT_READ` to configure the read timeout for LLM API calls (default: `60` seconds). Useful for slower connections or larger models that take longer to respond.
- (Optional) Set `KHOJ_LLM_TIMEOUT_CONNECT` to configure the connection timeout for LLM API calls (default: `30` seconds).
3. Start Khoj by running the following command in the same directory as your docker-compose.yml file.
```shell
# Windows users should use their WSL2 terminal to run these commands
@@ -93,6 +97,8 @@ Restart your Khoj server after the first run to ensure all settings are applied
- Set `KHOJ_ADMIN_PASSWORD`, `KHOJ_DJANGO_SECRET_KEY` (and optionally the `KHOJ_ADMIN_EMAIL`) to something secure. This allows you to customize Khoj later via the admin panel.
- Set `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, or `GEMINI_API_KEY` to your API key if you want to use OpenAI, Anthropic or Gemini commercial chat models respectively.
- Uncomment `OPENAI_BASE_URL` to use [Ollama](/advanced/ollama) running on your host machine. Or set it to the URL of your OpenAI compatible API like vLLM or [LMStudio](/advanced/lmstudio).
- (Optional) Set `KHOJ_LLM_TIMEOUT_READ` to configure the read timeout for LLM API calls (default: `60` seconds). Useful for slower connections or larger models that take longer to respond.
- (Optional) Set `KHOJ_LLM_TIMEOUT_CONNECT` to configure the connection timeout for LLM API calls (default: `30` seconds).
3. Start Khoj by running the following command in the same directory as your docker-compose.yml file.
```shell
cd ~/.khoj
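The bullets added to setup.mdx above describe the two new variables. A hypothetical `.env` fragment showing how they might be set for a slow local model (the values here are illustrative, not defaults from the PR):

```shell
# Raise the read timeout for a slow local model; keep the connect default.
# KHOJ_LLM_TIMEOUT_READ / KHOJ_LLM_TIMEOUT_CONNECT are the variables this PR adds.
export KHOJ_LLM_TIMEOUT_READ=300
export KHOJ_LLM_TIMEOUT_CONNECT=30
```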
32 changes: 21 additions & 11 deletions src/khoj/processor/conversation/openai/utils.py
@@ -60,6 +60,22 @@
MAX_COMPLETION_TOKENS = 16000


def get_llm_timeout() -> httpx.Timeout:
"""
Get the httpx.Timeout configuration for LLM API calls.

Supports environment variables:
- KHOJ_LLM_TIMEOUT_READ: Read timeout (default: 60)
- KHOJ_LLM_TIMEOUT_CONNECT: Connection timeout (default: 30)

Returns:
httpx.Timeout configured with appropriate values
"""
connect_timeout = float(os.getenv("KHOJ_LLM_TIMEOUT_CONNECT", "30"))
read_timeout = float(os.getenv("KHOJ_LLM_TIMEOUT_READ", "60"))
Member:

Should we still set a longer default LLM read timeout when using local AI models?

Suggested change:

-    read_timeout = float(os.getenv("KHOJ_LLM_TIMEOUT_READ", "60"))
+    default_read_timeout = 300 if is_local_api(api_base_url) else 60
+    read_timeout = float(os.getenv("KHOJ_LLM_TIMEOUT_READ", default_read_timeout))

Note: The function description comment above and the setup.mdx updates will need to be updated to reflect the updated defaults (i.e. the read timeout will be 300 for local AI, 60 otherwise).

Author:

IMO having a method like `is_local_api` is not a good approach. It's really hard to define what is "local" and what isn't. In my case I run the LLM in a separate Docker container, and that code only checks for localhost or 127.0.0.1. On the other end, I have some models on Azure AI Foundry with really small rate limits, which constantly produce responses after the 60-second mark. This should be defined here as a global variable (and I know that is something of a breaking change), but it would be much more convenient for new users. A possible next step would be to add a timeout setting in the UI when defining a model.

Author:

Of course it's up to you, and I'll be fine whether the timeout is set to 300 or not. It's just really confusing that the code has to check for a specific domain in order for the application to behave differently.

return httpx.Timeout(connect_timeout, read=read_timeout)


def _extract_text_for_instructions(content: Union[str, List, Dict, None]) -> str:
"""Extract plain text from a message content suitable for Responses API instructions."""
if content is None:
@@ -158,7 +174,6 @@ def completion_with_backoff(
elif is_groq_api(api_base_url):
model_kwargs["service_tier"] = "auto"

read_timeout = 300 if is_local_api(api_base_url) else 60
if os.getenv("KHOJ_LLM_SEED"):
model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))

@@ -171,7 +186,7 @@ def completion_with_backoff(
with client.beta.chat.completions.stream(
messages=formatted_messages, # type: ignore
model=model_name,
timeout=httpx.Timeout(30, read=read_timeout),
timeout=get_llm_timeout(),
**model_kwargs,
) as chat:
for chunk in stream_processor(chat):
@@ -215,7 +230,7 @@ def completion_with_backoff(
chunk = client.beta.chat.completions.parse(
messages=formatted_messages, # type: ignore
model=model_name,
timeout=httpx.Timeout(30, read=read_timeout),
timeout=get_llm_timeout(),
**model_kwargs,
)
aggregated_response = chunk.choices[0].message.content
@@ -360,7 +375,6 @@ async def chat_completion_with_backoff(
elif is_groq_api(api_base_url):
model_kwargs["service_tier"] = "auto"

read_timeout = 300 if is_local_api(api_base_url) else 60
if os.getenv("KHOJ_LLM_SEED"):
model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED"))

@@ -373,7 +387,7 @@ async def chat_completion_with_backoff(
model=model_name,
stream=stream,
temperature=temperature,
timeout=httpx.Timeout(30, read=read_timeout),
timeout=get_llm_timeout(),
**model_kwargs,
)
if not stream:
@@ -494,15 +508,13 @@ def responses_completion_with_backoff(
model_kwargs.pop("top_p", None)
model_kwargs.pop("stop", None)

read_timeout = 300 if is_local_api(api_base_url) else 60

# Stream and aggregate
model_response: OpenAIResponse = client.responses.create(
input=formatted_messages,
instructions=instructions,
model=model_name,
temperature=temperature,
timeout=httpx.Timeout(30, read=read_timeout), # type: ignore
timeout=get_llm_timeout(), # type: ignore
store=False,
**model_kwargs,
)
@@ -607,8 +619,6 @@ async def responses_chat_completion_with_backoff(
model_kwargs.pop("top_p", None)
model_kwargs.pop("stop", None)

read_timeout = 300 if is_local_api(api_base_url) else 60

aggregated_text = ""
last_final: Optional[OpenAIResponse] = None
# Tool call assembly buffers
@@ -621,7 +631,7 @@
instructions=instructions,
model=model_name,
temperature=temperature,
timeout=httpx.Timeout(30, read=read_timeout),
timeout=get_llm_timeout(),
**model_kwargs,
) as stream: # type: ignore
async for event in stream: # type: ignore
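The `get_llm_timeout()` helper added above reads both environment variables with fallbacks. A standalone sketch of that logic (the `resolve_llm_timeouts` name and the plain tuple return are stand-ins here, dropping the `httpx.Timeout` wrapper so the sketch runs without third-party dependencies):

```python
import os

def resolve_llm_timeouts() -> tuple:
    """Sketch of get_llm_timeout(): env vars override the PR's defaults."""
    connect = float(os.getenv("KHOJ_LLM_TIMEOUT_CONNECT", "30"))
    read = float(os.getenv("KHOJ_LLM_TIMEOUT_READ", "60"))
    return connect, read

# With neither variable set, the defaults from the PR apply.
os.environ.pop("KHOJ_LLM_TIMEOUT_CONNECT", None)
os.environ.pop("KHOJ_LLM_TIMEOUT_READ", None)
print(resolve_llm_timeouts())  # (30.0, 60.0)

# A longer read timeout, e.g. for a slow model behind OPENAI_BASE_URL.
os.environ["KHOJ_LLM_TIMEOUT_READ"] = "300"
print(resolve_llm_timeouts())  # (30.0, 300.0)
```

In the real helper the tuple becomes `httpx.Timeout(connect_timeout, read=read_timeout)`, so the connect value also covers write and pool timeouts unless overridden.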
12 changes: 9 additions & 3 deletions src/khoj/routers/api_chat.py
@@ -1708,17 +1708,23 @@ async def delayed_flush():
await asyncio.sleep(BUFFER_FLUSH_INTERVAL)
# Check if there's still content to flush
chunks = "".join([chunk async for chunk in flush_message_buffer()])
await websocket.send_text(chunks)
await websocket.send_text(ChatEvent.END_EVENT.value)
try:
await websocket.send_text(chunks)
await websocket.send_text(ChatEvent.END_EVENT.value)
except RuntimeError:
pass # WebSocket already closed

Comment on lines -1711 to 1716

Member:

Is this try/except in the delayed_flush function really required? What happens without it?

Author:

My app froze and became unresponsive. I've been testing this fix all day, and it resolves the issue. Neither the frontend nor GET on / responded, so my Kubernetes killed the app. It made a really bad first impression of the app, since I'm using a rather slow offline model.
# Flush buffer if no new messages arrive within debounce interval
message_buffer.timeout = asyncio.create_task(delayed_flush())
except asyncio.CancelledError:
logger.debug(f"Chat request cancelled for user {websocket.scope['user'].object.id}")
raise
except Exception as e:
await websocket.send_text(json.dumps({"error": "Internal server error"}))
logger.error(f"Error processing chat request: {e}", exc_info=True)
try:
await websocket.send_text(json.dumps({"error": "Internal server error"}))
except RuntimeError:
pass # WebSocket already closed
raise


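Both hunks in this file wrap `send_text` in `try`/`except RuntimeError` so a late buffer flush cannot crash the task after the client disconnects. A minimal sketch of that guard, using a hypothetical `ClosedWebSocket` stand-in rather than Starlette's actual class (which raises `RuntimeError` when send is called after close):

```python
import asyncio

class ClosedWebSocket:
    """Stand-in for a WebSocket whose client has already disconnected."""
    async def send_text(self, text: str) -> None:
        # Mimic Starlette's behavior: sending after close raises RuntimeError.
        raise RuntimeError('Cannot call "send" once a close message has been sent.')

async def safe_send(ws, text: str) -> bool:
    """The PR's pattern: swallow RuntimeError instead of crashing the task."""
    try:
        await ws.send_text(text)
        return True
    except RuntimeError:
        return False  # WebSocket already closed

print(asyncio.run(safe_send(ClosedWebSocket(), "late chunk")))  # False
```

Without the guard, the exception escapes the background `delayed_flush` task, which matches the author's report of the server becoming unresponsive after a disconnect.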