diff --git a/documentation/docs/get-started/setup.mdx b/documentation/docs/get-started/setup.mdx index 02f0eadad..97bde4ff5 100644 --- a/documentation/docs/get-started/setup.mdx +++ b/documentation/docs/get-started/setup.mdx @@ -45,6 +45,8 @@ Restart your Khoj server after the first run to ensure all settings are applied - Set `KHOJ_ADMIN_PASSWORD`, `KHOJ_DJANGO_SECRET_KEY` (and optionally the `KHOJ_ADMIN_EMAIL`) to something secure. This allows you to customize Khoj later via the admin panel. - Set `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, or `GEMINI_API_KEY` to your API key if you want to use OpenAI, Anthropic or Gemini commercial chat models respectively. - Uncomment `OPENAI_BASE_URL` to use [Ollama](/advanced/ollama?type=first-run&server=docker#setup) running on your host machine. Or set it to the URL of your OpenAI compatible API like vLLM or [LMStudio](/advanced/lmstudio). + - (Optional) Set `KHOJ_LLM_TIMEOUT_READ` to configure the read timeout for LLM API calls (default: `60` seconds). Useful for slower connections or larger models that take longer to respond. + - (Optional) Set `KHOJ_LLM_TIMEOUT_CONNECT` to configure the connection timeout for LLM API calls (default: `30` seconds). 3. Start Khoj by running the following command in the same directory as your docker-compose.yml file. ```shell cd ~/.khoj @@ -71,6 +73,8 @@ Restart your Khoj server after the first run to ensure all settings are applied - Set `KHOJ_ADMIN_PASSWORD`, `KHOJ_DJANGO_SECRET_KEY` (and optionally the `KHOJ_ADMIN_EMAIL`) to something secure. This allows you to customize Khoj later via the admin panel. - Set `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, or `GEMINI_API_KEY` to your API key if you want to use OpenAI, Anthropic or Gemini commercial chat models respectively. - Uncomment `OPENAI_BASE_URL` to use [Ollama](/advanced/ollama) running on your host machine. Or set it to the URL of your OpenAI compatible API like vLLM or [LMStudio](/advanced/lmstudio). 
+ - (Optional) Set `KHOJ_LLM_TIMEOUT_READ` to configure the read timeout for LLM API calls (default: `60` seconds). Useful for slower connections or larger models that take longer to respond. + - (Optional) Set `KHOJ_LLM_TIMEOUT_CONNECT` to configure the connection timeout for LLM API calls (default: `30` seconds). 3. Start Khoj by running the following command in the same directory as your docker-compose.yml file. ```shell # Windows users should use their WSL2 terminal to run these commands @@ -93,6 +97,8 @@ Restart your Khoj server after the first run to ensure all settings are applied - Set `KHOJ_ADMIN_PASSWORD`, `KHOJ_DJANGO_SECRET_KEY` (and optionally the `KHOJ_ADMIN_EMAIL`) to something secure. This allows you to customize Khoj later via the admin panel. - Set `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, or `GEMINI_API_KEY` to your API key if you want to use OpenAI, Anthropic or Gemini commercial chat models respectively. - Uncomment `OPENAI_BASE_URL` to use [Ollama](/advanced/ollama) running on your host machine. Or set it to the URL of your OpenAI compatible API like vLLM or [LMStudio](/advanced/lmstudio). + - (Optional) Set `KHOJ_LLM_TIMEOUT_READ` to configure the read timeout for LLM API calls (default: `60` seconds). Useful for slower connections or larger models that take longer to respond. + - (Optional) Set `KHOJ_LLM_TIMEOUT_CONNECT` to configure the connection timeout for LLM API calls (default: `30` seconds). 3. Start Khoj by running the following command in the same directory as your docker-compose.yml file. ```shell cd ~/.khoj diff --git a/src/khoj/processor/conversation/openai/utils.py b/src/khoj/processor/conversation/openai/utils.py index 3ebb2e62b..2fe76a9fb 100644 --- a/src/khoj/processor/conversation/openai/utils.py +++ b/src/khoj/processor/conversation/openai/utils.py @@ -60,6 +60,53 @@ MAX_COMPLETION_TOKENS = 16000
+def get_llm_timeout() -> httpx.Timeout:
+    """
+    Get the httpx.Timeout configuration for LLM API calls.
+
+    Supports environment variables (re-read on every call):
+    - KHOJ_LLM_TIMEOUT_READ: Read timeout in seconds (default: 60)
+    - KHOJ_LLM_TIMEOUT_CONNECT: Connection timeout in seconds (default: 30)
+
+    An unset or blank variable uses its default. A value that is not a
+    finite, positive number is ignored with a warning and the default is
+    used instead, so a typo in the environment cannot fail every LLM call.
+
+    The connection timeout is passed as the base httpx timeout, so it also
+    applies to the write and pool timeouts. Only the read timeout is overridden.
+
+    The API base URL is not considered here. Raise KHOJ_LLM_TIMEOUT_READ
+    when using slow, locally hosted models.
+
+    Returns:
+        httpx.Timeout configured with appropriate values
+    """
+    import logging
+    import math
+
+    def _seconds_from_env(name: str, default: float) -> float:
+        """Read a timeout in seconds from env var `name`, else use `default`."""
+        raw = os.getenv(name, "").strip()
+        if not raw:
+            # Unset or blank, e.g. `KHOJ_LLM_TIMEOUT_READ=` in an env file
+            return default
+        try:
+            seconds = float(raw)
+        except ValueError:
+            seconds = math.nan
+        # float() accepts "nan", "inf" and negatives; none is a usable timeout
+        if not math.isfinite(seconds) or seconds <= 0:
+            logging.getLogger(__name__).warning(
+                "Ignoring invalid %s=%r. Using default of %s seconds.", name, raw, default
+            )
+            return default
+        return seconds
+
+    connect_timeout = _seconds_from_env("KHOJ_LLM_TIMEOUT_CONNECT", 30.0)
+    read_timeout = _seconds_from_env("KHOJ_LLM_TIMEOUT_READ", 60.0)
+    return httpx.Timeout(connect_timeout, read=read_timeout)
+
+
 def _extract_text_for_instructions(content: Union[str, List, Dict, None]) -> str: """Extract plain text from a message content suitable for Responses API instructions.""" if content is None: @@ -158,7 +174,6 @@ def completion_with_backoff( elif is_groq_api(api_base_url): model_kwargs["service_tier"] = "auto" - read_timeout = 300 if is_local_api(api_base_url) else 60 if os.getenv("KHOJ_LLM_SEED"): model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED")) @@ -171,7 +186,7 @@ def completion_with_backoff( with client.beta.chat.completions.stream( messages=formatted_messages, # type: ignore model=model_name, - timeout=httpx.Timeout(30, read=read_timeout), + timeout=get_llm_timeout(), **model_kwargs, ) as chat: for chunk in stream_processor(chat): @@ -215,7 +230,7 @@ def completion_with_backoff( chunk = client.beta.chat.completions.parse( messages=formatted_messages, # type: ignore model=model_name, - timeout=httpx.Timeout(30, read=read_timeout), + timeout=get_llm_timeout(), **model_kwargs, ) aggregated_response = chunk.choices[0].message.content @@ -360,7 +375,6 @@ async def chat_completion_with_backoff( elif is_groq_api(api_base_url): model_kwargs["service_tier"] = "auto" - read_timeout = 300 if is_local_api(api_base_url) else 60 if os.getenv("KHOJ_LLM_SEED"): model_kwargs["seed"] = int(os.getenv("KHOJ_LLM_SEED")) @@ -373,7 +387,7 @@ async def chat_completion_with_backoff( model=model_name, stream=stream, temperature=temperature, - timeout=httpx.Timeout(30, read=read_timeout), + timeout=get_llm_timeout(), **model_kwargs, ) if not stream: @@ -494,15 +508,13 
@@ def responses_completion_with_backoff( model_kwargs.pop("top_p", None) model_kwargs.pop("stop", None) - read_timeout = 300 if is_local_api(api_base_url) else 60 - # Stream and aggregate model_response: OpenAIResponse = client.responses.create( input=formatted_messages, instructions=instructions, model=model_name, temperature=temperature, - timeout=httpx.Timeout(30, read=read_timeout), # type: ignore + timeout=get_llm_timeout(), # type: ignore store=False, **model_kwargs, ) @@ -607,8 +619,6 @@ async def responses_chat_completion_with_backoff( model_kwargs.pop("top_p", None) model_kwargs.pop("stop", None) - read_timeout = 300 if is_local_api(api_base_url) else 60 - aggregated_text = "" last_final: Optional[OpenAIResponse] = None # Tool call assembly buffers @@ -621,7 +631,7 @@ async def responses_chat_completion_with_backoff( instructions=instructions, model=model_name, temperature=temperature, - timeout=httpx.Timeout(30, read=read_timeout), + timeout=get_llm_timeout(), **model_kwargs, ) as stream: # type: ignore async for event in stream: # type: ignore diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py index 79339c829..12dade41c 100644 --- a/src/khoj/routers/api_chat.py +++ b/src/khoj/routers/api_chat.py @@ -1708,8 +1708,11 @@ async def delayed_flush(): await asyncio.sleep(BUFFER_FLUSH_INTERVAL) # Check if there's still content to flush chunks = "".join([chunk async for chunk in flush_message_buffer()]) - await websocket.send_text(chunks) - await websocket.send_text(ChatEvent.END_EVENT.value) + try: + await websocket.send_text(chunks) + await websocket.send_text(ChatEvent.END_EVENT.value) + except RuntimeError: + pass # WebSocket already closed # Flush buffer if no new messages arrive within debounce interval message_buffer.timeout = asyncio.create_task(delayed_flush()) @@ -1717,8 +1720,11 @@ async def delayed_flush(): logger.debug(f"Chat request cancelled for user {websocket.scope['user'].object.id}") raise except Exception as e: - await 
websocket.send_text(json.dumps({"error": "Internal server error"})) logger.error(f"Error processing chat request: {e}", exc_info=True) + try: + await websocket.send_text(json.dumps({"error": "Internal server error"})) + except RuntimeError: + pass # WebSocket already closed raise