@@ -94,10 +94,10 @@ def get_config():
9494 API_KEY = os .environ .get ("OPENAI_API_KEY" )
9595 base_url = server_config ['base_url' ]
9696 if base_url != "" :
97- default_client = OpenAI (api_key = API_KEY , base_url = base_url )
97+ default_client = OpenAI (api_key = API_KEY , base_url = base_url , http_client = http_client )
9898 logger .info (f"Created OpenAI client with base_url: { base_url } " )
9999 else :
100- default_client = OpenAI (api_key = API_KEY )
100+ default_client = OpenAI (api_key = API_KEY , http_client = http_client )
101101 logger .info ("Created OpenAI client without base_url" )
102102 elif os .environ .get ("AZURE_OPENAI_API_KEY" ):
103103 API_KEY = os .environ .get ("AZURE_OPENAI_API_KEY" )
@@ -189,6 +189,7 @@ def count_reasoning_tokens(text: str, tokenizer=None) -> int:
189189 'base_url' : '' ,
190190 'optillm_api_key' : '' ,
191191 'return_full_response' : False ,
192+ 'host' : '127.0.0.1' , # Default to localhost for security; use 0.0.0.0 to allow external connections
192193 'port' : 8000 ,
193194 'log' : 'info' ,
194195 'ssl_verify' : True ,
@@ -396,9 +397,9 @@ def execute_single_approach(approach, system_prompt, initial_query, client, mode
396397 if approach == 'none' :
397398 # Use the request_config that was already prepared and passed to this function
398399 kwargs = request_config .copy () if request_config else {}
399-
400+
400401 # Remove items that are handled separately by the framework
401- kwargs . pop ( 'n' , None ) # n is handled by execute_n_times
402+ # Note: 'n' is NOT removed - the none_approach passes it to the client which handles multiple completions
402403 kwargs .pop ('stream' , None ) # stream is handled by proxy()
403404
404405 # Reconstruct original messages from system_prompt and initial_query
@@ -408,6 +409,7 @@ def execute_single_approach(approach, system_prompt, initial_query, client, mode
408409 if initial_query :
409410 messages .append ({"role" : "user" , "content" : initial_query })
410411
412+ logger .debug (f"none_approach kwargs: { kwargs } " )
411413 response = none_approach (original_messages = messages , client = client , model = model , request_id = request_id , ** kwargs )
412414 # For none approach, we return the response and a token count of 0
413415 # since the full token count is already in the response
@@ -546,17 +548,29 @@ def execute_n_times(n: int, approaches, operation: str, system_prompt: str, init
546548 return responses , total_tokens
547549
548550def generate_streaming_response (final_response , model ):
549- # Yield the final response
551+ # Generate a unique response ID
552+ response_id = f"chatcmpl-{ int (time .time ()* 1000 )} "
553+ created = int (time .time ())
554+
555+ # Yield the final response with OpenAI-compatible format
550556 if isinstance (final_response , list ):
551557 for index , response in enumerate (final_response ):
558+ # First chunk includes role
552559 yield "data: " + json .dumps ({
553- "choices" : [{"delta" : {"content" : response }, "index" : index , "finish_reason" : "stop" }],
560+ "id" : response_id ,
561+ "object" : "chat.completion.chunk" ,
562+ "created" : created ,
554563 "model" : model ,
564+ "choices" : [{"delta" : {"role" : "assistant" , "content" : response }, "index" : index , "finish_reason" : "stop" }],
555565 }) + "\n \n "
556566 else :
567+ # First chunk includes role
557568 yield "data: " + json .dumps ({
558- "choices" : [{"delta" : {"content" : final_response }, "index" : 0 , "finish_reason" : "stop" }],
569+ "id" : response_id ,
570+ "object" : "chat.completion.chunk" ,
571+ "created" : created ,
559572 "model" : model ,
573+ "choices" : [{"delta" : {"role" : "assistant" , "content" : final_response }, "index" : 0 , "finish_reason" : "stop" }],
560574 }) + "\n \n "
561575
562576 # Yield the final message to indicate the stream has ended
@@ -987,6 +1001,7 @@ def parse_args():
9871001 ("--rstar-c" , "OPTILLM_RSTAR_C" , float , 1.4 , "Exploration constant for rStar algorithm" ),
9881002 ("--n" , "OPTILLM_N" , int , 1 , "Number of final responses to be returned" ),
9891003 ("--return-full-response" , "OPTILLM_RETURN_FULL_RESPONSE" , bool , False , "Return the full response including the CoT with <thinking> tags" ),
1004+ ("--host" , "OPTILLM_HOST" , str , "127.0.0.1" , "Host address to bind the server to (use 0.0.0.0 to allow external connections)" ),
9901005 ("--port" , "OPTILLM_PORT" , int , 8000 , "Specify the port to run the proxy" ),
9911006 ("--log" , "OPTILLM_LOG" , str , "info" , "Specify the logging level" , list (logging_levels .keys ())),
9921007 ("--launch-gui" , "OPTILLM_LAUNCH_GUI" , bool , False , "Launch a Gradio chat interface" ),
@@ -1263,7 +1278,8 @@ def process_batch_requests(batch_requests):
12631278 import gradio as gr
12641279 # Start server in a separate thread
12651280 import threading
1266- server_thread = threading .Thread (target = app .run , kwargs = {'host' : '0.0.0.0' , 'port' : port })
1281+ host = server_config ['host' ]
1282+ server_thread = threading .Thread (target = app .run , kwargs = {'host' : host , 'port' : port })
12671283 server_thread .daemon = True
12681284 server_thread .start ()
12691285
@@ -1310,12 +1326,12 @@ def chat_with_optillm(message, history):
13101326 description = f"Connected to OptILLM proxy at { base_url } "
13111327 )
13121328 demo .queue () # Enable queue to handle long operations properly
1313- demo .launch (server_name = "0.0.0.0" , share = False )
1329+ demo .launch (server_name = host , share = False )
13141330 except ImportError :
13151331 logger .error ("Gradio is required for GUI. Install it with: pip install gradio" )
13161332 return
13171333
1318- app .run (host = '0.0.0.0' , port = port )
1334+ app .run (host = server_config [ 'host' ] , port = port )
13191335
13201336if __name__ == "__main__" :
13211337 main ()
0 commit comments