@@ -318,23 +318,28 @@ def make_argument_parser() -> argparse.ArgumentParser:
318318 type = str ,
319319 nargs = "+" ,
320320 choices = ["None" , "triton" , "fa3" , "flashinfer" ],
321- default = ["None" ],
322- help = """prefill attention kernel used in llm""" ,
321+ default = ["triton" ],
322+ help = """prefill attention kernel used in llm.
323+ None: automatically select backend based on current GPU device,
324+ not supported yet, will support in future""" ,
323325 )
324326 parser .add_argument (
325327 "--llm_decode_att_backend" ,
326328 type = str ,
327329 nargs = "+" ,
328330 choices = ["None" , "triton" , "fa3" , "flashinfer" ],
329- default = ["None" ],
330- help = """decode attention kernel used in llm""" ,
331+ default = ["triton" ],
332+ help = """decode attention kernel used in llm.
333+ None: automatically select backend based on current GPU device,
334+ not supported yet, will support in future""" ,
331335 )
332336 parser .add_argument (
333337 "--llm_kv_type" ,
334338 type = str ,
335- choices = ["None" , "int8kv" , "int4kv" , "fp8kv" ],
339+ choices = ["None" , "int8kv" , "int4kv" ],
336340 default = "None" ,
337- help = """kv type used in llm, None for dtype that llm used in config.json""" ,
341+ help = """kv type used in llm, None for dtype that llm used in config.json.
342+ fp8kv: not fully supported yet, will support in future""" ,
338343 )
339344 parser .add_argument (
340345 "--llm_kv_quant_group_size" ,
0 commit comments