Description
Thanks for open sourcing such great code. I'm getting confusing results when running tests: ttft (profile_info.first_token_time - profile_info.registration_time) is negative! I'm wondering if there's something wrong with my parameter settings. Here is my code, running on 8x 24GB RTX 3090 GPUs, testing pipeline parallelism of degree 8 and tensor parallelism of degree 8 respectively (the script below shows the tensor-parallel configuration; the pipeline-parallel variant is sketched after it).
docker run --gpus all -it --rm --shm-size=256g \
-v ~/flexflow:/workspace \
-v /data/share/flexflowcache:/models \
-v ~/flexflow-newcode:/opt/conda/lib/python3.12/site-packages/flexflow \
-v /data/share:/modelss \
ghcr.io/flexflow/flexflow-cuda-12.2:latest
import flexflow.serve as ff

ff.init(
    num_gpus=8,
    memory_per_gpu=23000,
    zero_copy_memory_per_node=230000,
    tensor_parallelism_degree=8,
    pipeline_parallelism_degree=1,
)

output_file = f"/workspace/result/test.json"

llm = ff.LLM(
    model_name="meta-llama/Llama-2-70b-chat-hf",
    cache_path="/models",
    output_file=output_file,
)

ssms = []
ssm = ff.SSM(
    model_name="JackFram/llama-68m",
    cache_path="/models",
    output_file=output_file,
)
ssms.append(ssm)

generation_config = ff.GenerationConfig(
    do_sample=False, temperature=0.9, topp=0.8, topk=1
)

for ssm in ssms:
    ssm.compile(
        generation_config,
        max_requests_per_batch=1,
        max_seq_length=512,
        max_tokens_per_batch=512,
    )

llm.compile(
    generation_config,
    max_requests_per_batch=1,
    max_seq_length=512,
    max_tokens_per_batch=512,
    ssms=ssms,
)

llm.start_server()
prompts = [""]  # here is a list of some prompts
llm.generate(prompts)
llm.stop_server()
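For the pipeline-parallelism test I assume only the parallelism degrees in ff.init need to change and the rest of the script stays the same (that is my understanding, please correct me if other settings also need to change):

# Pipeline-parallel variant (assumption: only the degrees differ from the
# tensor-parallel script above; everything else is identical).
ff.init(
    num_gpus=8,
    memory_per_gpu=23000,
    zero_copy_memory_per_node=230000,
    tensor_parallelism_degree=1,
    pipeline_parallelism_degree=8,
)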
This is one of the outputs I get:
{
"benchmarking tokens": -1,
"input_tokens": "1,529,29879,24566,25580,29962,3532,14816,29903,6778,13,3492,526,263,8444,20255,29889,13,29966,829,14816,29903,6778,13,13,26029,29901,11249,3017,12569,3377,297,12569,363,15561,1454,13561,6509,5687,293,3534,292,773,6483,29899,29984,6509,29889,13,22550,278,5199,29915,29879,1139,2038,297,278,3030,310,278,3517,14983,518,29914,25580,29962",
"latency": 92884.467,
"max_length": 511,
"num_decoding_steps": 312,
"output_tokens": "29871,18585,29892,1244,29915,29879,263,4559,5132,12569,3377,773,360,1161,363,15561,1454,13561,6509,5687,293,3534,292,773,6483,29899,29984,6509,29901,13,28956,4691,13,5215,12569,13,5215,12569,29918,3221,29918,14036,408,270,617,13,5215,12569,29918,1420,29918,14036,408,3472,13,3166,12569,29889,22594,1053,10567,29892,10604,29892,4306,13,5215,12655,408,7442,13,5215,26110,408,15886,13,13,932,353,12569,29889,29928,1161,22168,978,1649,29897,13,13,932,29889,2680,353,3472,29889,12596,4197,13,1678,396,18527,13,1678,3472,29889,29950,29896,877,1123,262,1454,13561,29257,29068,293,1605,9382,360,1161,3377,5477,13,13,1678,396,10567,29879,363,6694,11266,16744,13,1678,3472,29889,29950,29906,877,5323,2827,26078,16744,5477,13,1678,3472,29889,4775,877,29931,799,1076,390,403,29901,5477,13,1678,3472,29889,4290,29898,333,2433,21891,29918,10492,742,995,29922,29900,29889,29900,29900,29896,511,13,1678,3472,29889,4775,877,4205,2798,383,7168,29901,5477,13,1678,3472,29889,4290,29898,333,2433,2218,2798,29918,19790,742,995,29922,29900,29889,29929,29945,511,13,1678,3472,29889,4775,877,4557,310,20981,2631,29901,5477,13,1678,3472,29889,4290,29898,333,2433,1022,275,2631,742,995,29922,29896,29900,29900,29900,511,13,13,1678,396,26101,2826,13,1678,3472,29889,3125,877,5323,262,8125,742,1178,2433,14968,29899,3092,5477,13,13,1678,396,10604,29879,13,1678,3472,29889,12596,29898,333,2433,4905,1495,13,2314,13,13,29937,22402,6939,740,304,7945,278,1904,746,278,2826,338,11484,13,29992,932,29889,14035,29898,13,1678,10604,877,4905,742,525,11991,5477,13,1678,10567,877,14968,29899,3092,742,525,29876,29918,3808,29879,1495,13,29897,13,1753,7945,29918,4299,29898,29876,29918,3808,29879,1125,13,1678,396,2538,300,278,1962,1933,13,1678,1962,353,3472,29889,12596,4197,2314,13,13,1678,396,28186,278,1904,773,6483,29899,29984,6509,13,1678,1904,353,15886,29889,3946,294,29889,9794,29889,16941,2556,4197,13,4706,15886,29889,3946,294,29889,29277,29889,29928,1947,29898,29953,29946,29892,26229,2433,2674,29884,742,1881,29918,12181,7607,29896,29900,29892,8243,13,4706,15886,29889,3946,294,29889,29277,29889,29928,1947,29898,29953,29946,29892,26229,2433,2674,29884,5477,13,4706,15886,29889,3946,294,29889,29277,29889,29928,1947,29898,29896,29897,13,268,2314",
"prompt": "<s> <s>[INST] <<SYS>>\nYou are a helpful assistant.\n<</SYS>>\n\nhuman:sample python dashboard in dash for reinforcement learning algorithmic trading using deep-Q learning.\nAnswer the human's question above in the context of the previous conversation [/INST]",
"prompt_length": 67,
"req_idx": 0,
"req_type": "inference",
"response": " Sure, here's a sample Python dashboard using Dash for reinforcement learning algorithmic trading using deep-Q learning:\n```python\nimport dash\nimport dash_core_components as dcc\nimport dash_html_components as html\nfrom dash.dependencies import Input, Output, State\nimport numpy as np\nimport tensorflow as tf\n\napp = dash.Dash(__name__)\n\napp.layout = html.Div([\n # Title\n html.H1('Reinforcement Learning Algorithmic Trading Dashboard'),\n\n # Inputs for training hyperparameters\n html.H2('Training Hyperparameters'),\n html.Label('Learning Rate:'),\n html.Input(id='learning_rate', value=0.001),\n html.Label('Discount Factor:'),\n html.Input(id='discount_factor', value=0.95),\n html.Label('Number of Episodes:'),\n html.Input(id='episodes', value=1000),\n\n # Training button\n html.Button('Train Model', id='train-button'),\n\n # Outputs\n html.Div(id='output')\n])\n\n# Define callback function to train the model when the button is clicked\[email protected](\n Output('output', 'children'),\n Input('train-button', 'n_clicks')\n)\ndef train_model(n_clicks):\n # Reset the output div\n output = html.Div([])\n\n # Train the model using deep-Q learning\n model = tf.keras.models.Sequential([\n tf.keras.layers.Dense(64, activation='relu', input_shape=(10,)),\n tf.keras.layers.Dense(64, activation='relu'),\n tf.keras.layers.Dense(1)\n ])",
"response_length": 444,
"ttft": -1441.065,
"warmup": false
},
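To make sure I'm not misreading the numbers, I scan the output file for negative ttft values with a small script. This assumes the output file is a JSON list of entries shaped like the one above; I'm not sure that is the exact on-disk format, so please adjust if the real layout differs.

import json

# Assumption: /workspace/result/test.json is a JSON list of per-request
# entries like the example above.
with open("/workspace/result/test.json") as f:
    results = json.load(f)

for entry in results:
    if entry.get("req_type") != "inference":
        continue
    ttft = entry["ttft"]
    if ttft < 0:
        print(f"req {entry['req_idx']}: negative ttft = {ttft:.3f}")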
Also, I would like to know how to measure the per-token latency for a whole batch of multiple requests. It seems that from the output file we can only get the per-token latency for each individual prompt, not for the batch as a whole. Should I add other parameters so that I can get the per-token latency of the whole batch?
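If there is no built-in option, my current workaround is to aggregate the per-request entries myself, roughly like this (assumptions: "latency" is end-to-end per request, "num_decoding_steps" counts the generated tokens, and requests in one batch run concurrently so the slowest request approximates the batch wall time; please correct me if any of that is wrong):

import json

# Rough batch-level per-token latency from the per-request entries.
# Assumption: fields as in the example entry above; warmup requests excluded.
with open("/workspace/result/test.json") as f:
    results = json.load(f)

inference = [r for r in results
             if r.get("req_type") == "inference" and not r.get("warmup", False)]

batch_wall_time = max(r["latency"] for r in inference)
total_decoded = sum(r["num_decoding_steps"] for r in inference)

print("requests in batch:", len(inference))
print("total decoded tokens:", total_decoded)
print("approx. per-token latency over the whole batch:", batch_wall_time / total_decoded)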