 #include <executorch/runtime/platform/log.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
 
+#include <algorithm>
 #include <cinttypes>
 #include <fstream>
 #include <string>
@@ -130,6 +131,19 @@ int main(int argc, char** argv) {
     return 1;
   }
 
+#ifdef EXECUTORCH_BUILD_CUDA
+  // GPU memory: before load
+  {
+    size_t free = 0, total = 0;
+    if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
+      stats.gpu_total_bytes = total;
+      stats.gpu_free_before_load_bytes = free;
+    }
+  }
+#endif
+
+  stats.model_load_start_ms = llm::time_in_ms();
+
   // Create Module with share_memory_arenas=true so prefill and decode
   // share mutable buffers (KV cache, conv_state, recurrent_state).
   std::vector<std::string> data_files;
@@ -212,11 +226,15 @@ int main(int argc, char** argv) {
 
   stats.model_load_end_ms = llm::time_in_ms();
 
-#ifdef EXECUTORCH_BUILD_CUDA
-  // GPU memory after load
-  cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
-  stats.gpu_free_after_load_bytes = gpu_free_bytes;
-#endif
+#ifdef EXECUTORCH_BUILD_CUDA
+  // GPU memory: after load
+  {
+    size_t free = 0, total = 0;
+    if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
+      stats.gpu_free_after_load_bytes = free;
+    }
+  }
+#endif
 
   // Get EOS ids
   auto eos_ids = llm::get_eos_ids(tokenizer.get(), module.get());
@@ -263,6 +281,9 @@ int main(int argc, char** argv) {
       from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float);
 #endif
 
+  stats.inference_start_ms = llm::time_in_ms();
+  stats.num_prompt_tokens = num_prompt_tokens;
+
   // ---------------------------------------------------------------
   // Prefill
   // ---------------------------------------------------------------
@@ -306,14 +327,14 @@ int main(int argc, char** argv) {
   cur_token = read_token(prefill_outputs[0].toTensor());
 
   stats.prompt_eval_end_ms = llm::time_in_ms();
-
+  stats.first_token_ms = stats.prompt_eval_end_ms;
   double prefill_ms =
       (double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
   printf(
       "Prefill: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n",
       num_prompt_tokens,
       prefill_ms,
-      num_prompt_tokens * 1000.0 / prefill_ms);
+      num_prompt_tokens / prefill_ms * stats.SCALING_FACTOR_UNITS_PER_SECOND);
 
 #ifdef EXECUTORCH_BUILD_CUDA
   // Synchronize CUDA device to ensure prefill's writes to shared mutable
@@ -380,24 +401,108 @@ int main(int argc, char** argv) {
   int64_t num_generated = pos - num_prompt_tokens;
   stats.num_generated_tokens = num_generated;
 
+#ifdef EXECUTORCH_BUILD_CUDA
+  // GPU memory: after generate + peak usage
+  {
+    size_t free = 0, total = 0;
+    if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
+      stats.gpu_free_after_generate_bytes = free;
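+      // Peak usage = total minus the smallest free-memory reading observed.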
+      size_t min_free = free;
+      if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
+        min_free = std::min(min_free, (size_t)stats.gpu_free_before_load_bytes);
+      }
+      if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
+        min_free = std::min(min_free, (size_t)stats.gpu_free_after_load_bytes);
+      }
+      stats.gpu_peak_usage_mb = (double)(total - min_free) / 1024.0 / 1024.0;
+    }
+  }
+#endif
+
+  printf("\n");
+
   double decode_ms =
       (double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
+  printf(
+      "Prefill: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n",
+      num_prompt_tokens,
+      prefill_ms,
+      num_prompt_tokens / prefill_ms * stats.SCALING_FACTOR_UNITS_PER_SECOND);
   printf(
       "Decode: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n",
       num_generated,
       decode_ms,
-      num_generated * 1000.0 / decode_ms);
+      num_generated / decode_ms * stats.SCALING_FACTOR_UNITS_PER_SECOND);
   printf("Prompt tokens: %" PRId64 "\n", num_prompt_tokens);
 
-#ifdef EXECUTORCH_BUILD_CUDA
-  // GPU memory after generation
-  cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
-  stats.gpu_free_after_generate_bytes = gpu_free_bytes;
-  stats.gpu_peak_usage_mb =
-      (stats.gpu_total_bytes - gpu_free_bytes) / 1024.0 / 1024.0;
-#endif
+  // Structured stats report (matches stats.h print_report)
+  printf("PyTorchObserver %s\n", llm::stats_to_json_string(stats).c_str());
+
+  double ms_per_s = stats.SCALING_FACTOR_UNITS_PER_SECOND;
 
-  llm::print_report(stats);
+  double model_load_s =
+      (double)(stats.model_load_end_ms - stats.model_load_start_ms) / ms_per_s;
+  double inference_time_ms =
+      (double)(stats.inference_end_ms - stats.inference_start_ms);
+  double prompt_eval_ms =
+      (double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
+  double eval_ms = (double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
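+  // TTFT spans prefill: the first token becomes available when prefill ends.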
+  double ttft_s =
+      (double)(stats.first_token_ms - stats.inference_start_ms) / ms_per_s;
+  double sampling_s = (double)stats.aggregate_sampling_time_ms / ms_per_s;
+
+  printf("\n");
+  printf(
+      "\tPrompt Tokens: %" PRId64 " Generated Tokens: %" PRId64 "\n",
+      stats.num_prompt_tokens,
+      stats.num_generated_tokens);
+  printf("\tModel Load Time:\t\t%f (seconds)\n", model_load_s);
+  printf(
+      "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)\n",
+      inference_time_ms / ms_per_s,
+      stats.num_generated_tokens / inference_time_ms * ms_per_s);
+  printf(
+      "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)\n",
+      prompt_eval_ms / ms_per_s,
+      stats.num_prompt_tokens / prompt_eval_ms * ms_per_s);
+  printf(
+      "\t\tGenerated %" PRId64
+      " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)\n",
+      stats.num_generated_tokens,
+      eval_ms / ms_per_s,
+      stats.num_generated_tokens / eval_ms * ms_per_s);
+  printf("\tTime to first generated token:\t%f (seconds)\n", ttft_s);
+  printf(
+      "\tSampling time over %" PRId64 " tokens:\t%f (seconds)\n",
+      stats.num_prompt_tokens + stats.num_generated_tokens,
+      sampling_s);
+
+  // GPU memory reporting
+  if (stats.gpu_total_bytes != static_cast<uint64_t>(-1)) {
+    printf(
+        "\tGPU total memory: %.2f MB\n",
+        stats.gpu_total_bytes / 1024.0 / 1024.0);
+    if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
+      printf(
+          "\tGPU free before load: %.2f MB\n",
+          stats.gpu_free_before_load_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
+      printf(
+          "\tGPU free after load: %.2f MB\n",
+          stats.gpu_free_after_load_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_free_after_generate_bytes != static_cast<uint64_t>(-1)) {
+      printf(
+          "\tGPU free after generate: %.2f MB\n",
+          stats.gpu_free_after_generate_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_peak_usage_mb >= 0.0) {
+      printf("\tGPU peak usage: %.2f MB\n", stats.gpu_peak_usage_mb);
+    }
+  }
 
   return 0;
 }
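For reference, the checkpoint-and-minimum-free pattern used above can be exercised in isolation. The sketch below is illustrative, not part of the runner: `GpuMemTracker` and its checkpoint labels are hypothetical names, and only `cudaMemGetInfo` is assumed from the CUDA runtime API. Note that `cudaMemGetInfo` reports device-wide free memory, so readings also reflect allocations made by other processes on the same GPU.

```cpp
// Build (assumes a CUDA toolkit): nvcc -o mem_tracker mem_tracker.cu
#include <cuda_runtime.h>

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the runner's approach: sample free/total
// GPU memory at checkpoints, keep the minimum free reading, and report
// peak usage as total minus that minimum.
struct GpuMemTracker {
  size_t total_bytes = 0;
  size_t min_free_bytes = SIZE_MAX;

  void sample(const char* checkpoint) {
    size_t free = 0, total = 0;
    // Degrade gracefully (e.g. no CUDA device): skip failed readings.
    if (cudaMemGetInfo(&free, &total) != cudaSuccess) {
      return;
    }
    total_bytes = total;
    min_free_bytes = std::min(min_free_bytes, free);
    printf("[%s] free: %.2f MB\n", checkpoint, free / 1024.0 / 1024.0);
  }

  // Returns -1.0 if no sample ever succeeded, matching the runner's
  // "field unset" sentinel convention.
  double peak_usage_mb() const {
    return min_free_bytes == SIZE_MAX
        ? -1.0
        : (double)(total_bytes - min_free_bytes) / 1024.0 / 1024.0;
  }
};

int main() {
  GpuMemTracker tracker;
  tracker.sample("before load");
  // ... load model, run prefill and decode ...
  tracker.sample("after generate");
  printf("GPU peak usage: %.2f MB\n", tracker.peak_usage_mb());
  return 0;
}
```

One caveat on the design: because only the minimum free reading is kept, transient peaks between checkpoints are invisible; the reported peak is a lower bound on true peak usage.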