
Commit e84a418

Authored by pytorchbot, digantdesai, and Gasoonjia
Add structured stats reporting and GPU memory tracking to Qwen3.5 MoE runner (#19228)
This PR was created by the merge bot to help merge the original PR into the main branch.

ghstack PR number: #19190 by @digantdesai
^ Please use this as the source of truth for the PR details, comments, and reviews
ghstack PR base: https://github.com/pytorch/executorch/tree/gh/digantdesai/53/base
ghstack PR head: https://github.com/pytorch/executorch/tree/gh/digantdesai/53/head
Merge bot PR base: https://github.com/pytorch/executorch/tree/gh/digantdesai/51/orig
Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/digantdesai/53/orig
@diff-train-skip-merge

---------

Co-authored-by: Digant Desai <digantdesai@meta.com>
Co-authored-by: Gasoonjia <gasoonjia@icloud.com>
1 parent 9c56093 commit e84a418

1 file changed

Lines changed: 113 additions & 16 deletions

File tree

examples/models/qwen3_5_moe/main.cpp

@@ -18,6 +18,7 @@
 #include <executorch/runtime/platform/log.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
 
+#include <algorithm>
 #include <cinttypes>
 #include <fstream>
 #include <string>
@@ -130,6 +131,17 @@ int main(int argc, char** argv) {
     return 1;
   }
 
+  // GPU memory: before load
+  {
+    size_t free = 0, total = 0;
+    if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
+      stats.gpu_total_bytes = total;
+      stats.gpu_free_before_load_bytes = free;
+    }
+  }
+
+  stats.model_load_start_ms = llm::time_in_ms();
+
   // Create Module with share_memory_arenas=true so prefill and decode
   // share mutable buffers (KV cache, conv_state, recurrent_state).
   std::vector<std::string> data_files;
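Note on the block above: free and total device memory are sampled with cudaMemGetInfo before the model loads, and the stats fields are only written when the call returns cudaSuccess, so a failed query leaves them at their "unset" sentinel. A minimal sketch of the same sampling pattern, assuming a CUDA build where cuda_runtime_api.h is available; the helper name is illustrative and not part of the runner:

#include <cstdint>
#include <utility>

#include <cuda_runtime_api.h>

// Returns {free_bytes, total_bytes}, or {UINT64_MAX, UINT64_MAX} when the
// query fails -- the same "unset" sentinel the runner later checks with
// static_cast<uint64_t>(-1).
static std::pair<uint64_t, uint64_t> sample_gpu_memory() {
  size_t free = 0, total = 0;
  if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
    return {static_cast<uint64_t>(free), static_cast<uint64_t>(total)};
  }
  return {static_cast<uint64_t>(-1), static_cast<uint64_t>(-1)};
}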
@@ -212,11 +224,13 @@
 
   stats.model_load_end_ms = llm::time_in_ms();
 
-#ifdef EXECUTORCH_BUILD_CUDA
-  // GPU memory after load
-  cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
-  stats.gpu_free_after_load_bytes = gpu_free_bytes;
-#endif
+  // GPU memory: after load
+  {
+    size_t free = 0, total = 0;
+    if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
+      stats.gpu_free_after_load_bytes = free;
+    }
+  }
 
   // Get EOS ids
   auto eos_ids = llm::get_eos_ids(tokenizer.get(), module.get());
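The before-load and after-load samples also make it possible to estimate the model's resident footprint on the device. The snippet below is illustrative only (not part of the diff); it runs in the runner's context and assumes both samples succeeded and that no other process allocated GPU memory in between:

// Approximate device memory consumed by loading the model (MB).
if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1) &&
    stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
  double model_footprint_mb =
      (double)(stats.gpu_free_before_load_bytes -
               stats.gpu_free_after_load_bytes) /
      1024.0 / 1024.0;
  printf("Approx. model footprint: %.2f MB\n", model_footprint_mb);
}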
@@ -263,6 +277,9 @@
       from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float);
 #endif
 
+  stats.inference_start_ms = llm::time_in_ms();
+  stats.num_prompt_tokens = num_prompt_tokens;
+
   // ---------------------------------------------------------------
   // Prefill
   // ---------------------------------------------------------------
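Recording inference_start_ms just before prefill brackets the windows that the report added later in this commit derives its numbers from. A rough map of those windows, using only the field names that appear in this file:

// model_load_start_ms .. model_load_end_ms   -> model load time
// inference_start_ms  .. prompt_eval_end_ms  -> prefill (prompt evaluation)
// prompt_eval_end_ms  .. inference_end_ms    -> decode (token generation)
// first_token_ms - inference_start_ms        -> time to first generated token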
@@ -306,14 +323,14 @@
   cur_token = read_token(prefill_outputs[0].toTensor());
 
   stats.prompt_eval_end_ms = llm::time_in_ms();
-
+  stats.first_token_ms = stats.prompt_eval_end_ms;
   double prefill_ms =
       (double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
   printf(
       "Prefill: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n",
       num_prompt_tokens,
       prefill_ms,
-      num_prompt_tokens * 1000.0 / prefill_ms);
+      num_prompt_tokens / prefill_ms * stats.SCALING_FACTOR_UNITS_PER_SECOND);
 
 #ifdef EXECUTORCH_BUILD_CUDA
   // Synchronize CUDA device to ensure prefill's writes to shared mutable
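The throughput expression changes from a hard-coded "* 1000.0 / prefill_ms" to the same computation written via stats.SCALING_FACTOR_UNITS_PER_SECOND, which names the ms-to-seconds conversion instead of baking in 1000. Assuming that constant is 1000 (milliseconds per second), the two forms are algebraically identical; a quick worked example, with purely illustrative numbers:

// tokens / elapsed_ms * (units per second) == tokens per second
// e.g. 128 prompt tokens prefilled in 250.0 ms:
//   128 / 250.0 * 1000.0 = 512.0 tok/s
double tok_per_s =
    num_prompt_tokens / prefill_ms * stats.SCALING_FACTOR_UNITS_PER_SECOND;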
@@ -380,24 +397,104 @@
   int64_t num_generated = pos - num_prompt_tokens;
   stats.num_generated_tokens = num_generated;
 
+  // GPU memory: after generate + peak usage
+  {
+    size_t free = 0, total = 0;
+    if (cudaMemGetInfo(&free, &total) == cudaSuccess) {
+      stats.gpu_free_after_generate_bytes = free;
+      size_t min_free = free;
+      if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
+        min_free = std::min(min_free, (size_t)stats.gpu_free_before_load_bytes);
+      }
+      if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
+        min_free = std::min(min_free, (size_t)stats.gpu_free_after_load_bytes);
+      }
+      stats.gpu_peak_usage_mb = (double)(total - min_free) / 1024.0 / 1024.0;
+    }
+  }
+
+  printf("\n");
+
   double decode_ms =
       (double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
+  printf(
+      "Prefill: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n",
+      num_prompt_tokens,
+      prefill_ms,
+      num_prompt_tokens / prefill_ms * stats.SCALING_FACTOR_UNITS_PER_SECOND);
   printf(
       "Decode: %" PRId64 " tokens in %.1f ms (%.1f tok/s)\n",
       num_generated,
       decode_ms,
-      num_generated * 1000.0 / decode_ms);
+      num_generated / decode_ms * stats.SCALING_FACTOR_UNITS_PER_SECOND);
   printf("Prompt tokens: %" PRId64 "\n", num_prompt_tokens);
 
-#ifdef EXECUTORCH_BUILD_CUDA
-  // GPU memory after generation
-  cudaMemGetInfo(&gpu_free_bytes, &gpu_total_bytes);
-  stats.gpu_free_after_generate_bytes = gpu_free_bytes;
-  stats.gpu_peak_usage_mb =
-      (stats.gpu_total_bytes - gpu_free_bytes) / 1024.0 / 1024.0;
-#endif
+  // Structured stats report (matches stats.h print_report)
+  printf("PyTorchObserver %s\n", llm::stats_to_json_string(stats).c_str());
+
+  double ms_per_s = stats.SCALING_FACTOR_UNITS_PER_SECOND;
 
-  llm::print_report(stats);
+  double model_load_s =
+      (double)(stats.model_load_end_ms - stats.model_load_start_ms) / ms_per_s;
+  double inference_time_ms =
+      (double)(stats.inference_end_ms - stats.inference_start_ms);
+  double prompt_eval_ms =
+      (double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
+  double eval_ms = (double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
+  double ttft_s =
+      (double)(stats.first_token_ms - stats.inference_start_ms) / ms_per_s;
+  double sampling_s = (double)stats.aggregate_sampling_time_ms / ms_per_s;
+
+  printf("\n");
+  printf(
+      "\tPrompt Tokens: %" PRId64 " Generated Tokens: %" PRId64 "\n",
+      stats.num_prompt_tokens,
+      stats.num_generated_tokens);
+  printf("\tModel Load Time:\t\t%f (seconds)\n", model_load_s);
+  printf(
+      "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)\n",
+      inference_time_ms / ms_per_s,
+      stats.num_generated_tokens / inference_time_ms * ms_per_s);
+  printf(
+      "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)\n",
+      prompt_eval_ms / ms_per_s,
+      stats.num_prompt_tokens / prompt_eval_ms * ms_per_s);
+  printf(
+      "\t\tGenerated %" PRId64
+      " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)\n",
+      stats.num_generated_tokens,
+      eval_ms / ms_per_s,
+      stats.num_generated_tokens / eval_ms * ms_per_s);
+  printf("\tTime to first generated token:\t%f (seconds)\n", ttft_s);
+  printf(
+      "\tSampling time over %" PRId64 " tokens:\t%f (seconds)\n",
+      stats.num_prompt_tokens + stats.num_generated_tokens,
+      sampling_s);
+
+  // GPU memory reporting
+  if (stats.gpu_total_bytes != static_cast<uint64_t>(-1)) {
+    printf(
+        "\tGPU total memory: %.2f MB\n",
+        stats.gpu_total_bytes / 1024.0 / 1024.0);
+    if (stats.gpu_free_before_load_bytes != static_cast<uint64_t>(-1)) {
+      printf(
+          "\tGPU free before load: %.2f MB\n",
+          stats.gpu_free_before_load_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_free_after_load_bytes != static_cast<uint64_t>(-1)) {
+      printf(
+          "\tGPU free after load: %.2f MB\n",
+          stats.gpu_free_after_load_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_free_after_generate_bytes != static_cast<uint64_t>(-1)) {
+      printf(
+          "\tGPU free after generate: %.2f MB\n",
+          stats.gpu_free_after_generate_bytes / 1024.0 / 1024.0);
+    }
+    if (stats.gpu_peak_usage_mb >= 0.0) {
+      printf("\tGPU peak usage: %.2f MB\n", stats.gpu_peak_usage_mb);
+    }
+  }
 
   return 0;
 }
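The peak-usage figure reported above is derived from the three free-memory samples: the peak is the device total minus the smallest free value observed at any sample point, so it is coarse by design and can miss transient highs between samples. A minimal standalone sketch of that calculation, assuming all three samples succeeded (the runner itself skips values still at the unset sentinel):

#include <algorithm>
#include <cstdint>

double peak_gpu_usage_mb(
    uint64_t total_bytes,
    uint64_t free_before_load,
    uint64_t free_after_load,
    uint64_t free_after_generate) {
  uint64_t min_free =
      std::min({free_before_load, free_after_load, free_after_generate});
  return static_cast<double>(total_bytes - min_free) / 1024.0 / 1024.0;
}

The PyTorchObserver-prefixed line emits the same stats as JSON, presumably so benchmark tooling can pick the structured record out of the log without parsing the tab-formatted human-readable report.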
