Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions tests/perf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,9 @@ ncnn_add_layer_perf(BinaryOp)
ncnn_add_layer_perf(Concat)
ncnn_add_layer_perf(Sigmoid)
ncnn_add_layer_perf(BatchNorm)

# SDPA perf tests (decode and prefill phases).
# Only registered when WITH_LAYER_sdpa evaluates true, so builds without the
# SDPA layer do not try to build these targets.
# NOTE(review): presumably ncnn_add_perf(<name>) builds perf_<name>.cpp
# (perf_sdpa_decode.cpp / perf_sdpa_prefill.cpp) - confirm against the macro.
if(WITH_LAYER_sdpa)
ncnn_add_perf(sdpa_decode)
ncnn_add_perf(sdpa_prefill)
endif()
84 changes: 84 additions & 0 deletions tests/perf/perf_sdpa_decode.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// Copyright 2026 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "perfutil.h"

// Benchmark the SDPA layer in the decode phase: a single query token
// (src_seqlen=1) attends over a kv cache holding past_seqlen earlier tokens
// plus the one current token.
//
//   embed_dim    innermost dimension of q/k (also used for v here)
//   num_heads    outermost dimension of q
//   num_groups   outermost dimension of k/v and of the cached k/v
//   past_seqlen  number of rows in the past_k / past_v cache inputs
//
// NOTE(review): past_seqlen == 0 produces past_k/past_v mats with a
// zero-length dimension - confirm PerfMat and the layer accept that.
static void perf_sdpa_decode(int embed_dim, int num_heads, int num_groups, int past_seqlen)
{
    const int src_seqlen = 1;
    const int cur_seqlen = 1;
    const int out_embed_dim = embed_dim;

    ncnn::ParamDict pd;
    pd.set(5, 0);   // attn_mask = 0
    pd.set(6, 0.f); // scale = 0 (default 1/sqrt(embed_dim))
    pd.set(7, 1);   // kv_cache = 1

    // no weight blobs are supplied for this layer
    std::vector<ncnn::Mat> weights(0);

    // inputs: q, k, v, past_k, past_v
    std::vector<ncnn::Mat> inputs(5);
    inputs[0] = PerfMat(embed_dim, src_seqlen, num_heads);       // q
    inputs[1] = PerfMat(embed_dim, cur_seqlen, num_groups);      // cur_k
    inputs[2] = PerfMat(out_embed_dim, cur_seqlen, num_groups);  // cur_v
    inputs[3] = PerfMat(embed_dim, past_seqlen, num_groups);     // past_k
    inputs[4] = PerfMat(out_embed_dim, past_seqlen, num_groups); // past_v

    // NOTE(review): 3 is presumably the output blob count (attention output
    // plus updated k/v cache) - confirm against perf_layer's signature.
    perf_layer("SDPA", pd, weights, inputs, 3,
               "embed=%d heads=%d groups=%d past=%d",
               embed_dim, num_heads, num_groups, past_seqlen);
}

// Runs perf_sdpa_decode over a fixed table of decode-phase configurations.
int main()
{
    // one row per benchmark run, executed top to bottom
    struct DecodeCase
    {
        int embed_dim;
        int num_heads;
        int num_groups;
        int past_seqlen;
    };

    // NOTE(review): perf_sdpa_decode passes embed_dim as the innermost mat
    // dimension and num_heads as the outermost, so embed_dim looks like a
    // per-head width rather than the model hidden size - confirm that 4096
    // is the intended value for the "7B scale" rows.
    static const DecodeCase cases[] = {
        // small model, various cache lengths
        {128, 4, 4, 0},
        {128, 4, 4, 128},
        {128, 4, 4, 512},
        {128, 4, 4, 1024},
        {128, 4, 4, 2048},

        // medium model
        {512, 8, 8, 0},
        {512, 8, 8, 128},
        {512, 8, 8, 512},
        {512, 8, 8, 1024},
        {512, 8, 8, 2048},

        // larger model (e.g., 7B scale)
        {4096, 32, 32, 0},
        {4096, 32, 32, 128},
        {4096, 32, 32, 512},
        {4096, 32, 32, 1024},
        {4096, 32, 32, 2048},
        {4096, 32, 32, 4096},
        {4096, 32, 32, 8192},

        // GQA: num_groups < num_heads
        {4096, 32, 4, 128},
        {4096, 32, 4, 512},
        {4096, 32, 4, 1024},
        {4096, 32, 4, 2048},
        {4096, 32, 4, 4096},

        // MQA: num_groups = 1
        {4096, 32, 1, 128},
        {4096, 32, 1, 512},
        {4096, 32, 1, 1024},
        {4096, 32, 1, 2048},
        {4096, 32, 1, 4096},

        // very large context lengths
        {4096, 32, 32, 16384},
        {4096, 32, 32, 32768},
        {4096, 32, 4, 16384},
        {4096, 32, 4, 32768},
    };

    const int case_count = (int)(sizeof(cases) / sizeof(cases[0]));
    for (int i = 0; i < case_count; i++)
    {
        const DecodeCase& c = cases[i];
        perf_sdpa_decode(c.embed_dim, c.num_heads, c.num_groups, c.past_seqlen);
    }

    return 0;
}
89 changes: 89 additions & 0 deletions tests/perf/perf_sdpa_prefill.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// Copyright 2026 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "perfutil.h"

// prefill phase: larger src_seqlen, no kv_cache (past_seqlen=0)
static void perf_sdpa_prefill(int embed_dim, int num_heads, int num_groups, int src_seqlen)
{
const int cur_seqlen = src_seqlen; // in prefill, cur_seqlen == src_seqlen
const int out_embed_dim = embed_dim;

ncnn::ParamDict pd;
pd.set(5, 0); // attn_mask = 0
pd.set(6, 0.f); // scale = 0 (default 1/sqrt(embed_dim))
pd.set(7, 0); // kv_cache = 0 (no cache in prefill)

std::vector<ncnn::Mat> weights(0);

// inputs: q, k, v
std::vector<ncnn::Mat> inputs(3);
inputs[0] = PerfMat(embed_dim, src_seqlen, num_heads); // q
inputs[1] = PerfMat(embed_dim, cur_seqlen, num_groups); // k
inputs[2] = PerfMat(out_embed_dim, cur_seqlen, num_groups); // v

perf_layer("SDPA", pd, weights, inputs, 1,
"embed=%d heads=%d groups=%d seqlen=%d",
embed_dim, num_heads, num_groups, src_seqlen);
}

// Runs perf_sdpa_prefill over a fixed table of prefill-phase configurations.
int main()
{
    // one row per benchmark run, executed top to bottom
    struct PrefillCase
    {
        int embed_dim;
        int num_heads;
        int num_groups;
        int src_seqlen;
    };

    // NOTE(review): perf_sdpa_prefill builds q as
    // PerfMat(embed_dim, src_seqlen, num_heads); if that allocates
    // embed_dim * src_seqlen * num_heads elements, the largest rows below
    // (4096 x 32768 x 32) would be extremely memory-hungry - confirm the
    // sizes are intended.
    static const PrefillCase cases[] = {
        // small model, various sequence lengths
        {128, 4, 4, 16},
        {128, 4, 4, 32},
        {128, 4, 4, 64},
        {128, 4, 4, 128},
        {128, 4, 4, 256},
        {128, 4, 4, 512},

        // medium model
        {512, 8, 8, 16},
        {512, 8, 8, 32},
        {512, 8, 8, 64},
        {512, 8, 8, 128},
        {512, 8, 8, 256},
        {512, 8, 8, 512},
        {512, 8, 8, 1024},

        // larger model (e.g., 7B scale)
        {4096, 32, 32, 16},
        {4096, 32, 32, 32},
        {4096, 32, 32, 64},
        {4096, 32, 32, 128},
        {4096, 32, 32, 256},
        {4096, 32, 32, 512},
        {4096, 32, 32, 1024},
        {4096, 32, 32, 2048},
        {4096, 32, 32, 4096},

        // GQA: num_groups < num_heads
        {4096, 32, 4, 128},
        {4096, 32, 4, 256},
        {4096, 32, 4, 512},
        {4096, 32, 4, 1024},
        {4096, 32, 4, 2048},
        {4096, 32, 4, 4096},

        // MQA: num_groups = 1
        {4096, 32, 1, 128},
        {4096, 32, 1, 256},
        {4096, 32, 1, 512},
        {4096, 32, 1, 1024},
        {4096, 32, 1, 2048},
        {4096, 32, 1, 4096},

        // very long sequences
        {4096, 32, 32, 8192},
        {4096, 32, 32, 16384},
        {4096, 32, 32, 32768},
        {4096, 32, 4, 8192},
        {4096, 32, 4, 16384},
        {4096, 32, 4, 32768},
    };

    const int case_count = (int)(sizeof(cases) / sizeof(cases[0]));
    for (int i = 0; i < case_count; i++)
    {
        const PrefillCase& c = cases[i];
        perf_sdpa_prefill(c.embed_dim, c.num_heads, c.num_groups, c.src_seqlen);
    }

    return 0;
}