ModelTC
diff --git a/‎…est_ppl_int8kv_flash_decoding_diverse.py‎ ‎…est_ppl_int8kv_flash_decoding_diverse.py‎unit_tests/models/llama/test_ppl_int8kv_flash_decoding_diverse.py renamed to unit_tests/common/basemodel/triton_kernel/att/decode_att/int8kv/test_ppl_int8kv_flash_decoding_diverse.py
Lines changed: 10 additions & 10 deletions b/‎…est_ppl_int8kv_flash_decoding_diverse.py‎ ‎…est_ppl_int8kv_flash_decoding_diverse.py‎unit_tests/models/llama/test_ppl_int8kv_flash_decoding_diverse.py renamed to unit_tests/common/basemodel/triton_kernel/att/decode_att/int8kv/test_ppl_int8kv_flash_decoding_diverse.py
Lines changed: 10 additions & 10 deletions
diff --git a/‎…_int8kv_flash_decoding_diverse_stage1.py‎ ‎…_int8kv_flash_decoding_diverse_stage1.py‎unit_tests/models/llama/test_ppl_int8kv_flash_decoding_diverse_stage1.py renamed to unit_tests/common/basemodel/triton_kernel/att/decode_att/int8kv/test_ppl_int8kv_flash_decoding_diverse_stage1.py
Lines changed: 4 additions & 2 deletions b/‎…_int8kv_flash_decoding_diverse_stage1.py‎ ‎…_int8kv_flash_decoding_diverse_stage1.py‎unit_tests/models/llama/test_ppl_int8kv_flash_decoding_diverse_stage1.py renamed to unit_tests/common/basemodel/triton_kernel/att/decode_att/int8kv/test_ppl_int8kv_flash_decoding_diverse_stage1.py
Lines changed: 4 additions & 2 deletions
diff --git a/‎…_int8kv_flash_decoding_diverse_stage2.py‎ ‎…_int8kv_flash_decoding_diverse_stage2.py‎unit_tests/models/llama/test_ppl_int8kv_flash_decoding_diverse_stage2.py renamed to unit_tests/common/basemodel/triton_kernel/att/decode_att/int8kv/test_ppl_int8kv_flash_decoding_diverse_stage2.py
Lines changed: 4 additions & 1 deletion b/‎…_int8kv_flash_decoding_diverse_stage2.py‎ ‎…_int8kv_flash_decoding_diverse_stage2.py‎unit_tests/models/llama/test_ppl_int8kv_flash_decoding_diverse_stage2.py renamed to unit_tests/common/basemodel/triton_kernel/att/decode_att/int8kv/test_ppl_int8kv_flash_decoding_diverse_stage2.py
Lines changed: 4 additions & 1 deletion
diff --git a/‎…_int8kv_flash_decoding_diverse_stage3.py‎ ‎…_int8kv_flash_decoding_diverse_stage3.py‎unit_tests/models/llama/test_ppl_int8kv_flash_decoding_diverse_stage3.py renamed to unit_tests/common/basemodel/triton_kernel/att/decode_att/int8kv/test_ppl_int8kv_flash_decoding_diverse_stage3.py
Lines changed: 7 additions & 2 deletions b/‎…_int8kv_flash_decoding_diverse_stage3.py‎ ‎…_int8kv_flash_decoding_diverse_stage3.py‎unit_tests/models/llama/test_ppl_int8kv_flash_decoding_diverse_stage3.py renamed to unit_tests/common/basemodel/triton_kernel/att/decode_att/int8kv/test_ppl_int8kv_flash_decoding_diverse_stage3.py
Lines changed: 7 additions & 2 deletions
diff --git a/‎…ama/test_context_flashattention_nopad.py‎ ‎…tt/test_context_flashattention_nopad1.py‎unit_tests/models/llama/test_context_flashattention_nopad.py renamed to unit_tests/common/basemodel/triton_kernel/att/prefill_att/test_context_flashattention_nopad1.py
Lines changed: 15 additions & 12 deletions b/‎…ama/test_context_flashattention_nopad.py‎ ‎…tt/test_context_flashattention_nopad1.py‎unit_tests/models/llama/test_context_flashattention_nopad.py renamed to unit_tests/common/basemodel/triton_kernel/att/prefill_att/test_context_flashattention_nopad1.py
Lines changed: 15 additions & 12 deletions
diff --git a/‎…dels/deepseek2/test_destindex_copy_kv.py‎ ‎…el/kv_copy/test_mla_destindex_copy_kv.py‎unit_tests/models/deepseek2/test_destindex_copy_kv.py renamed to unit_tests/common/basemodel/triton_kernel/kv_copy/test_mla_destindex_copy_kv.py
Lines changed: 1 addition & 1 deletion b/‎…dels/deepseek2/test_destindex_copy_kv.py‎ ‎…el/kv_copy/test_mla_destindex_copy_kv.py‎unit_tests/models/deepseek2/test_destindex_copy_kv.py renamed to unit_tests/common/basemodel/triton_kernel/kv_copy/test_mla_destindex_copy_kv.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎…els/deepseek2/test_gqa_flash_decoding.py‎ ‎…tt/decode_att/test_gqa_flash_decoding.py‎unit_tests/models/deepseek2/test_gqa_flash_decoding.py renamed to unit_tests/common/basemodel/triton_kernel/mla_att/decode_att/test_gqa_flash_decoding.py
Lines changed: 4 additions & 7 deletions b/‎…els/deepseek2/test_gqa_flash_decoding.py‎ ‎…tt/decode_att/test_gqa_flash_decoding.py‎unit_tests/models/deepseek2/test_gqa_flash_decoding.py renamed to unit_tests/common/basemodel/triton_kernel/mla_att/decode_att/test_gqa_flash_decoding.py
Lines changed: 4 additions & 7 deletions
diff --git a/‎unit_tests/common/basemodel/triton_kernel/test_atomic_event.py‎
Lines changed: 4 additions & 4 deletions b/‎unit_tests/common/basemodel/triton_kernel/test_atomic_event.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎unit_tests/common/basemodel/triton_kernel/test_gen_sampling_params.py‎
Lines changed: 1 addition & 0 deletions b/‎unit_tests/common/basemodel/triton_kernel/test_gen_sampling_params.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎…models/deepseek2/test_repack_kv_index.py‎ ‎…el/triton_kernel/test_repack_kv_index.py‎unit_tests/models/deepseek2/test_repack_kv_index.py renamed to unit_tests/common/basemodel/triton_kernel/test_repack_kv_index.py
Lines changed: 1 addition & 1 deletion b/‎…models/deepseek2/test_repack_kv_index.py‎ ‎…el/triton_kernel/test_repack_kv_index.py‎unit_tests/models/deepseek2/test_repack_kv_index.py renamed to unit_tests/common/basemodel/triton_kernel/test_repack_kv_index.py
Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,7 @@
 import pytest
+
+pytest.skip(reason="need install lightllmKernel", allow_module_level=True)
+
 import torch
 from lightllm.utils.light_utils import light_ops
 
@@ -21,15 +24,15 @@ class MockInferState:
     def __init__(
         self,
         batch_size,
-        max_len_in_batch,
+        max_kv_seq_len,
         req_to_tokens,
         b_req_idx,
         b_seq_len,
         b_shared_seq_len=None,
         b_mark_shared_group=None,
     ):
         self.batch_size = batch_size
-        self.max_len_in_batch = max_len_in_batch
+        self.max_kv_seq_len = max_kv_seq_len
         self.req_manager = MockReqManager(req_to_tokens)
         self.b_req_idx = b_req_idx
         self.b_seq_len = b_seq_len
@@ -44,10 +47,11 @@ def test_token_decode_attention_flash_decoding_diverse_vs_baseline(shared_seq_le
     测试 ppl_int8kv_flash_decoding_diverse 的 token_decode_attention_flash_decoding
     与 ppl_int8kv_flash_decoding (baseline) 的对比。
     """
-    from lightllm.models.llama.triton_kernel.ppl_int8kv_flash_decoding_diverse import (
+
+    from lightllm.common.basemodel.triton_kernel.att.decode_att.int8kv.ppl_int8kv_flash_decoding_diverse import (
         token_decode_attention_flash_decoding as diverse_attention,
     )
-    from lightllm.models.llama.triton_kernel.ppl_int8kv_flash_decoding import (
+    from lightllm.common.basemodel.triton_kernel.att.decode_att.int8kv.ppl_int8kv_flash_decoding import (
         token_decode_attention_flash_decoding as baseline_attention,
     )
 
@@ -87,7 +91,7 @@ def test_token_decode_attention_flash_decoding_diverse_vs_baseline(shared_seq_le
     # 创建 baseline 的 infer_state (不需要 b_shared_seq_len)
     baseline_infer_state = MockInferState(
         batch_size=batch_size,
-        max_len_in_batch=seq_len,
+        max_kv_seq_len=seq_len,
         req_to_tokens=req_to_tokens,
         b_req_idx=b_req_idx,
         b_seq_len=b_seq_len,
@@ -96,7 +100,7 @@ def test_token_decode_attention_flash_decoding_diverse_vs_baseline(shared_seq_le
     # 创建 diverse 的 infer_state
     diverse_infer_state = MockInferState(
         batch_size=batch_size,
-        max_len_in_batch=seq_len,
+        max_kv_seq_len=seq_len,
         req_to_tokens=req_to_tokens,
         b_req_idx=b_req_idx,
         b_seq_len=b_seq_len,
@@ -108,8 +112,6 @@ def test_token_decode_attention_flash_decoding_diverse_vs_baseline(shared_seq_le
     baseline_out = baseline_attention(
         q=q.clone(),
         infer_state=baseline_infer_state,
-        q_head_num=num_heads,
-        head_dim=head_dim,
         cache_k=cache_k,
         cache_k_scale=cache_k_scale,
         cache_v=cache_v,
@@ -120,8 +122,6 @@ def test_token_decode_attention_flash_decoding_diverse_vs_baseline(shared_seq_le
     diverse_out = diverse_attention(
         q=q.clone(),
         infer_state=diverse_infer_state,
-        q_head_num=num_heads,
-        head_dim=head_dim,
         cache_k=cache_k,
         cache_k_scale=cache_k_scale,
         cache_v=cache_v,
 
@@ -1,6 +1,8 @@
 import pytest
 import torch
-from lightllm.models.llama.triton_kernel.ppl_int8kv_flash_decoding_diverse_stage1 import flash_decode_stage1
+from lightllm.common.basemodel.triton_kernel.att.decode_att.int8kv.ppl_int8kv_flash_decoding_diverse_stage1 import (
+    flash_decode_stage1,
+)
 
 
 @pytest.fixture
@@ -81,7 +83,7 @@ def test_flash_decode_stage1_execution(setup_tensors):
     new_k = k.to(q.dtype)
     new_v = v.to(q.dtype)
 
-    from lightllm.models.llama.triton_kernel.gqa_flash_decoding_stage1 import (
+    from lightllm.common.basemodel.triton_kernel.att.decode_att.gqa.flash_decoding.gqa_flash_decoding_stage1 import (
         flash_decode_stage1 as gqa_flash_decode_stage1,
     )
 
 
@@ -1,4 +1,7 @@
 import pytest
+
+pytest.skip(reason="need install lightllmkernel", allow_module_level=True)
+
 import torch
 from lightllm.utils.light_utils import light_ops
 
@@ -94,7 +97,7 @@ def test_flash_decode_stage2_execution(shared_seq_len):
     b_seq_len = setup_tensors["b_seq_len"] - setup_tensors["b_shared_seq_len"]
     req_to_tokens = setup_tensors["Req_to_tokens"][:, setup_tensors["b_shared_seq_len"][0].item() :]
 
-    from lightllm.models.llama.triton_kernel.gqa_flash_decoding_stage1 import (
+    from lightllm.common.basemodel.triton_kernel.att.decode_att.gqa.flash_decoding.gqa_flash_decoding_stage1 import (
         flash_decode_stage1 as gqa_flash_decode_stage1,
     )
 
 
@@ -1,6 +1,8 @@
 import pytest
 import torch
-from lightllm.models.llama.triton_kernel.ppl_int8kv_flash_decoding_diverse_stage3 import flash_diverse_decode_stage3
+from lightllm.common.basemodel.triton_kernel.att.decode_att.int8kv.ppl_int8kv_flash_decoding_diverse_stage3 import (
+    flash_diverse_decode_stage3,
+)
 
 
 @pytest.mark.parametrize(
@@ -23,7 +25,10 @@ def test_flash_diverse_decode_stage3(batch, head_num, seq_len, shared_seq_len, b
     flash_diverse_decode_stage3(mid_out, mid_out_logexpsum, B_Seqlen, b_shared_seq_len, out, block_seq)
 
     true_out = torch.zeros_like(out)
-    from lightllm.models.llama.triton_kernel.flash_decoding_stage2 import flash_decode_stage2
+
+    from lightllm.common.basemodel.triton_kernel.att.decode_att.mha.flash_decoding.flash_decoding_stage2 import (
+        flash_decode_stage2,
+    )
 
     flash_decode_stage2(mid_out, mid_out_logexpsum, B_Seqlen, true_out, block_seq)
 
 
@@ -5,12 +5,11 @@
 import torch.nn.functional as F
 import flashinfer
 from lightllm.utils.log_utils import init_logger
-from lightllm.models.llama.triton_kernel.context_flashattention_nopad import (
+from lightllm.common.basemodel.triton_kernel.att.prefill_att.context_flashattention_nopad import (
     context_attention_fwd,
     context_attention_fwd_no_prompt_cache,
 )
 from lightllm.models.llama.infer_struct import LlamaInferStateInfo
-from lightllm.common.req_manager import ReqManager
 
 logger = init_logger(__name__)
 
@@ -54,25 +53,25 @@ def test_context_attention_fwd(batch, seqlen, q_heads, kv_heads, head_dim):
 
     infer_state = LlamaInferStateInfo()
     infer_state.batch_size = Z
-    infer_state.max_len_in_batch = N_CTX
+    infer_state.max_q_seq_len = N_CTX
     infer_state.total_token_num = Z * N_CTX
-    infer_state.req_manager = ReqManager(Z, N_CTX, None)
+    infer_state.req_manager = type("Object", (), {})()
     infer_state.req_manager.req_to_token_indexs = req_to_token_indexs
     infer_state.b_req_idx = b_req_idx
     infer_state.b_seq_len = b_seq_len
     infer_state.b_ready_cache_len = b_ready_cache_len
-    infer_state.b_start_loc = q_start_loc
+    infer_state.b_q_start_loc = q_start_loc
 
     context_attention_fwd(
         q,
         kv[:, :KV_HEADS, :],
         kv[:, KV_HEADS:, :],
         o,
         infer_state.b_req_idx,
-        infer_state.b_start_loc,
+        infer_state.b_q_start_loc,
         infer_state.b_seq_len,
         infer_state.b_ready_cache_len,
-        infer_state.max_len_in_batch,
+        infer_state.max_q_seq_len,
         infer_state.req_manager.req_to_token_indexs,
     )
 
@@ -127,7 +126,11 @@ def test_context_attention_fwd(batch, seqlen, q_heads, kv_heads, head_dim):
     "batch, seqlen, q_heads, kv_heads, head_dim",
     [
         (a, b, c, d, e)
-        for a in [1, 16, 32, 128, 512]
+        for a in [
+            1,
+            16,
+            32,
+        ]
         for b in [16, 32, 512, 1024]
         for c in [28]
         for d in [4]
@@ -149,18 +152,18 @@ def test_context_attention_fwd_no_prompt_cache(batch, seqlen, q_heads, kv_heads,
 
     infer_state = LlamaInferStateInfo()
     infer_state.batch_size = Z
-    infer_state.max_len_in_batch = N_CTX
+    infer_state.max_q_seq_len = N_CTX
     infer_state.b_seq_len = b_seq_len
-    infer_state.b_start_loc = b_start_loc
+    infer_state.b_q_start_loc = b_start_loc
 
     context_attention_fwd_no_prompt_cache(
         q,
         k,
         v,
         o,
-        infer_state.b_start_loc,
+        infer_state.b_q_start_loc,
         infer_state.b_seq_len,
-        infer_state.max_len_in_batch,
+        infer_state.max_q_seq_len,
     )
 
     head_dim = HEAD_DIM
 
@@ -1,6 +1,6 @@
 import torch
 import pytest
-from lightllm.models.deepseek2.triton_kernel.destindex_copy_kv import destindex_copy_kv
+from lightllm.common.basemodel.triton_kernel.kv_copy.mla_copy_kv import destindex_copy_kv
 from lightllm.utils.log_utils import init_logger
 import torch.nn.functional as F
 
 
@@ -5,9 +5,10 @@
 import torch.nn.functional as F
 import flashinfer
 from lightllm.utils.log_utils import init_logger
-from lightllm.models.deepseek2.triton_kernel.gqa_flash_decoding import gqa_token_decode_attention_flash_decoding
+from lightllm.common.basemodel.triton_kernel.mla_att.decode_att.gqa_flash_decoding import (
+    gqa_token_decode_attention_flash_decoding,
+)
 from lightllm.models.deepseek2.infer_struct import Deepseek2InferStateInfo
-from lightllm.common.req_manager import ReqManager
 
 logger = init_logger(__name__)
 
@@ -53,7 +54,7 @@ def test_gqa_flash_decoding(batch, seqlen, heads, nope_head, rope_head):
     infer_state.batch_size = Z
     infer_state.max_len_in_batch = N_CTX
     infer_state.total_token_num = Z * N_CTX
-    infer_state.req_manager = ReqManager(Z, N_CTX, None)
+    infer_state.req_manager = type("Object", (), {})()
     infer_state.req_manager.req_to_token_indexs = req_to_token_indexs
     infer_state.b_req_idx = b_req_idx
     infer_state.b_seq_len = b_seq_len
@@ -67,10 +68,6 @@ def test_gqa_flash_decoding(batch, seqlen, heads, nope_head, rope_head):
         kv_nope,
         kv_rope,
         infer_state,
-        H,
-        D_HEAD,
-        ROPE_HEAD,
-        D_HEAD,
         sm_scale,
         o,
     )
 
@@ -18,10 +18,10 @@ def test_add_in_place():
     assert input.item() == 3, "最终值应为 3"
 
 
-@pytest.mark.timeout(2)
-def test_wait_timeout():
-    input = torch.zeros((1,), device="cuda", dtype=torch.int32)
-    wait_value(input, 4)
+# @pytest.mark.timeout(2)
+# def test_wait_timeout():
+#     input = torch.zeros((1,), device="cuda", dtype=torch.int32)
+#     wait_value(input, 4)
 
 
 if __name__ == "__main__":
 
@@ -25,6 +25,7 @@ def test_token_id_counter():
     for _ in range(100):
         token_id_counter(prompt_ids=test_prompt_ids, out_token_id_counter=test_token_id_counter)
     end_event.record()
+    end_event.synchronize()
     logger.info(f"test_token_id_count cost time: {start_event.elapsed_time(end_event)} ms")
 
 
 
@@ -1,7 +1,7 @@
 import torch
 import pytest
 from lightllm.utils.log_utils import init_logger
-from lightllm.models.deepseek2.triton_kernel.repack_kv_index import repack_kv_index
+from lightllm.common.basemodel.triton_kernel.repack_kv_index import repack_kv_index
 
 logger = init_logger(__name__)