Commit e29b005

[Others] Clean code && remove GPU sync code (#5548)
1 parent: 867803a

2 files changed: +40 -44 lines changed


fastdeploy/worker/gpu_model_runner.py

Lines changed: 15 additions & 14 deletions
@@ -637,6 +637,7 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
         batch_pooling_params = []
         for i in range(req_len):
             request = req_dicts[i]
+            # assert isinstance(request, Request)
             idx = request.idx

             if hasattr(request, "pooling_params") and request.pooling_params is not None:
@@ -655,14 +656,14 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
             logits_info, schemata_key = self._init_logits_processor(request)
             request.schemata_key = schemata_key

-            if self.scheduler_config.splitwise_role == "decode":
-                if (
-                    hasattr(request, "prefill_end_index")
-                    and hasattr(request, "prompt_token_ids")
-                    and request.prefill_end_index > len(request.prompt_token_ids)
-                ):
-                    if hasattr(request, "output_token_ids"):
-                        prefill_tokens.extend(request.output_token_ids)
+            if (
+                self.scheduler_config.splitwise_role == "decode"
+                and hasattr(request, "prefill_end_index")
+                and hasattr(request, "prompt_token_ids")
+                and request.prefill_end_index > len(request.prompt_token_ids)
+                and hasattr(request, "output_token_ids")
+            ):
+                prefill_tokens.extend(request.output_token_ids)

             prefill_start_index = request.prefill_start_index
             prefill_end_index = request.prefill_end_index
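The flattened condition is behavior-preserving because Python's `and` short-circuits: each `hasattr` guard still runs before the attribute access it protects, exactly as the nested `if`s did. A minimal sketch with a toy stand-in (not FastDeploy's Request class):

class Req:  # toy stand-in for Request
    pass

r = Req()  # has none of the guarded attributes

# The comparison after a failed hasattr() is never evaluated,
# so the merged chain cannot raise AttributeError.
ok = (
    hasattr(r, "prefill_end_index")
    and hasattr(r, "prompt_token_ids")
    and r.prefill_end_index > len(r.prompt_token_ids)
    and hasattr(r, "output_token_ids")
)
print(ok)  # False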
@@ -784,12 +785,12 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
 
             if request.get("bad_words_token_ids") is not None and len(request.get("bad_words_token_ids")) > 0:
                 bad_words_len = len(request.get("bad_words_token_ids"))
-                self.share_inputs["bad_tokens_len"][idx : idx + 1] = bad_words_len
+                self.share_inputs["bad_tokens_len"][idx] = bad_words_len
                 self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len] = np.array(
                     request.get("bad_words_token_ids"), dtype="int64"
                 )
             else:
-                self.share_inputs["bad_tokens_len"][idx : idx + 1] = 1
+                self.share_inputs["bad_tokens_len"][idx] = 1
                 self.share_inputs["bad_tokens"][idx : idx + 1, :] = np.array([-1], dtype="int64")

             if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None:
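The switch from the slice `[idx : idx + 1]` to the scalar index `[idx]` goes hand in hand with the `_init_share_inputs` hunk further down, where `bad_tokens_len` becomes a plain Python list: list slice assignment requires an iterable, whereas the old paddle tensor accepted a scalar through a slice. A quick illustration:

bad_tokens_len = [-1] * 4  # host-side list, as initialized below

bad_tokens_len[2] = 7  # scalar index: fine
try:
    bad_tokens_len[2:3] = 7  # slice assignment on a list needs an iterable
except TypeError as err:
    print(err)  # "can only assign an iterable"
bad_tokens_len[2:3] = [7]  # the slice form would need a one-element list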
@@ -1007,12 +1008,12 @@ def get_attr_from_request(request, attr, default_value=None):
 
         if request.get("bad_words_token_ids") is not None and len(request.get("bad_words_token_ids")) > 0:
             bad_words_len = len(request.get("bad_words_token_ids"))
-            self.share_inputs["bad_tokens_len"][idx : idx + 1] = bad_words_len
+            self.share_inputs["bad_tokens_len"][idx] = bad_words_len
             self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len] = np.array(
                 request.get("bad_words_token_ids"), dtype="int64"
             )
         else:
-            self.share_inputs["bad_tokens_len"][idx : idx + 1] = 1
+            self.share_inputs["bad_tokens_len"][idx] = 1
             self.share_inputs["bad_tokens"][idx : idx + 1, :] = np.array([-1], dtype="int64")

         if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None:
@@ -1217,7 +1218,7 @@ def _init_share_inputs(self, max_num_seqs: int):
         self.share_inputs["stop_nums"] = paddle.full([1], max_num_seqs, dtype="int64")

         self.share_inputs["bad_tokens"] = paddle.full([max_num_seqs, self.model_config.vocab_size], -1, dtype="int64")
-        self.share_inputs["bad_tokens_len"] = paddle.full([max_num_seqs], 1, dtype="int64")
+        self.share_inputs["bad_tokens_len"] = [-1] * max_num_seqs
         self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1], -1, dtype="int64")
         self.share_inputs["is_block_step"] = paddle.full([max_num_seqs], False, dtype="bool")
         self.share_inputs["is_chunk_step"] = paddle.full([max_num_seqs], False, dtype="bool").cpu()
@@ -1447,7 +1448,7 @@ def _prepare_inputs(self, is_dummy_or_profile_run=False) -> None:
         self.share_inputs["output_padding_offset"].copy_(output_padding_offset, False)

         # Update bad tokens len
-        max_bad_tokens_len = np.max(self.share_inputs["bad_tokens_len"].numpy())
+        max_bad_tokens_len = max(self.share_inputs["bad_tokens_len"])

         # Initialize forward meta data
         self.initialize_forward_meta(is_dummy_or_profile_run=is_dummy_or_profile_run)
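This is the GPU sync the commit title refers to: `np.max(tensor.numpy())` copies the tensor back to the host, which blocks until the device work has finished, while `max()` over a Python list is pure host work. A minimal sketch of the before/after pattern (variable names mirror the diff; the rest is illustrative):

import paddle

max_num_seqs = 64

# Before: a device tensor; .numpy() forces a GPU-to-CPU copy (an implicit
# device synchronization) on every step just to read one integer.
bad_tokens_len_gpu = paddle.full([max_num_seqs], 1, dtype="int64")
m_before = int(bad_tokens_len_gpu.numpy().max())

# After: a plain Python list kept on the host; max() never touches the GPU.
bad_tokens_len = [-1] * max_num_seqs
bad_tokens_len[3] = 9
m_after = max(bad_tokens_len)  # 9

Note the default also changes from 1 to -1, so `max_bad_tokens_len` can now be -1 on steps where no request has written an entry.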

tests/layers/test_attention_layer.py

Lines changed: 25 additions & 30 deletions
@@ -270,7 +270,7 @@ def create_forward_meta(
             partial_rotary_factor=fd_config.model_config.partial_rotary_factor,
         )

-        input_ids = paddle.zeros([batch_size, seq_len if mode == ForwardMode.EXTEND else 1], dtype="int64")
+        input_ids = paddle.zeros([batch_size, fd_config.model_config.max_model_len], dtype="int64")
         token_num = np.sum(seq_lens_this_time)
         ids_remove_padding, batch_id_per_token, cu_seqlens_q, cu_seqlens_k = get_padding_offset(
             input_ids, seq_lens_this_time, token_num
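For context, `get_padding_offset` packs the padded `[batch_size, max_len]` ids into one flat token stream using the per-request lengths. A rough NumPy approximation of what such a helper returns (my sketch, not FastDeploy's actual implementation):

import numpy as np

def get_padding_offset_sketch(input_ids, seq_lens):
    # Keep only the first seq_lens[i] tokens of each row, concatenated.
    ids_remove_padding = np.concatenate([input_ids[i, :n] for i, n in enumerate(seq_lens)])
    # Map every packed token back to the batch row it came from.
    batch_id_per_token = np.concatenate([np.full(n, i) for i, n in enumerate(seq_lens)])
    # Cumulative sequence boundaries, shared by Q and K in this sketch.
    cu_seqlens = np.concatenate([[0], np.cumsum(seq_lens)])
    return ids_remove_padding, batch_id_per_token, cu_seqlens, cu_seqlens

ids, bid, cu_q, cu_k = get_padding_offset_sketch(
    np.zeros((2, 8), dtype=np.int64), np.array([3, 5])
)
print(cu_q)  # [0 3 8]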
@@ -302,27 +302,22 @@ def test_decode_performance_with_prefill(self):
         # Test parameters
         test_steps = 100

-        # prefill_batch_size = 1
-        # prefill_seq_len = 4096
+        prefill_batch_size = 1
+        prefill_seq_len = 2048

-        # prefill_hidden_states = paddle.randn(
-        #     [prefill_batch_size * prefill_seq_len, self.fd_config.model_config.hidden_size],
-        #     dtype=act_tensor_dtype,
-        # )
-
-        # forward_meta = self.create_forward_meta(
-        #     batch_size=prefill_batch_size,
-        #     seq_len=prefill_seq_len,
-        #     mode=ForwardMode.EXTEND,
-        #     fd_config=self.fd_config,
-        #     attn_backend=self.attn_backend,
-        #     cache_quant_type_str=self.cache_quant_type_str,
-        # )
+        forward_meta, prefill_hidden_states = self.create_forward_meta(
+            batch_size=prefill_batch_size,
+            seq_len=prefill_seq_len,
+            mode=ForwardMode.EXTEND,
+            fd_config=self.fd_config,
+            attn_backend=self.attn_backend,
+            cache_quant_type_str=self.cache_quant_type_str,
+        )

-        # self.attn_backend.init_attention_metadata(forward_meta)
-        # self.attn_forward(forward_meta, prefill_hidden_states)
+        self.attn_backend.init_attention_metadata(forward_meta)
+        self.attn_forward(forward_meta, prefill_hidden_states)

-        # paddle.device.synchronize()
+        paddle.device.synchronize()

         # import paddle.profiler as profiler
         # p = profiler.Profiler(
# p = profiler.Profiler(
@@ -332,22 +327,22 @@ def test_decode_performance_with_prefill(self):
         # p.start()
         # p.step()

-        # start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
-        # end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
-        # for i in range(test_steps):
-        #     start_events[i].record()
-
-        #     self.attn_forward(forward_meta, prefill_hidden_states)
+        start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
+        end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
+        for i in range(test_steps):
+            start_events[i].record()

-        #     end_events[i].record()
-        # paddle.device.synchronize()
+            self.attn_forward(forward_meta, prefill_hidden_states)

-        # times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:]
-        # print(times[-5:])
-        # return
+            end_events[i].record()
+        paddle.device.synchronize()

+        times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:]
+        print(times[-5:])
         # p.stop()

+        return
+
         # p = profiler.Profiler(
         #     targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
         #     on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
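The revived timing code follows the common CUDA-event benchmarking pattern: record start/end events around each step, synchronize so all events have completed before reading them, then drop the first sample (`[1:]`) as warm-up. A self-contained sketch of that pattern, with a hypothetical `step()` standing in for `attn_forward` (the matmul below is purely illustrative):

import numpy as np
import paddle

def bench(step, test_steps=100):
    starts = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
    ends = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
    for i in range(test_steps):
        starts[i].record()
        step()
        ends[i].record()
    paddle.device.synchronize()  # elapsed_time requires completed events
    times = np.array([s.elapsed_time(e) for s, e in zip(starts, ends)])[1:]  # skip warm-up
    return times.mean()

# Example step: a matmul in place of the attention forward pass.
x = paddle.randn([1024, 1024], dtype="float16")
print(bench(lambda: x @ x), "ms")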
