Skip to content

Commit 903bec3

Browse files
Dimon-Zhaoshemminger
authored andcommitted
net/nbl: improve mailbox exception handling
Key changes: 1. Replace simple 'acked' flag with a state machine to improve mailbox reliability. New states: IDLE, WAITING, ACKING, ACKED, TIMEOUT. 2. Validate message ID and type before processing acknowledgments to prevent incorrect ACK matching. 3. Handle timeout scenarios: discard ACKs received after timeout to prevent use of already released buffers. 4. Check mailbox status before sending: if the send mailbox descriptor is found to be in use or waiting for an ACK during Tx, the message will not be sent. Fixes: a1c5ffa ("net/nbl: add channel layer") Cc: stable@dpdk.org Signed-off-by: Dimon Zhao <dimon.zhao@nebula-matrix.com>
1 parent 43ed974 commit 903bec3

File tree

2 files changed

+85
-20
lines changed

2 files changed

+85
-20
lines changed

drivers/net/nbl/nbl_hw/nbl_channel.c

Lines changed: 76 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ static int nbl_chan_init_tx_queue(union nbl_chan_info *chan_info)
2222
{
2323
struct nbl_chan_ring *txq = &chan_info->mailbox.txq;
2424
size_t size = chan_info->mailbox.num_txq_entries * sizeof(struct nbl_chan_tx_desc);
25+
int i;
2526

2627
txq->desc = nbl_alloc_dma_mem(&txq->desc_mem, size);
2728
if (!txq->desc) {
@@ -36,6 +37,10 @@ static int nbl_chan_init_tx_queue(union nbl_chan_info *chan_info)
3637
goto req_wait_queue_failed;
3738
}
3839

40+
for (i = 0; i < chan_info->mailbox.num_txq_entries; i++)
41+
rte_atomic_store_explicit(&chan_info->mailbox.wait[i].status, NBL_MBX_STATUS_IDLE,
42+
rte_memory_order_relaxed);
43+
3944
size = (u64)chan_info->mailbox.num_txq_entries * (u64)chan_info->mailbox.txq_buf_size;
4045
txq->buf = nbl_alloc_dma_mem(&txq->buf_mem, size);
4146
if (!txq->buf) {
@@ -253,24 +258,55 @@ static void nbl_chan_recv_ack_msg(void *priv, uint16_t srcid, uint16_t msgid,
253258
uint32_t ack_msgid;
254259
uint32_t ack_msgtype;
255260
uint32_t copy_len;
261+
int status = NBL_MBX_STATUS_WAITING;
256262

257263
chan_info = NBL_CHAN_MGT_TO_CHAN_INFO(chan_mgt);
258264
ack_msgtype = *payload;
259265
ack_msgid = *(payload + 1);
266+
267+
if (ack_msgid >= chan_mgt->chan_info->mailbox.num_txq_entries) {
268+
NBL_LOG(ERR, "Invalid msg id %u, msg type %u", ack_msgid, ack_msgtype);
269+
return;
270+
}
271+
260272
wait_head = &chan_info->mailbox.wait[ack_msgid];
273+
if (wait_head->msg_type != ack_msgtype) {
274+
NBL_LOG(ERR, "Invalid msg id %u, msg type %u, wait msg type %u",
275+
ack_msgid, ack_msgtype, wait_head->msg_type);
276+
return;
277+
}
278+
279+
if (!rte_atomic_compare_exchange_strong_explicit(&wait_head->status, &status,
280+
NBL_MBX_STATUS_ACKING,
281+
rte_memory_order_acq_rel,
282+
rte_memory_order_relaxed)) {
283+
NBL_LOG(ERR, "Invalid wait status %d msg id %u, msg type %u",
284+
wait_head->status, ack_msgid, ack_msgtype);
285+
return;
286+
}
287+
261288
wait_head->ack_err = *(payload + 2);
262289

263290
NBL_LOG(DEBUG, "recv ack, srcid:%u, msgtype:%u, msgid:%u, ack_msgid:%u,"
264-
" data_len:%u, ack_data_len:%u",
265-
srcid, ack_msgtype, msgid, ack_msgid, data_len, wait_head->ack_data_len);
291+
" data_len:%u, ack_data_len:%u, ack_err:%d",
292+
srcid, ack_msgtype, msgid, ack_msgid, data_len,
293+
wait_head->ack_data_len, wait_head->ack_err);
266294

267295
if (wait_head->ack_err >= 0 && (data_len > 3 * sizeof(uint32_t))) {
268296
/* the mailbox msg parameter structure may change */
297+
if (data_len - 3 * sizeof(uint32_t) != wait_head->ack_data_len)
298+
NBL_LOG(INFO, "payload_len does not match ack_len!,"
299+
" srcid:%u, msgtype:%u, msgid:%u, ack_msgid %u,"
300+
" data_len:%u, ack_data_len:%u",
301+
srcid, ack_msgtype, msgid,
302+
ack_msgid, data_len, wait_head->ack_data_len);
269303
copy_len = RTE_MIN(wait_head->ack_data_len, data_len - 3 * sizeof(uint32_t));
270-
memcpy(wait_head->ack_data, payload + 3, copy_len);
304+
if (copy_len)
305+
memcpy(wait_head->ack_data, payload + 3, copy_len);
271306
}
272307

273-
rte_atomic_store_explicit(&wait_head->acked, 1, rte_memory_order_release);
308+
rte_atomic_store_explicit(&wait_head->status, NBL_MBX_STATUS_ACKED,
309+
rte_memory_order_release);
274310
}
275311

276312
static void nbl_chan_recv_msg(struct nbl_channel_mgt *chan_mgt, void *data)
@@ -371,23 +407,34 @@ static uint16_t nbl_chan_update_txqueue(union nbl_chan_info *chan_info,
371407
{
372408
struct nbl_chan_ring *txq;
373409
struct nbl_chan_tx_desc *tx_desc;
410+
struct nbl_chan_waitqueue_head *wait_head;
374411
uint64_t pa;
375412
void *va;
376413
uint16_t next_to_use;
414+
int status;
415+
416+
if (arg_len > NBL_CHAN_BUF_LEN - sizeof(*tx_desc)) {
417+
NBL_LOG(ERR, "arg_len: %zu, too long!", arg_len);
418+
return 0xFFFF;
419+
}
377420

378421
txq = &chan_info->mailbox.txq;
379422
next_to_use = txq->next_to_use;
423+
424+
wait_head = &chan_info->mailbox.wait[next_to_use];
425+
status = rte_atomic_load_explicit(&wait_head->status, rte_memory_order_acquire);
426+
if (status != NBL_MBX_STATUS_IDLE && status != NBL_MBX_STATUS_TIMEOUT) {
427+
NBL_LOG(ERR, "too much inflight mailbox msg, msg id %u", next_to_use);
428+
return 0xFFFF;
429+
}
430+
380431
va = (u8 *)txq->buf + (u64)next_to_use * (u64)chan_info->mailbox.txq_buf_size;
381432
pa = txq->buf_mem.pa + (u64)next_to_use * (u64)chan_info->mailbox.txq_buf_size;
382433
tx_desc = NBL_CHAN_TX_DESC(txq, next_to_use);
383434

384435
tx_desc->dstid = dstid;
385436
tx_desc->msg_type = msg_type;
386437
tx_desc->msgid = next_to_use;
387-
if (arg_len > NBL_CHAN_BUF_LEN - sizeof(*tx_desc)) {
388-
NBL_LOG(ERR, "arg_len: %zu, too long!", arg_len);
389-
return -1;
390-
}
391438

392439
if (arg_len > NBL_CHAN_TX_DESC_EMBEDDED_DATA_LEN) {
393440
memcpy(va, arg, arg_len);
@@ -418,6 +465,7 @@ static int nbl_chan_send_msg(void *priv, struct nbl_chan_send_info *chan_send)
418465
uint16_t msgid;
419466
int ret;
420467
int retry_time = 0;
468+
int status = NBL_MBX_STATUS_WAITING;
421469

422470
if (chan_mgt->state)
423471
return -EIO;
@@ -431,8 +479,7 @@ static int nbl_chan_send_msg(void *priv, struct nbl_chan_send_info *chan_send)
431479

432480
if (msgid == 0xFFFF) {
433481
rte_spinlock_unlock(&chan_info->mailbox.txq_lock);
434-
NBL_LOG(ERR, "chan tx queue full, send msgtype:%u"
435-
" to dstid:%u failed",
482+
NBL_LOG(ERR, "chan tx queue full, send msgtype:%u to dstid:%u failed",
436483
chan_send->msg_type, chan_send->dstid);
437484
return -ECOMM;
438485
}
@@ -446,23 +493,34 @@ static int nbl_chan_send_msg(void *priv, struct nbl_chan_send_info *chan_send)
446493
wait_head = &chan_info->mailbox.wait[msgid];
447494
wait_head->ack_data = chan_send->resp;
448495
wait_head->ack_data_len = chan_send->resp_len;
449-
rte_atomic_store_explicit(&wait_head->acked, 0, rte_memory_order_relaxed);
450496
wait_head->msg_type = chan_send->msg_type;
451-
rte_wmb();
497+
rte_atomic_store_explicit(&wait_head->status, NBL_MBX_STATUS_WAITING,
498+
rte_memory_order_release);
452499
nbl_chan_kick_tx_ring(chan_mgt, chan_info);
453500
rte_spinlock_unlock(&chan_info->mailbox.txq_lock);
454501

455502
while (1) {
456-
if (rte_atomic_load_explicit(&wait_head->acked, rte_memory_order_acquire))
457-
return wait_head->ack_err;
503+
if (rte_atomic_load_explicit(&wait_head->status, rte_memory_order_acquire) ==
504+
NBL_MBX_STATUS_ACKED) {
505+
ret = wait_head->ack_err;
506+
rte_atomic_store_explicit(&wait_head->status, NBL_MBX_STATUS_IDLE,
507+
rte_memory_order_release);
508+
return ret;
509+
}
458510

459511
rte_delay_us(50);
460512
retry_time++;
461-
if (retry_time > NBL_CHAN_RETRY_TIMES)
462-
return -EIO;
513+
if (retry_time > NBL_CHAN_RETRY_TIMES &&
514+
rte_atomic_compare_exchange_strong_explicit(&wait_head->status,
515+
&status, NBL_MBX_STATUS_TIMEOUT,
516+
rte_memory_order_acq_rel,
517+
rte_memory_order_relaxed)) {
518+
NBL_LOG(ERR, "send msgtype:%u msgid %u to dstid:%u timeout",
519+
chan_send->msg_type, msgid, chan_send->dstid);
520+
break;
521+
}
463522
}
464-
465-
return 0;
523+
return -EIO;
466524
}
467525

468526
static int nbl_chan_send_ack(void *priv, struct nbl_chan_ack_info *chan_ack)

drivers/net/nbl/nbl_hw/nbl_channel.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,14 @@ enum {
4141
NBL_MB_TX_QID = 1,
4242
};
4343

44+
enum {
45+
NBL_MBX_STATUS_IDLE = 0,
46+
NBL_MBX_STATUS_WAITING,
47+
NBL_MBX_STATUS_ACKING,
48+
NBL_MBX_STATUS_ACKED,
49+
NBL_MBX_STATUS_TIMEOUT,
50+
};
51+
4452
struct __rte_packed_begin nbl_chan_tx_desc {
4553
uint16_t flags;
4654
uint16_t srcid;
@@ -74,8 +82,7 @@ struct nbl_chan_ring {
7482

7583
struct nbl_chan_waitqueue_head {
7684
char *ack_data;
77-
RTE_ATOMIC(int)
78-
acked;
85+
RTE_ATOMIC(int) status;
7986
int ack_err;
8087
uint16_t ack_data_len;
8188
uint16_t msg_type;

0 commit comments

Comments
 (0)