From 54c660ea40b6233a738493df03607c595457bcd2 Mon Sep 17 00:00:00 2001
From: Arne Redlich <405850+redlicha@users.noreply.github.com>
Date: Mon, 11 Nov 2024 11:38:51 +0100
Subject: [PATCH 1/3] list.h: introduce list_next_entry_or_null

Signed-off-by: Arne Redlich <405850+redlicha@users.noreply.github.com>
---
 src/usr/linux/list.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/usr/linux/list.h b/src/usr/linux/list.h
index 436c8389d..9df646991 100644
--- a/src/usr/linux/list.h
+++ b/src/usr/linux/list.h
@@ -412,6 +412,15 @@ static inline void list_splice_tail_init(struct list_head *list,
 #define list_next_entry(pos, member) \
 	list_entry((pos)->member.next, typeof(*(pos)), member)
 
+/**
+ * list_next_entry_or_null - get the next element in list, or NULL if none
+ * @head: the list head
+ * @pos: the type * to cursor (the current entry)
+ * @member: the name of the list_head within the entry.
+ */
+#define list_next_entry_or_null(head, pos, member) \
+	(((pos)->member.next == head) ? NULL : list_next_entry(pos, member))
+
 /**
  * list_prev_entry - get the prev element in list
  * @pos: the type * to cursor
@@ -772,4 +781,3 @@ static inline void hlist_move_list(struct hlist_head *old,
 
 #endif
 
-

From 4798d83bc57598d116fb1ca51c76e7038c655eb1 Mon Sep 17 00:00:00 2001
From: Arne Redlich <405850+redlicha@users.noreply.github.com>
Date: Mon, 11 Nov 2024 11:39:40 +0100
Subject: [PATCH 2/3] xio_tcp_datapath: don't use list_first_entry_or_null on
 list entries

Calling list_first_entry_or_null on a list entry's embedded list_head
(rather than on the list head) never returns NULL as long as the entry
is linked: it yields either the next entry or, when the entry is the
last one, the list head itself cast to the entry type.

Use list_next_entry_or_null instead. The same reasoning applies to the
two RX-path call sites that used plain list_first_entry on a task's
list entry; convert those as well.

Signed-off-by: Arne Redlich <405850+redlicha@users.noreply.github.com>
---
 src/usr/transport/tcp/xio_tcp_datapath.c | 31 ++++++++++++------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/src/usr/transport/tcp/xio_tcp_datapath.c b/src/usr/transport/tcp/xio_tcp_datapath.c
index e450f892e..1985fc603 100644
--- a/src/usr/transport/tcp/xio_tcp_datapath.c
+++ b/src/usr/transport/tcp/xio_tcp_datapath.c
@@ -1106,9 +1106,9 @@ int xio_tcp_xmit(struct xio_tcp_transport *tcp_hndl)
 	/* if "ready to send queue" is not empty */
 	while (likely(tcp_hndl->tx_ready_tasks_num &&
 		      (tcp_hndl->tx_comp_cnt < COMPLETION_BATCH_MAX))) {
-		next_task = list_first_entry_or_null(&task->tasks_list_entry,
-						     struct xio_task,
-						     tasks_list_entry);
+		next_task = list_next_entry_or_null(&tcp_hndl->tx_ready_list,
+						    task,
+						    tasks_list_entry);
 
 		next_tcp_task = next_task ?
 			     (struct xio_tcp_task *)next_task->dd_data : NULL;
@@ -1173,10 +1173,9 @@ int xio_tcp_xmit(struct xio_tcp_transport *tcp_hndl)
 			tcp_task = (struct xio_tcp_task *)task->dd_data;
 			tcp_task->txd.stage = XIO_TCP_TX_IN_SEND_DATA;
 			tcp_task->txd.ctl_msg_len = 0;
-			task = list_first_entry_or_null(
-					&task->tasks_list_entry,
-					struct xio_task,
-					tasks_list_entry);
+			task = list_next_entry_or_null(&tcp_hndl->tx_ready_list,
+						       task,
+						       tasks_list_entry);
 		}
 		if (tcp_hndl->tmp_work.msg.msg_iovlen) {
 			tcp_task = (struct xio_tcp_task *)task->dd_data;
@@ -3055,9 +3054,9 @@ int xio_tcp_rx_data_handler(struct xio_tcp_transport *tcp_hndl, int batch_nr)
 		if (tcp_task->rxd.stage != XIO_TCP_RX_IO_DATA)
 			break;
 
-		next_task = list_first_entry_or_null(
-				&task->tasks_list_entry,
-				struct xio_task, tasks_list_entry);
+		next_task = list_next_entry_or_null(&tcp_hndl->rx_list,
+						    task,
+						    tasks_list_entry);
 		next_tcp_task = next_task ?
 			(struct xio_tcp_task *) next_task->dd_data : NULL;
 
 		next_rxd_work = (next_tcp_task &&
@@ -3120,9 +3119,9 @@ int xio_tcp_rx_data_handler(struct xio_tcp_transport *tcp_hndl, int batch_nr)
 			iov_len -= rxd_work->msg.msg_iovlen;
 			bytes_recv -= rxd_work->tot_iov_byte_len;
 
-			task = list_first_entry(&task->tasks_list_entry,
-						struct xio_task,
-						tasks_list_entry);
+			task = list_next_entry_or_null(&tcp_hndl->rx_list,
+						       task,
+						       tasks_list_entry);
 		}
 		if (tcp_hndl->tmp_work.msg.msg_iovlen) {
 			tcp_task = (struct xio_tcp_task *)task->dd_data;
@@ -3361,8 +3360,9 @@ int xio_tcp_rx_ctl_handler(struct xio_tcp_transport *tcp_hndl, int batch_nr)
 		if (tcp_task->rxd.tot_iov_byte_len)
 			tcp_hndl->io_waiting_tasks++;
 
-		task = list_first_entry(&task->tasks_list_entry,
-					struct xio_task, tasks_list_entry);
+		task = list_next_entry_or_null(&tcp_hndl->rx_list,
+					       task,
+					       tasks_list_entry);
 	}
 
 	if (count == 0)
@@ -3429,4 +3429,3 @@ int xio_tcp_poll(struct xio_transport_base *transport,
 
 	return nr_comp;
 }
 
-

From 426eafabec4ed6ffff4d01a8647bc5d75010712b Mon Sep 17 00:00:00 2001
From: Arne Redlich <405850+redlicha@users.noreply.github.com>
Date: Mon, 11 Nov 2024 11:57:14 +0100
Subject: [PATCH 3/3] Remove xio_tcp_transport.tx_ready_tasks_num

`tx_ready_tasks_num` and `tx_ready_list` can go out of sync after
network problems. This is the source of the "unknown TX stage" errors
in `xio_tcp_xmit`: because the loop iterates based on
`tx_ready_tasks_num` rather than on the actual list, it ends up
dereferencing an invalid `tcp_task` pointer:

```
(gdb) print tcp_hndl->tx_ready_tasks_num
$4 = 1
(gdb) print tcp_hndl->tx_ready_list
$5 = {next = 0x7fc02ecf2778, prev = 0x7fc02ecf2778}
```

That is, the list is empty (next and prev point back to the list head)
while the counter still claims one ready task. Prior to [1] this would
result in an infinite loop; since that commit the connection is
disconnected instead.

The exact reason for `tx_ready_tasks_num` and `tx_ready_list` going out
of sync is not clear, but code inspection suggests that in most places
we only need to know whether the `tx_ready_list` is empty or not. The
exceptions are the following two annotated snippets in `xio_tcp_xmit`,
which use the actual value of `tx_ready_tasks_num` while handling
batching:

```
int xio_tcp_xmit(struct xio_tcp_transport *tcp_hndl)
{
	while (likely(tcp_hndl->tx_ready_tasks_num &&
		      (tcp_hndl->tx_comp_cnt < COMPLETION_BATCH_MAX))) {
		next_task = list_first_entry_or_null(&task->tasks_list_entry,
						     struct xio_task,
						     tasks_list_entry);
		next_tcp_task = next_task ?
			(struct xio_tcp_task *)next_task->dd_data : NULL;

		// [...]

		switch (tcp_task->txd.stage) {
		// [...]
		case XIO_TCP_TX_IN_SEND_CTL:
			// [...]
			++batch_count;

			// - tx_ready_tasks_num > 0 (otherwise we'd
			//   not be in the loop anymore)
			// - batch_count == tx_ready_tasks_num when
			//   next_task == NULL
			// -> we can drop the batch_count != tx_ready_tasks_num
			//    check
			if (batch_count != batch_nr &&
			    batch_count != tcp_hndl->tx_ready_tasks_num &&
			    next_task &&
			    (next_tcp_task->txd.stage ==
			     XIO_TCP_TX_IN_SEND_DATA) &&
			    (next_tcp_task->txd.msg.msg_iovlen +
			     tcp_hndl->tmp_work.msg_len) < IOV_MAX) {
				task = next_task;
				break;
			}
			// [...]
		case XIO_TCP_TX_IN_SEND_DATA:
			// [...]

			// - tx_ready_tasks_num > 0 (otherwise we'd
			//   not be in the loop anymore)
			// - batch_count == tx_ready_tasks_num when
			//   next_task == NULL
			// -> we can drop the batch_count != tx_ready_tasks_num
			//    check
			++batch_count;
			if (batch_count != batch_nr &&
			    batch_count != tcp_hndl->tx_ready_tasks_num &&
			    next_task &&
			    (next_tcp_task->txd.stage ==
			     XIO_TCP_TX_IN_SEND_DATA) &&
			    (next_tcp_task->txd.msg.msg_iovlen +
			     tcp_hndl->tmp_work.msg_len) < IOV_MAX) {
				task = next_task;
				break;
			}
			// [...]
} } ``` [1] "9d5f92077e645a6c7461dd4ab030df3a75417b2f accelio: disconnect on tx stage error" Signed-off-by: Arne Redlich <405850+redlicha@users.noreply.github.com> --- src/usr/transport/tcp/xio_tcp_datapath.c | 18 +++++------------- src/usr/transport/tcp/xio_tcp_management.c | 4 +--- src/usr/transport/tcp/xio_tcp_transport.h | 4 ++-- 3 files changed, 8 insertions(+), 18 deletions(-) diff --git a/src/usr/transport/tcp/xio_tcp_datapath.c b/src/usr/transport/tcp/xio_tcp_datapath.c index 1985fc603..3a7671770 100644 --- a/src/usr/transport/tcp/xio_tcp_datapath.c +++ b/src/usr/transport/tcp/xio_tcp_datapath.c @@ -1013,7 +1013,7 @@ static void xio_tcp_tx_completion_handler(int actual_timeout_ms, void *xio_task) &tcp_hndl->disconnect_event); } else { xio_context_disable_event(&tcp_hndl->disconnect_event); - if (tcp_hndl->tx_ready_tasks_num) + if (!list_empty(&tcp_hndl->tx_ready_list)) xio_tcp_xmit(tcp_hndl); } } @@ -1093,7 +1093,7 @@ int xio_tcp_xmit(struct xio_tcp_transport *tcp_hndl) unsigned int iov_len; uint64_t bytes_sent; - if (tcp_hndl->tx_ready_tasks_num == 0 || + if (list_empty(&tcp_hndl->tx_ready_list) || tcp_hndl->tx_comp_cnt > COMPLETION_BATCH_MAX || tcp_hndl->state != XIO_TRANSPORT_STATE_CONNECTED) { xio_set_error(XIO_EAGAIN); @@ -1104,7 +1104,7 @@ int xio_tcp_xmit(struct xio_tcp_transport *tcp_hndl) tasks_list_entry); /* if "ready to send queue" is not empty */ - while (likely(tcp_hndl->tx_ready_tasks_num && + while (likely(!list_empty(&tcp_hndl->tx_ready_list) && (tcp_hndl->tx_comp_cnt < COMPLETION_BATCH_MAX))) { next_task = list_next_entry_or_null(&tcp_hndl->tx_ready_list, task, @@ -1138,7 +1138,6 @@ int xio_tcp_xmit(struct xio_tcp_transport *tcp_hndl) ++batch_count; if (batch_count != batch_nr && - batch_count != tcp_hndl->tx_ready_tasks_num && next_task && next_tcp_task->txd.stage <= XIO_TCP_TX_IN_SEND_CTL) { @@ -1238,7 +1237,6 @@ int xio_tcp_xmit(struct xio_tcp_transport *tcp_hndl) ++batch_count; if (batch_count != batch_nr && - batch_count != tcp_hndl->tx_ready_tasks_num && next_task && (next_tcp_task->txd.stage == XIO_TCP_TX_IN_SEND_DATA) && @@ -1273,8 +1271,6 @@ int xio_tcp_xmit(struct xio_tcp_transport *tcp_hndl) iov_len -= tcp_task->txd.msg.msg_iovlen; bytes_sent -= tcp_task->txd.tot_iov_byte_len; - tcp_hndl->tx_ready_tasks_num--; - list_move_tail(&task->tasks_list_entry, &tcp_hndl->in_flight_list); @@ -1604,8 +1600,6 @@ static int xio_tcp_send_req(struct xio_tcp_transport *tcp_hndl, else list_move_tail(&task->tasks_list_entry, &tcp_hndl->tx_ready_list); - tcp_hndl->tx_ready_tasks_num++; - retval = xio_tcp_xmit(tcp_hndl); if (retval) { if (xio_errno() != XIO_EAGAIN) { @@ -1990,8 +1984,6 @@ static int xio_tcp_send_rsp(struct xio_tcp_transport *tcp_hndl, else list_move_tail(&task->tasks_list_entry, &tcp_hndl->tx_ready_list); - tcp_hndl->tx_ready_tasks_num++; - retval = xio_tcp_xmit(tcp_hndl); if (retval) { /* no need xio_get_last_error here */ @@ -3202,7 +3194,7 @@ int xio_tcp_rx_data_handler(struct xio_tcp_transport *tcp_hndl, int batch_nr) tasks_list_entry); } - if (tcp_hndl->tx_ready_tasks_num) { + if (!list_empty(&tcp_hndl->tx_ready_list)) { retval = xio_tcp_xmit(tcp_hndl); if (retval < 0) { if (xio_errno() != XIO_EAGAIN) { @@ -3373,7 +3365,7 @@ int xio_tcp_rx_ctl_handler(struct xio_tcp_transport *tcp_hndl, int batch_nr) return retval; count = retval; - if (tcp_hndl->tx_ready_tasks_num) { + if (!list_empty(&tcp_hndl->tx_ready_list)) { retval = xio_tcp_xmit(tcp_hndl); if (retval < 0) { if (xio_errno() != XIO_EAGAIN) { diff --git a/src/usr/transport/tcp/xio_tcp_management.c 
b/src/usr/transport/tcp/xio_tcp_management.c index 87ee268e3..079927933 100644 --- a/src/usr/transport/tcp/xio_tcp_management.c +++ b/src/usr/transport/tcp/xio_tcp_management.c @@ -188,8 +188,6 @@ static int xio_tcp_flush_all_tasks(struct xio_tcp_transport *tcp_hndl) xio_tasks_list_flush(&tcp_hndl->rx_list); } - tcp_hndl->tx_ready_tasks_num = 0; - return 0; } @@ -907,7 +905,7 @@ struct xio_tcp_transport *xio_tcp_transport_create( tcp_hndl->tmp_rx_buf_cur = NULL; tcp_hndl->tmp_rx_buf_len = 0; - tcp_hndl->tx_ready_tasks_num = 0; + tcp_hndl->_pad = 0; tcp_hndl->tx_comp_cnt = 0; memset(&tcp_hndl->tmp_work, 0, sizeof(struct xio_tcp_work_req)); diff --git a/src/usr/transport/tcp/xio_tcp_transport.h b/src/usr/transport/tcp/xio_tcp_transport.h index 888878f57..6aca93683 100644 --- a/src/usr/transport/tcp/xio_tcp_transport.h +++ b/src/usr/transport/tcp/xio_tcp_transport.h @@ -321,8 +321,8 @@ struct xio_tcp_transport { /* tx parameters */ size_t max_inline_buf_sz; - - int tx_ready_tasks_num; + /* this used to be `tx_ready_tasks_num` - keeping it to satisfy `-Werror=padded` */ + int _pad; uint16_t tx_comp_cnt;
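
For reference, the list_first_entry_or_null misuse that PATCH 2/3 fixes can be
reproduced in isolation. The following standalone sketch re-creates just enough
of the Linux-style list machinery to show the behavior: the struct item type,
the main() demo and the reimplemented helper macros are illustrative stand-ins
for src/usr/linux/list.h, not the project's actual code; only
list_next_entry_or_null is taken verbatim from PATCH 1/3, and typeof is the
same GNU C extension the real header already relies on.

```
/*
 * Standalone demo (illustrative only, not part of the patches): shows why
 * list_first_entry_or_null() applied to an entry's own list_head never
 * yields NULL for a linked entry, while list_next_entry_or_null() does.
 */
#include <stddef.h>
#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static void list_add_tail(struct list_head *entry, struct list_head *head)
{
	entry->prev = head->prev;
	entry->next = head;
	head->prev->next = entry;
	head->prev = entry;
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)
#define list_first_entry_or_null(head, type, member) \
	((head)->next != (head) ? list_entry((head)->next, type, member) : NULL)
#define list_next_entry(pos, member) \
	list_entry((pos)->member.next, typeof(*(pos)), member)
/* as introduced by PATCH 1/3 */
#define list_next_entry_or_null(head, pos, member) \
	(((pos)->member.next == head) ? NULL : list_next_entry(pos, member))

struct item {
	int id;
	struct list_head entry;
};

int main(void)
{
	struct list_head head = LIST_HEAD_INIT(head);
	struct item a = { .id = 1 }, b = { .id = 2 };

	list_add_tail(&a.entry, &head);
	list_add_tail(&b.entry, &head);

	/*
	 * Wrong: &b.entry is not a list head. Its "first entry" is the real
	 * head disguised as a struct item, so the result is never NULL even
	 * though b is the last element.
	 */
	struct item *bogus = list_first_entry_or_null(&b.entry, struct item,
						      entry);
	printf("bogus 'next' of b: %p (not NULL)\n", (void *)bogus);

	/* Right: compare against the real head and stop with NULL. */
	struct item *next = list_next_entry_or_null(&head, &a, entry);
	printf("next of a: id=%d\n", next->id);	/* id=2 */
	printf("next of b: %p\n",
	       (void *)list_next_entry_or_null(&head, &b, entry)); /* (nil) */

	return 0;
}
```

In xio_tcp_xmit and the RX handlers the situation is exactly the "wrong" case
above: task already points at an entry of tx_ready_list or rx_list, so the
list head has to be passed in explicitly for the NULL check to be meaningful.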