Revert kvec msg iterator before trying to process a TLS alert when possible.
In nvmet_tcp_try_recv_data(), it's assumed that no msg control message buffer is set prior to sock_recvmsg(). If no control message structure is setup, kTLS layer will read and process TLS data record types. As soon as it encounters a TLS control message, it would return an error. At that point, we setup a kvec backed control buffer and read in the control message such as a TLS alert. Msg can advance the kvec pointer as a part of the copy process thus we need to revert the iterator before calling into the tls_alert_recv.
Fixes: a1c5dd8355b1 ("nvmet-tcp: control messages for recvmsg()") Cc: Stable@vger.kernel.org Signed-off-by: Olga Kornievskaia okorniev@redhat.com --- drivers/nvme/target/tcp.c | 48 +++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 10 deletions(-)
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 688033b88d38..cf3336ddc9a3 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1161,6 +1161,7 @@ static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue) if (unlikely(len < 0)) return len; if (queue->tls_pskid) { + iov_iter_revert(&msg.msg_iter, len); ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf); if (ret < 0) return ret; @@ -1218,18 +1219,47 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) { struct nvmet_tcp_cmd *cmd = queue->cmd; int len, ret; + union { + struct cmsghdr cmsg; + u8 buf[CMSG_SPACE(sizeof(u8))]; + } u; + u8 alert[2]; + struct kvec alert_kvec = { + .iov_base = alert, + .iov_len = sizeof(alert), + }; + struct msghdr msg = { + .msg_control = &u, + .msg_controllen = sizeof(u), + };
while (msg_data_left(&cmd->recv_msg)) { + /* assumed that cmg->recv_msg's control buffer is not setup + */ + WARN_ON_ONCE(cmd->recv_msg.msg_controllen > 0); + len = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg, cmd->recv_msg.msg_flags); + if (cmd->recv_msg.msg_flags & MSG_CTRUNC) { + cmd->recv_msg.msg_flags &= ~(MSG_CTRUNC | MSG_EOF); + if (len == 0 || len == -EIO) { + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, + 1, alert_kvec.iov_len); + len = sock_recvmsg(cmd->queue->sock, &msg, + MSG_DONTWAIT); + if (len > 0 && + tls_get_record_type(cmd->queue->sock->sk, + &u.cmsg) == TLS_RECORD_TYPE_ALERT) { + iov_iter_revert(&msg.msg_iter, len); + ret = nvmet_tcp_tls_record_ok(cmd->queue, + &msg, u.buf); + if (ret < 0) + return ret; + } + } + } if (len <= 0) return len; - if (queue->tls_pskid) { - ret = nvmet_tcp_tls_record_ok(cmd->queue, - &cmd->recv_msg, cmd->recv_cbuf); - if (ret < 0) - return ret; - }
cmd->pdu_recv += len; cmd->rbytes_done += len; @@ -1267,6 +1297,7 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) if (unlikely(len < 0)) return len; if (queue->tls_pskid) { + iov_iter_revert(&msg.msg_iter, len); ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf); if (ret < 0) return ret; @@ -1453,10 +1484,6 @@ static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue, if (!c->r2t_pdu) goto out_free_data;
- if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) { - c->recv_msg.msg_control = c->recv_cbuf; - c->recv_msg.msg_controllen = sizeof(c->recv_cbuf); - } c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
list_add_tail(&c->entry, &queue->free_list); @@ -1736,6 +1763,7 @@ static int nvmet_tcp_try_peek_pdu(struct nvmet_tcp_queue *queue) return len; }
+ iov_iter_revert(&msg.msg_iter, len); ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf); if (ret < 0) return ret;
On 7/29/25 18:40, Olga Kornievskaia wrote:
Revert kvec msg iterator before trying to process a TLS alert when possible.
In nvmet_tcp_try_recv_data(), it's assumed that no msg control message buffer is set prior to sock_recvmsg(). If no control message structure is setup, kTLS layer will read and process TLS data record types. As soon as it encounters a TLS control message, it would return an error. At that point, we setup a kvec backed control buffer and read in the control message such as a TLS alert. Msg can advance the kvec pointer as a part of the copy process thus we need to revert the iterator before calling into the tls_alert_recv.
Fixes: a1c5dd8355b1 ("nvmet-tcp: control messages for recvmsg()") Cc: Stable@vger.kernel.org Signed-off-by: Olga Kornievskaia okorniev@redhat.com
drivers/nvme/target/tcp.c | 48 +++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 10 deletions(-)
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 688033b88d38..cf3336ddc9a3 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1161,6 +1161,7 @@ static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue) if (unlikely(len < 0)) return len; if (queue->tls_pskid) {
ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf); if (ret < 0) return ret;iov_iter_revert(&msg.msg_iter, len);
@@ -1218,18 +1219,47 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) { struct nvmet_tcp_cmd *cmd = queue->cmd; int len, ret;
- union {
struct cmsghdr cmsg;
u8 buf[CMSG_SPACE(sizeof(u8))];
- } u;
- u8 alert[2];
- struct kvec alert_kvec = {
.iov_base = alert,
.iov_len = sizeof(alert),
- };
- struct msghdr msg = {
.msg_control = &u,
.msg_controllen = sizeof(u),
- };
while (msg_data_left(&cmd->recv_msg)) {
/* assumed that cmg->recv_msg's control buffer is not setup
*/
WARN_ON_ONCE(cmd->recv_msg.msg_controllen > 0);
- len = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg, cmd->recv_msg.msg_flags);
if (cmd->recv_msg.msg_flags & MSG_CTRUNC) {
Hmm. Looks as if we were getting MSG_CTRUNC even if no buffer is passed. OK.
cmd->recv_msg.msg_flags &= ~(MSG_CTRUNC | MSG_EOF);
Not sure with this. We had _terrible_ issues with MSG_EOF not set correctly (basically the TLS layer would wait for more data to be received before the record is shipped out, causing massive delays and connection resets). Any reason for clearing MSG_EOF here?
if (len == 0 || len == -EIO) {
iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec,
1, alert_kvec.iov_len);
len = sock_recvmsg(cmd->queue->sock, &msg,
MSG_DONTWAIT);
if (len > 0 &&
tls_get_record_type(cmd->queue->sock->sk,
&u.cmsg) == TLS_RECORD_TYPE_ALERT) {
iov_iter_revert(&msg.msg_iter, len);
ret = nvmet_tcp_tls_record_ok(cmd->queue,
&msg, u.buf);
Can't we just skip this part (ie _not_ receiving / reading the control message contents? It's not that we're not doing anything useful here; for any TLS Alert we'll be resetting the connection.
if (ret < 0)
return ret;
}
}
if (len <= 0) return len;}
if (queue->tls_pskid) {
ret = nvmet_tcp_tls_record_ok(cmd->queue,
&cmd->recv_msg, cmd->recv_cbuf);
if (ret < 0)
return ret;
}
cmd->pdu_recv += len; cmd->rbytes_done += len; @@ -1267,6 +1297,7 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) if (unlikely(len < 0)) return len; if (queue->tls_pskid) {
ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf); if (ret < 0) return ret;iov_iter_revert(&msg.msg_iter, len);
@@ -1453,10 +1484,6 @@ static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue, if (!c->r2t_pdu) goto out_free_data;
- if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) {
c->recv_msg.msg_control = c->recv_cbuf;
c->recv_msg.msg_controllen = sizeof(c->recv_cbuf);
- } c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
list_add_tail(&c->entry, &queue->free_list); @@ -1736,6 +1763,7 @@ static int nvmet_tcp_try_peek_pdu(struct nvmet_tcp_queue *queue) return len; }
- iov_iter_revert(&msg.msg_iter, len); ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf); if (ret < 0) return ret;
Huh? Why do we need to do that?
Cheers,
Hannes
@stable@vger.kernel.org I apologize that git send-mail went to stable email. Please ignore this email as this is an internal discussion for now. But will be public soon.
On Tue, Jul 29, 2025 at 1:47 PM Hannes Reinecke hare@suse.de wrote:
On 7/29/25 18:40, Olga Kornievskaia wrote:
Revert kvec msg iterator before trying to process a TLS alert when possible.
In nvmet_tcp_try_recv_data(), it's assumed that no msg control message buffer is set prior to sock_recvmsg(). If no control message structure is setup, kTLS layer will read and process TLS data record types. As soon as it encounters a TLS control message, it would return an error. At that point, we setup a kvec backed control buffer and read in the control message such as a TLS alert. Msg can advance the kvec pointer as a part of the copy process thus we need to revert the iterator before calling into the tls_alert_recv.
Fixes: a1c5dd8355b1 ("nvmet-tcp: control messages for recvmsg()") Cc: Stable@vger.kernel.org Signed-off-by: Olga Kornievskaia okorniev@redhat.com
drivers/nvme/target/tcp.c | 48 +++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 10 deletions(-)
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 688033b88d38..cf3336ddc9a3 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1161,6 +1161,7 @@ static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue) if (unlikely(len < 0)) return len; if (queue->tls_pskid) {
iov_iter_revert(&msg.msg_iter, len); ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf); if (ret < 0) return ret;
@@ -1218,18 +1219,47 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) { struct nvmet_tcp_cmd *cmd = queue->cmd; int len, ret;
union {
struct cmsghdr cmsg;
u8 buf[CMSG_SPACE(sizeof(u8))];
} u;
u8 alert[2];
struct kvec alert_kvec = {
.iov_base = alert,
.iov_len = sizeof(alert),
};
struct msghdr msg = {
.msg_control = &u,
.msg_controllen = sizeof(u),
}; while (msg_data_left(&cmd->recv_msg)) {
/* assumed that cmg->recv_msg's control buffer is not setup
*/
WARN_ON_ONCE(cmd->recv_msg.msg_controllen > 0);
len = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg, cmd->recv_msg.msg_flags);
if (cmd->recv_msg.msg_flags & MSG_CTRUNC) {
Hmm. Looks as if we were getting MSG_CTRUNC even if no buffer is passed. OK.
Correct. When no control buffer is supplied, put_cmsg() would always set MSG_CTRUNC. What's important is that in combination with -EIO return that signals it's a TLS control message. Just looking at CTRUNC message would get us into trouble.
cmd->recv_msg.msg_flags &= ~(MSG_CTRUNC | MSG_EOF);
Not sure with this. We had _terrible_ issues with MSG_EOF not set correctly (basically the TLS layer would wait for more data to be received before the record is shipped out, causing massive delays and connection resets). Any reason for clearing MSG_EOF here?
Oh my, thank you for catching this. It should have been EOR and not EOF. As to why clearing MSG_EOR was based on a comment in NFS code that TLS always sets EOF for every data record (and looking at the TLS tls_sw_recvmsg).
if (len == 0 || len == -EIO) {
iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec,
1, alert_kvec.iov_len);
len = sock_recvmsg(cmd->queue->sock, &msg,
MSG_DONTWAIT);
if (len > 0 &&
tls_get_record_type(cmd->queue->sock->sk,
&u.cmsg) == TLS_RECORD_TYPE_ALERT) {
iov_iter_revert(&msg.msg_iter, len);
ret = nvmet_tcp_tls_record_ok(cmd->queue,
&msg, u.buf);
Can't we just skip this part (ie _not_ receiving / reading the control message contents? It's not that we're not doing anything useful here; for any TLS Alert we'll be resetting the connection.
Yes we certainly can do something like what you had previously if (len == 0 || len == -EIO) { pr_err("queue %d: unhandled control message\n", queue->idx); return -EAGAIN; }
But not that in that case TLS alert bytes would still be unconsumed staying in the socket. If what follows is a prompt closing of the connection, then who cares. What has been worrying me (even in the NFS) code is what happens when TLS alert (though we do consume it) isn't followed by the connection closing.
Now let me know if you want me to re-do this patch based on what you suggested and ignore when len=-EIO. And then I can redo the patch#4 based on that.
if (ret < 0)
return ret;
}
}
} if (len <= 0) return len;
if (queue->tls_pskid) {
ret = nvmet_tcp_tls_record_ok(cmd->queue,
&cmd->recv_msg, cmd->recv_cbuf);
if (ret < 0)
return ret;
} cmd->pdu_recv += len; cmd->rbytes_done += len;
@@ -1267,6 +1297,7 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) if (unlikely(len < 0)) return len; if (queue->tls_pskid) {
iov_iter_revert(&msg.msg_iter, len); ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf); if (ret < 0) return ret;
@@ -1453,10 +1484,6 @@ static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue, if (!c->r2t_pdu) goto out_free_data;
if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) {
c->recv_msg.msg_control = c->recv_cbuf;
c->recv_msg.msg_controllen = sizeof(c->recv_cbuf);
} c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; list_add_tail(&c->entry, &queue->free_list);
@@ -1736,6 +1763,7 @@ static int nvmet_tcp_try_peek_pdu(struct nvmet_tcp_queue *queue) return len; }
iov_iter_revert(&msg.msg_iter, len); ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf); if (ret < 0) return ret;
Huh? Why do we need to do that?
Why do we need an iov_iter_revert? As it's also done in the other 3 chucks.
kernel_recvmsg which ultimately calls sock_recvmsg will ultimately call into copy_to_iter() which eventually will a function to advance a kvec so when tls_alert_recv() tries to look into kvec inside the msg iterator is pointing into invalid memory.
sock_recvmsg -> inet_recvmsg ->tls_sw_recvmsg -> skb_copy_datagram_iter-> simple_copy_to_iter-> copy_to_iter ->_copy_to_iter ->iterate_and_advance ->iterate_and_advance-> ->iterate_and_advance2 ->iterate_kvec
A typical usage of sock_recvmsg is to initialize the iterator's kvec (iov_base) with some buffer but afterwards processing of that payload is done by looking thru the pointer used in the initialization. You don't look into the iterator's kvec.
Cheers,
Hannes
Dr. Hannes Reinecke Kernel Storage Architect hare@suse.de +49 911 74053 688 SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
linux-stable-mirror@lists.linaro.org