When BPF_TRAMP_F_CALL_ORIG is enabled, the address of a bpf_tramp_image
struct on the stack is passed during the size calculation pass and
an address on the heap is passed during code generation. This may
cause a heap buffer overflow if the heap address is tagged because
emit_a64_mov_i64() will emit longer code than it did during the size
calculation pass. The same problem could occur without tag-based
KASAN if one of the 16-bit words of the stack address happened to
be all-ones during the size calculation pass. Fix the problem by
assuming the worst case (4 instructions) when calculating the size
of the bpf_tramp_image address emission.
Fixes: 19d3c179a377 ("bpf, arm64: Fix trampoline for BPF_TRAMP_F_CALL_ORIG")
Signed-off-by: Peter Collingbourne <pcc(a)google.com>
Link: https://linux-review.googlesource.com/id/I1496f2bc24fba7a1d492e16e2b94cf437…
Cc: stable(a)vger.kernel.org
---
arch/arm64/net/bpf_jit_comp.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 8bbd0b20136a8..5db82bfc9dc11 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -2220,7 +2220,11 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
emit(A64_STR64I(A64_R(20), A64_SP, regs_off + 8), ctx);
if (flags & BPF_TRAMP_F_CALL_ORIG) {
- emit_a64_mov_i64(A64_R(0), (const u64)im, ctx);
+ /* for the first pass, assume the worst case */
+ if (!ctx->image)
+ ctx->idx += 4;
+ else
+ emit_a64_mov_i64(A64_R(0), (const u64)im, ctx);
emit_call((const u64)__bpf_tramp_enter, ctx);
}
@@ -2264,7 +2268,11 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im,
if (flags & BPF_TRAMP_F_CALL_ORIG) {
im->ip_epilogue = ctx->ro_image + ctx->idx;
- emit_a64_mov_i64(A64_R(0), (const u64)im, ctx);
+ /* for the first pass, assume the worst case */
+ if (!ctx->image)
+ ctx->idx += 4;
+ else
+ emit_a64_mov_i64(A64_R(0), (const u64)im, ctx);
emit_call((const u64)__bpf_tramp_exit, ctx);
}
--
2.47.0.rc1.288.g06298d1525-goog
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 30c9ae5ece8ecd69d36e6912c2c0896418f2468c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024102146-theme-encircle-9ead@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 30c9ae5ece8ecd69d36e6912c2c0896418f2468c Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Date: Wed, 16 Oct 2024 17:00:00 +0300
Subject: [PATCH] xhci: dbc: honor usb transfer size boundaries.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Treat each completed full size write to /dev/ttyDBC0 as a separate usb
transfer. Make sure the size of the TRBs matches the size of the tty
write by first queuing as many max packet size TRBs as possible up to
the last TRB which will be cut short to match the size of the tty write.
This solves an issue where userspace writes several transfers back to
back via /dev/ttyDBC0 into a kfifo before dbgtty can find available
request to turn that kfifo data into TRBs on the transfer ring.
The boundary between transfer was lost as xhci-dbgtty then turned
everyting in the kfifo into as many 'max packet size' TRBs as possible.
DbC would then send more data to the host than intended for that
transfer, causing host to issue a babble error.
Refuse to write more data to kfifo until previous tty write data is
turned into properly sized TRBs with data size boundaries matching tty
write size
Tested-by: Uday M Bhat <uday.m.bhat(a)intel.com>
Tested-by: Łukasz Bartosik <ukaszb(a)chromium.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Link: https://lore.kernel.org/r/20241016140000.783905-5-mathias.nyman@linux.intel…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/host/xhci-dbgcap.h b/drivers/usb/host/xhci-dbgcap.h
index 8ec813b6e9fd..9dc8f4d8077c 100644
--- a/drivers/usb/host/xhci-dbgcap.h
+++ b/drivers/usb/host/xhci-dbgcap.h
@@ -110,6 +110,7 @@ struct dbc_port {
struct tasklet_struct push;
struct list_head write_pool;
+ unsigned int tx_boundary;
bool registered;
};
diff --git a/drivers/usb/host/xhci-dbgtty.c b/drivers/usb/host/xhci-dbgtty.c
index b8e78867e25a..d719c16ea30b 100644
--- a/drivers/usb/host/xhci-dbgtty.c
+++ b/drivers/usb/host/xhci-dbgtty.c
@@ -24,6 +24,29 @@ static inline struct dbc_port *dbc_to_port(struct xhci_dbc *dbc)
return dbc->priv;
}
+static unsigned int
+dbc_kfifo_to_req(struct dbc_port *port, char *packet)
+{
+ unsigned int len;
+
+ len = kfifo_len(&port->port.xmit_fifo);
+
+ if (len == 0)
+ return 0;
+
+ len = min(len, DBC_MAX_PACKET);
+
+ if (port->tx_boundary)
+ len = min(port->tx_boundary, len);
+
+ len = kfifo_out(&port->port.xmit_fifo, packet, len);
+
+ if (port->tx_boundary)
+ port->tx_boundary -= len;
+
+ return len;
+}
+
static int dbc_start_tx(struct dbc_port *port)
__releases(&port->port_lock)
__acquires(&port->port_lock)
@@ -36,7 +59,7 @@ static int dbc_start_tx(struct dbc_port *port)
while (!list_empty(pool)) {
req = list_entry(pool->next, struct dbc_request, list_pool);
- len = kfifo_out(&port->port.xmit_fifo, req->buf, DBC_MAX_PACKET);
+ len = dbc_kfifo_to_req(port, req->buf);
if (len == 0)
break;
do_tty_wake = true;
@@ -200,14 +223,32 @@ static ssize_t dbc_tty_write(struct tty_struct *tty, const u8 *buf,
{
struct dbc_port *port = tty->driver_data;
unsigned long flags;
+ unsigned int written = 0;
spin_lock_irqsave(&port->port_lock, flags);
- if (count)
- count = kfifo_in(&port->port.xmit_fifo, buf, count);
- dbc_start_tx(port);
+
+ /*
+ * Treat tty write as one usb transfer. Make sure the writes are turned
+ * into TRB request having the same size boundaries as the tty writes.
+ * Don't add data to kfifo before previous write is turned into TRBs
+ */
+ if (port->tx_boundary) {
+ spin_unlock_irqrestore(&port->port_lock, flags);
+ return 0;
+ }
+
+ if (count) {
+ written = kfifo_in(&port->port.xmit_fifo, buf, count);
+
+ if (written == count)
+ port->tx_boundary = kfifo_len(&port->port.xmit_fifo);
+
+ dbc_start_tx(port);
+ }
+
spin_unlock_irqrestore(&port->port_lock, flags);
- return count;
+ return written;
}
static int dbc_tty_put_char(struct tty_struct *tty, u8 ch)
@@ -241,6 +282,10 @@ static unsigned int dbc_tty_write_room(struct tty_struct *tty)
spin_lock_irqsave(&port->port_lock, flags);
room = kfifo_avail(&port->port.xmit_fifo);
+
+ if (port->tx_boundary)
+ room = 0;
+
spin_unlock_irqrestore(&port->port_lock, flags);
return room;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 30c9ae5ece8ecd69d36e6912c2c0896418f2468c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024102144-cinch-foster-09d5@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 30c9ae5ece8ecd69d36e6912c2c0896418f2468c Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Date: Wed, 16 Oct 2024 17:00:00 +0300
Subject: [PATCH] xhci: dbc: honor usb transfer size boundaries.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Treat each completed full size write to /dev/ttyDBC0 as a separate usb
transfer. Make sure the size of the TRBs matches the size of the tty
write by first queuing as many max packet size TRBs as possible up to
the last TRB which will be cut short to match the size of the tty write.
This solves an issue where userspace writes several transfers back to
back via /dev/ttyDBC0 into a kfifo before dbgtty can find available
request to turn that kfifo data into TRBs on the transfer ring.
The boundary between transfer was lost as xhci-dbgtty then turned
everyting in the kfifo into as many 'max packet size' TRBs as possible.
DbC would then send more data to the host than intended for that
transfer, causing host to issue a babble error.
Refuse to write more data to kfifo until previous tty write data is
turned into properly sized TRBs with data size boundaries matching tty
write size
Tested-by: Uday M Bhat <uday.m.bhat(a)intel.com>
Tested-by: Łukasz Bartosik <ukaszb(a)chromium.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Link: https://lore.kernel.org/r/20241016140000.783905-5-mathias.nyman@linux.intel…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/host/xhci-dbgcap.h b/drivers/usb/host/xhci-dbgcap.h
index 8ec813b6e9fd..9dc8f4d8077c 100644
--- a/drivers/usb/host/xhci-dbgcap.h
+++ b/drivers/usb/host/xhci-dbgcap.h
@@ -110,6 +110,7 @@ struct dbc_port {
struct tasklet_struct push;
struct list_head write_pool;
+ unsigned int tx_boundary;
bool registered;
};
diff --git a/drivers/usb/host/xhci-dbgtty.c b/drivers/usb/host/xhci-dbgtty.c
index b8e78867e25a..d719c16ea30b 100644
--- a/drivers/usb/host/xhci-dbgtty.c
+++ b/drivers/usb/host/xhci-dbgtty.c
@@ -24,6 +24,29 @@ static inline struct dbc_port *dbc_to_port(struct xhci_dbc *dbc)
return dbc->priv;
}
+static unsigned int
+dbc_kfifo_to_req(struct dbc_port *port, char *packet)
+{
+ unsigned int len;
+
+ len = kfifo_len(&port->port.xmit_fifo);
+
+ if (len == 0)
+ return 0;
+
+ len = min(len, DBC_MAX_PACKET);
+
+ if (port->tx_boundary)
+ len = min(port->tx_boundary, len);
+
+ len = kfifo_out(&port->port.xmit_fifo, packet, len);
+
+ if (port->tx_boundary)
+ port->tx_boundary -= len;
+
+ return len;
+}
+
static int dbc_start_tx(struct dbc_port *port)
__releases(&port->port_lock)
__acquires(&port->port_lock)
@@ -36,7 +59,7 @@ static int dbc_start_tx(struct dbc_port *port)
while (!list_empty(pool)) {
req = list_entry(pool->next, struct dbc_request, list_pool);
- len = kfifo_out(&port->port.xmit_fifo, req->buf, DBC_MAX_PACKET);
+ len = dbc_kfifo_to_req(port, req->buf);
if (len == 0)
break;
do_tty_wake = true;
@@ -200,14 +223,32 @@ static ssize_t dbc_tty_write(struct tty_struct *tty, const u8 *buf,
{
struct dbc_port *port = tty->driver_data;
unsigned long flags;
+ unsigned int written = 0;
spin_lock_irqsave(&port->port_lock, flags);
- if (count)
- count = kfifo_in(&port->port.xmit_fifo, buf, count);
- dbc_start_tx(port);
+
+ /*
+ * Treat tty write as one usb transfer. Make sure the writes are turned
+ * into TRB request having the same size boundaries as the tty writes.
+ * Don't add data to kfifo before previous write is turned into TRBs
+ */
+ if (port->tx_boundary) {
+ spin_unlock_irqrestore(&port->port_lock, flags);
+ return 0;
+ }
+
+ if (count) {
+ written = kfifo_in(&port->port.xmit_fifo, buf, count);
+
+ if (written == count)
+ port->tx_boundary = kfifo_len(&port->port.xmit_fifo);
+
+ dbc_start_tx(port);
+ }
+
spin_unlock_irqrestore(&port->port_lock, flags);
- return count;
+ return written;
}
static int dbc_tty_put_char(struct tty_struct *tty, u8 ch)
@@ -241,6 +282,10 @@ static unsigned int dbc_tty_write_room(struct tty_struct *tty)
spin_lock_irqsave(&port->port_lock, flags);
room = kfifo_avail(&port->port.xmit_fifo);
+
+ if (port->tx_boundary)
+ room = 0;
+
spin_unlock_irqrestore(&port->port_lock, flags);
return room;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 30c9ae5ece8ecd69d36e6912c2c0896418f2468c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024102143-chevy-extras-add3@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 30c9ae5ece8ecd69d36e6912c2c0896418f2468c Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Date: Wed, 16 Oct 2024 17:00:00 +0300
Subject: [PATCH] xhci: dbc: honor usb transfer size boundaries.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Treat each completed full size write to /dev/ttyDBC0 as a separate usb
transfer. Make sure the size of the TRBs matches the size of the tty
write by first queuing as many max packet size TRBs as possible up to
the last TRB which will be cut short to match the size of the tty write.
This solves an issue where userspace writes several transfers back to
back via /dev/ttyDBC0 into a kfifo before dbgtty can find available
request to turn that kfifo data into TRBs on the transfer ring.
The boundary between transfer was lost as xhci-dbgtty then turned
everyting in the kfifo into as many 'max packet size' TRBs as possible.
DbC would then send more data to the host than intended for that
transfer, causing host to issue a babble error.
Refuse to write more data to kfifo until previous tty write data is
turned into properly sized TRBs with data size boundaries matching tty
write size
Tested-by: Uday M Bhat <uday.m.bhat(a)intel.com>
Tested-by: Łukasz Bartosik <ukaszb(a)chromium.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Link: https://lore.kernel.org/r/20241016140000.783905-5-mathias.nyman@linux.intel…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/host/xhci-dbgcap.h b/drivers/usb/host/xhci-dbgcap.h
index 8ec813b6e9fd..9dc8f4d8077c 100644
--- a/drivers/usb/host/xhci-dbgcap.h
+++ b/drivers/usb/host/xhci-dbgcap.h
@@ -110,6 +110,7 @@ struct dbc_port {
struct tasklet_struct push;
struct list_head write_pool;
+ unsigned int tx_boundary;
bool registered;
};
diff --git a/drivers/usb/host/xhci-dbgtty.c b/drivers/usb/host/xhci-dbgtty.c
index b8e78867e25a..d719c16ea30b 100644
--- a/drivers/usb/host/xhci-dbgtty.c
+++ b/drivers/usb/host/xhci-dbgtty.c
@@ -24,6 +24,29 @@ static inline struct dbc_port *dbc_to_port(struct xhci_dbc *dbc)
return dbc->priv;
}
+static unsigned int
+dbc_kfifo_to_req(struct dbc_port *port, char *packet)
+{
+ unsigned int len;
+
+ len = kfifo_len(&port->port.xmit_fifo);
+
+ if (len == 0)
+ return 0;
+
+ len = min(len, DBC_MAX_PACKET);
+
+ if (port->tx_boundary)
+ len = min(port->tx_boundary, len);
+
+ len = kfifo_out(&port->port.xmit_fifo, packet, len);
+
+ if (port->tx_boundary)
+ port->tx_boundary -= len;
+
+ return len;
+}
+
static int dbc_start_tx(struct dbc_port *port)
__releases(&port->port_lock)
__acquires(&port->port_lock)
@@ -36,7 +59,7 @@ static int dbc_start_tx(struct dbc_port *port)
while (!list_empty(pool)) {
req = list_entry(pool->next, struct dbc_request, list_pool);
- len = kfifo_out(&port->port.xmit_fifo, req->buf, DBC_MAX_PACKET);
+ len = dbc_kfifo_to_req(port, req->buf);
if (len == 0)
break;
do_tty_wake = true;
@@ -200,14 +223,32 @@ static ssize_t dbc_tty_write(struct tty_struct *tty, const u8 *buf,
{
struct dbc_port *port = tty->driver_data;
unsigned long flags;
+ unsigned int written = 0;
spin_lock_irqsave(&port->port_lock, flags);
- if (count)
- count = kfifo_in(&port->port.xmit_fifo, buf, count);
- dbc_start_tx(port);
+
+ /*
+ * Treat tty write as one usb transfer. Make sure the writes are turned
+ * into TRB request having the same size boundaries as the tty writes.
+ * Don't add data to kfifo before previous write is turned into TRBs
+ */
+ if (port->tx_boundary) {
+ spin_unlock_irqrestore(&port->port_lock, flags);
+ return 0;
+ }
+
+ if (count) {
+ written = kfifo_in(&port->port.xmit_fifo, buf, count);
+
+ if (written == count)
+ port->tx_boundary = kfifo_len(&port->port.xmit_fifo);
+
+ dbc_start_tx(port);
+ }
+
spin_unlock_irqrestore(&port->port_lock, flags);
- return count;
+ return written;
}
static int dbc_tty_put_char(struct tty_struct *tty, u8 ch)
@@ -241,6 +282,10 @@ static unsigned int dbc_tty_write_room(struct tty_struct *tty)
spin_lock_irqsave(&port->port_lock, flags);
room = kfifo_avail(&port->port.xmit_fifo);
+
+ if (port->tx_boundary)
+ room = 0;
+
spin_unlock_irqrestore(&port->port_lock, flags);
return room;
The patch below does not apply to the 6.11-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.11.y
git checkout FETCH_HEAD
git cherry-pick -x 30c9ae5ece8ecd69d36e6912c2c0896418f2468c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024102142-eskimo-lumber-a654@gregkh' --subject-prefix 'PATCH 6.11.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 30c9ae5ece8ecd69d36e6912c2c0896418f2468c Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Date: Wed, 16 Oct 2024 17:00:00 +0300
Subject: [PATCH] xhci: dbc: honor usb transfer size boundaries.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Treat each completed full size write to /dev/ttyDBC0 as a separate usb
transfer. Make sure the size of the TRBs matches the size of the tty
write by first queuing as many max packet size TRBs as possible up to
the last TRB which will be cut short to match the size of the tty write.
This solves an issue where userspace writes several transfers back to
back via /dev/ttyDBC0 into a kfifo before dbgtty can find available
request to turn that kfifo data into TRBs on the transfer ring.
The boundary between transfer was lost as xhci-dbgtty then turned
everyting in the kfifo into as many 'max packet size' TRBs as possible.
DbC would then send more data to the host than intended for that
transfer, causing host to issue a babble error.
Refuse to write more data to kfifo until previous tty write data is
turned into properly sized TRBs with data size boundaries matching tty
write size
Tested-by: Uday M Bhat <uday.m.bhat(a)intel.com>
Tested-by: Łukasz Bartosik <ukaszb(a)chromium.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Link: https://lore.kernel.org/r/20241016140000.783905-5-mathias.nyman@linux.intel…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/host/xhci-dbgcap.h b/drivers/usb/host/xhci-dbgcap.h
index 8ec813b6e9fd..9dc8f4d8077c 100644
--- a/drivers/usb/host/xhci-dbgcap.h
+++ b/drivers/usb/host/xhci-dbgcap.h
@@ -110,6 +110,7 @@ struct dbc_port {
struct tasklet_struct push;
struct list_head write_pool;
+ unsigned int tx_boundary;
bool registered;
};
diff --git a/drivers/usb/host/xhci-dbgtty.c b/drivers/usb/host/xhci-dbgtty.c
index b8e78867e25a..d719c16ea30b 100644
--- a/drivers/usb/host/xhci-dbgtty.c
+++ b/drivers/usb/host/xhci-dbgtty.c
@@ -24,6 +24,29 @@ static inline struct dbc_port *dbc_to_port(struct xhci_dbc *dbc)
return dbc->priv;
}
+static unsigned int
+dbc_kfifo_to_req(struct dbc_port *port, char *packet)
+{
+ unsigned int len;
+
+ len = kfifo_len(&port->port.xmit_fifo);
+
+ if (len == 0)
+ return 0;
+
+ len = min(len, DBC_MAX_PACKET);
+
+ if (port->tx_boundary)
+ len = min(port->tx_boundary, len);
+
+ len = kfifo_out(&port->port.xmit_fifo, packet, len);
+
+ if (port->tx_boundary)
+ port->tx_boundary -= len;
+
+ return len;
+}
+
static int dbc_start_tx(struct dbc_port *port)
__releases(&port->port_lock)
__acquires(&port->port_lock)
@@ -36,7 +59,7 @@ static int dbc_start_tx(struct dbc_port *port)
while (!list_empty(pool)) {
req = list_entry(pool->next, struct dbc_request, list_pool);
- len = kfifo_out(&port->port.xmit_fifo, req->buf, DBC_MAX_PACKET);
+ len = dbc_kfifo_to_req(port, req->buf);
if (len == 0)
break;
do_tty_wake = true;
@@ -200,14 +223,32 @@ static ssize_t dbc_tty_write(struct tty_struct *tty, const u8 *buf,
{
struct dbc_port *port = tty->driver_data;
unsigned long flags;
+ unsigned int written = 0;
spin_lock_irqsave(&port->port_lock, flags);
- if (count)
- count = kfifo_in(&port->port.xmit_fifo, buf, count);
- dbc_start_tx(port);
+
+ /*
+ * Treat tty write as one usb transfer. Make sure the writes are turned
+ * into TRB request having the same size boundaries as the tty writes.
+ * Don't add data to kfifo before previous write is turned into TRBs
+ */
+ if (port->tx_boundary) {
+ spin_unlock_irqrestore(&port->port_lock, flags);
+ return 0;
+ }
+
+ if (count) {
+ written = kfifo_in(&port->port.xmit_fifo, buf, count);
+
+ if (written == count)
+ port->tx_boundary = kfifo_len(&port->port.xmit_fifo);
+
+ dbc_start_tx(port);
+ }
+
spin_unlock_irqrestore(&port->port_lock, flags);
- return count;
+ return written;
}
static int dbc_tty_put_char(struct tty_struct *tty, u8 ch)
@@ -241,6 +282,10 @@ static unsigned int dbc_tty_write_room(struct tty_struct *tty)
spin_lock_irqsave(&port->port_lock, flags);
room = kfifo_avail(&port->port.xmit_fifo);
+
+ if (port->tx_boundary)
+ room = 0;
+
spin_unlock_irqrestore(&port->port_lock, flags);
return room;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x fe49df60cdb7c2975aa743dc295f8786e4b7db10
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024102120-valid-uncured-dcca@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From fe49df60cdb7c2975aa743dc295f8786e4b7db10 Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Date: Wed, 16 Oct 2024 16:59:58 +0300
Subject: [PATCH] xhci: Mitigate failed set dequeue pointer commands
Avoid xHC host from processing a cancelled URB by always turning
cancelled URB TDs into no-op TRBs before queuing a 'Set TR Deq' command.
If the command fails then xHC will start processing the cancelled TD
instead of skipping it once endpoint is restarted, causing issues like
Babble error.
This is not a complete solution as a failed 'Set TR Deq' command does not
guarantee xHC TRB caches are cleared.
Fixes: 4db356924a50 ("xhci: turn cancelled td cleanup to its own function")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Link: https://lore.kernel.org/r/20241016140000.783905-3-mathias.nyman@linux.intel…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 4d664ba53fe9..7dedf31bbddd 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -1023,7 +1023,7 @@ static int xhci_invalidate_cancelled_tds(struct xhci_virt_ep *ep)
td_to_noop(xhci, ring, cached_td, false);
cached_td->cancel_status = TD_CLEARED;
}
-
+ td_to_noop(xhci, ring, td, false);
td->cancel_status = TD_CLEARING_CACHE;
cached_td = td;
break;
Hi Greg! Please consider picking up the following two bluetooth fixes
for the next round of stable updates, they fix problems quite a few
users hit in various stable series due to backports:
4084286151fc91 ("Bluetooth: btusb: Fix not being able to reconnect after
suspend") [v6.12-rc4] for 6.11.y
and
2c1dda2acc4192 ("Bluetooth: btusb: Fix regression with fake CSR
controllers 0a12:0001") [v6.12-rc4] for 5.10.y and later
For details see also:
https://lore.kernel.org/all/CABBYNZL0_j4EDWzDS=kXc1Vy0D6ToU+oYnP_uBWTKoXbEa…
tia!
Ciao, Thorsten
From: Eric Biggers <ebiggers(a)google.com>
Fix the kconfig option for the tas2781 HDA driver to select CRC32 rather
than CRC32_SARWATE. CRC32_SARWATE is an option from the kconfig
'choice' that selects the specific CRC32 implementation. Selecting a
'choice' option seems to have no effect, but even if it did work, it
would be incorrect for a random driver to override the user's choice.
CRC32 is the correct option to select for crc32() to be available.
Fixes: 5be27f1e3ec9 ("ALSA: hda/tas2781: Add tas2781 HDA driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
---
sound/pci/hda/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sound/pci/hda/Kconfig b/sound/pci/hda/Kconfig
index bb15a0248250c..68f1eee9e5c93 100644
--- a/sound/pci/hda/Kconfig
+++ b/sound/pci/hda/Kconfig
@@ -196,11 +196,11 @@ config SND_HDA_SCODEC_TAS2781_I2C
depends on ACPI
depends on EFI
depends on SND_SOC
select SND_SOC_TAS2781_COMLIB
select SND_SOC_TAS2781_FMWLIB
- select CRC32_SARWATE
+ select CRC32
help
Say Y or M here to include TAS2781 I2C HD-audio side codec support
in snd-hda-intel driver, such as ALC287.
comment "Set to Y if you want auto-loading the side codec driver"
base-commit: 715ca9dd687f89ddaac8ec8ccb3b5e5a30311a99
--
2.47.0
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x a985576af824426e33100554a5958a6beda60a1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024102110-tableful-unnatural-5dab@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a985576af824426e33100554a5958a6beda60a13 Mon Sep 17 00:00:00 2001
From: Javier Carrasco <javier.carrasco.cruz(a)gmail.com>
Date: Thu, 3 Oct 2024 23:04:52 +0200
Subject: [PATCH] iio: adc: ti-lmp92064: add missing select
IIO_(TRIGGERED_)BUFFER in Kconfig
This driver makes use of triggered buffers, but does not select the
required modules.
Add the missing 'select IIO_BUFFER' and 'select IIO_TRIGGERED_BUFFER'.
Fixes: 6c7bc1d27bb2 ("iio: adc: ti-lmp92064: add buffering support")
Signed-off-by: Javier Carrasco <javier.carrasco.cruz(a)gmail.com>
Link: https://patch.msgid.link/20241003-iio-select-v1-6-67c0385197cd@gmail.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig
index 68640fa26f4e..c1197ee3dc68 100644
--- a/drivers/iio/adc/Kconfig
+++ b/drivers/iio/adc/Kconfig
@@ -1530,6 +1530,8 @@ config TI_LMP92064
tristate "Texas Instruments LMP92064 ADC driver"
depends on SPI
select REGMAP_SPI
+ select IIO_BUFFER
+ select IIO_TRIGGERED_BUFFER
help
Say yes here to build support for the LMP92064 Precision Current and Voltage
sensor.
Hi,
After upgrading to 6.6.57 I noticed that my IPv6 firewall config failed to load.
Quick investigation flagged NFLOG to be the issue:
# ip6tables -I INPUT -j NFLOG
Warning: Extension NFLOG revision 0 not supported, missing kernel module?
ip6tables: No chain/target/match by that name.
The regression is caused by the following commit:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git/…
More precisely, the bug is in the change below:
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "NFLOG",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .checkentry = nflog_tg_check,
+ .destroy = nflog_tg_destroy,
+ .target = nflog_tg,
+ .targetsize = sizeof(struct xt_nflog_info),
+ .me = THIS_MODULE,
+ },
+#endif
Replacing NFPROTO_IPV4 with NFPROTO_IPV6 fixed the issue.
Looking at the commit, it seems that at least one more target (MARK) may be also impacted:
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ {
+ .name = "MARK",
+ .revision = 2,
+ .family = NFPROTO_IPV4,
+ .target = mark_tg,
+ .targetsize = sizeof(struct xt_mark_tginfo2),
+ .me = THIS_MODULE,
+ },
+#endif
The same errors seem to be present in the main tree:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
I also suspect other -stable trees may be impacted by the same issue.
Best regards,
Krzysztof Olędzki
From: Jean-Baptiste Maneyrol <jean-baptiste.maneyrol(a)tdk.com>
When multiple ODR switch happens during FIFO off, the change could
not be taken into account if you get back to previous FIFO on value.
For example, if you run sensor buffer at 50Hz, stop, change to
200Hz, then back to 50Hz and restart buffer, data will be timestamped
at 200Hz. This due to testing against mult and not new_mult.
To prevent this, let's just run apply_odr automatically when FIFO is
off. It will also simplify driver code.
Update inv_mpu6050 and inv_icm42600 to delete now useless apply_odr.
Fixes: 95444b9eeb8c ("iio: invensense: fix odr switching to same value")
Cc: stable(a)vger.kernel.org
Signed-off-by: Jean-Baptiste Maneyrol <jean-baptiste.maneyrol(a)tdk.com>
---
drivers/iio/common/inv_sensors/inv_sensors_timestamp.c | 4 ++++
drivers/iio/imu/inv_icm42600/inv_icm42600_accel.c | 1 -
drivers/iio/imu/inv_icm42600/inv_icm42600_gyro.c | 1 -
drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c | 1 -
4 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/iio/common/inv_sensors/inv_sensors_timestamp.c b/drivers/iio/common/inv_sensors/inv_sensors_timestamp.c
index f44458c380d92823ce2e7e5f78ca877ea4c06118..37d0bdaa8d824f79dcd2f341be7501d249926951 100644
--- a/drivers/iio/common/inv_sensors/inv_sensors_timestamp.c
+++ b/drivers/iio/common/inv_sensors/inv_sensors_timestamp.c
@@ -70,6 +70,10 @@ int inv_sensors_timestamp_update_odr(struct inv_sensors_timestamp *ts,
if (mult != ts->mult)
ts->new_mult = mult;
+ /* When FIFO is off, directly apply the new ODR */
+ if (!fifo)
+ inv_sensors_timestamp_apply_odr(ts, 0, 0, 0);
+
return 0;
}
EXPORT_SYMBOL_NS_GPL(inv_sensors_timestamp_update_odr, IIO_INV_SENSORS_TIMESTAMP);
diff --git a/drivers/iio/imu/inv_icm42600/inv_icm42600_accel.c b/drivers/iio/imu/inv_icm42600/inv_icm42600_accel.c
index 56ac198142500a2e1fc40b62cdd465cc736d8bf0..d061a64ebbf71859a3bc44644a14137dff0f9efe 100644
--- a/drivers/iio/imu/inv_icm42600/inv_icm42600_accel.c
+++ b/drivers/iio/imu/inv_icm42600/inv_icm42600_accel.c
@@ -229,7 +229,6 @@ static int inv_icm42600_accel_update_scan_mode(struct iio_dev *indio_dev,
}
/* update data FIFO write */
- inv_sensors_timestamp_apply_odr(ts, 0, 0, 0);
ret = inv_icm42600_buffer_set_fifo_en(st, fifo_en | st->fifo.en);
out_unlock:
diff --git a/drivers/iio/imu/inv_icm42600/inv_icm42600_gyro.c b/drivers/iio/imu/inv_icm42600/inv_icm42600_gyro.c
index 938af5b640b00f58d2b8185f752c4755edfb0d25..f1e5a9648c4f5dd34f40136d02c72c90473eff37 100644
--- a/drivers/iio/imu/inv_icm42600/inv_icm42600_gyro.c
+++ b/drivers/iio/imu/inv_icm42600/inv_icm42600_gyro.c
@@ -128,7 +128,6 @@ static int inv_icm42600_gyro_update_scan_mode(struct iio_dev *indio_dev,
}
/* update data FIFO write */
- inv_sensors_timestamp_apply_odr(ts, 0, 0, 0);
ret = inv_icm42600_buffer_set_fifo_en(st, fifo_en | st->fifo.en);
out_unlock:
diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c
index 3bfeabab0ec4f6fa28fbbcd47afe92af5b8a58e2..5b1088cc3704f1ad1288a0d65b2f957b91455d7f 100644
--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c
+++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c
@@ -112,7 +112,6 @@ int inv_mpu6050_prepare_fifo(struct inv_mpu6050_state *st, bool enable)
if (enable) {
/* reset timestamping */
inv_sensors_timestamp_reset(&st->timestamp);
- inv_sensors_timestamp_apply_odr(&st->timestamp, 0, 0, 0);
/* reset FIFO */
d = st->chip_config.user_ctrl | INV_MPU6050_BIT_FIFO_RST;
ret = regmap_write(st->map, st->reg->user_ctrl, d);
---
base-commit: c3e9df514041ec6c46be83801b1891392f4522f7
change-id: 20241017-invn-inv-sensors-timestamp-fix-switch-fifo-off-3f29110e95d0
Best regards,
--
Jean-Baptiste Maneyrol <jean-baptiste.maneyrol(a)tdk.com>
From: Zijun Hu <quic_zijuhu(a)quicinc.com>
For devm_pci_epc_destroy(), its comment says it needs to destroy the EPC
device, but it does not do that actually, so it can not fully undo what
the API devm_pci_epc_create() does, that is wrong, fixed by using
devres_release() instead of devres_destroy() within the API.
Fixes: 5e8cb4033807 ("PCI: endpoint: Add EP core layer to enable EP controller and EP functions")
Cc: stable(a)vger.kernel.org
Signed-off-by: Zijun Hu <quic_zijuhu(a)quicinc.com>
---
drivers/pci/endpoint/pci-epc-core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index 17f007109255..71b6d100056e 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -857,7 +857,7 @@ void devm_pci_epc_destroy(struct device *dev, struct pci_epc *epc)
{
int r;
- r = devres_destroy(dev, devm_pci_epc_release, devm_pci_epc_match,
+ r = devres_release(dev, devm_pci_epc_release, devm_pci_epc_match,
epc);
dev_WARN_ONCE(dev, r, "couldn't find PCI EPC resource\n");
}
---
base-commit: 715ca9dd687f89ddaac8ec8ccb3b5e5a30311a99
change-id: 20241020-pci-epc-core_fix-a92512fa9d19
Best regards,
--
Zijun Hu <quic_zijuhu(a)quicinc.com>
From: Zijun Hu <quic_zijuhu(a)quicinc.com>
For devm_usb_put_phy(), its comment says it needs to invoke usb_put_phy()
to release the phy, but it does not do that actually, so it can not fully
undo what the API devm_usb_get_phy() does, that is wrong, fixed by using
devres_release() instead of devres_destroy() within the API.
Fixes: cedf8602373a ("usb: phy: move bulk of otg/otg.c to phy/phy.c")
Cc: stable(a)vger.kernel.org
Signed-off-by: Zijun Hu <quic_zijuhu(a)quicinc.com>
---
The API is directly used by drivers/usb/musb/sunxi.c, sorry for that
i can't evaluate relevant impact since i know nothing about sunxi.
---
drivers/usb/phy/phy.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/usb/phy/phy.c b/drivers/usb/phy/phy.c
index 06e0fb23566c..06f789097989 100644
--- a/drivers/usb/phy/phy.c
+++ b/drivers/usb/phy/phy.c
@@ -628,7 +628,7 @@ void devm_usb_put_phy(struct device *dev, struct usb_phy *phy)
{
int r;
- r = devres_destroy(dev, devm_usb_phy_release, devm_usb_phy_match, phy);
+ r = devres_release(dev, devm_usb_phy_release, devm_usb_phy_match, phy);
dev_WARN_ONCE(dev, r, "couldn't find PHY resource\n");
}
EXPORT_SYMBOL_GPL(devm_usb_put_phy);
---
base-commit: 07b887f8236eb3ed52f1fe83e385e6436dc4b052
change-id: 20241020-usb_phy_fix-9d7c67ef4ab4
Best regards,
--
Zijun Hu <quic_zijuhu(a)quicinc.com>
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 8fa075804cb3b00960dd5c06554308175c834530
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024102014-dorsal-renounce-5242@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8fa075804cb3b00960dd5c06554308175c834530 Mon Sep 17 00:00:00 2001
From: Peter Wang <peter.wang(a)mediatek.com>
Date: Tue, 1 Oct 2024 17:19:17 +0800
Subject: [PATCH] scsi: ufs: core: Requeue aborted request
After the SQ cleanup fix, the CQ will receive a response with the
corresponding tag marked as OCS: ABORTED. To align with the behavior of
Legacy SDB mode, the handling of OCS: ABORTED has been changed to match
that of OCS_INVALID_COMMAND_STATUS (SDB), with both returning a SCSI
result of DID_REQUEUE.
Furthermore, the workaround implemented before the SQ cleanup fix can be
removed.
Fixes: ab248643d3d6 ("scsi: ufs: core: Add error handling for MCQ mode")
Cc: stable(a)vger.kernel.org
Signed-off-by: Peter Wang <peter.wang(a)mediatek.com>
Link: https://lore.kernel.org/r/20241001091917.6917-3-peter.wang@mediatek.com
Reviewed-by: Bart Van Assche <bvanassche(a)acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 6a71ebf953e2..f845166dc0d7 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -5416,10 +5416,12 @@ ufshcd_transfer_rsp_status(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
}
break;
case OCS_ABORTED:
- result |= DID_ABORT << 16;
- break;
case OCS_INVALID_COMMAND_STATUS:
result |= DID_REQUEUE << 16;
+ dev_warn(hba->dev,
+ "OCS %s from controller for tag %d\n",
+ (ocs == OCS_ABORTED ? "aborted" : "invalid"),
+ lrbp->task_tag);
break;
case OCS_INVALID_CMD_TABLE_ATTR:
case OCS_INVALID_PRDT_ATTR:
@@ -6465,26 +6467,12 @@ static bool ufshcd_abort_one(struct request *rq, void *priv)
struct scsi_device *sdev = cmd->device;
struct Scsi_Host *shost = sdev->host;
struct ufs_hba *hba = shost_priv(shost);
- struct ufshcd_lrb *lrbp = &hba->lrb[tag];
- struct ufs_hw_queue *hwq;
- unsigned long flags;
*ret = ufshcd_try_to_abort_task(hba, tag);
dev_err(hba->dev, "Aborting tag %d / CDB %#02x %s\n", tag,
hba->lrb[tag].cmd ? hba->lrb[tag].cmd->cmnd[0] : -1,
*ret ? "failed" : "succeeded");
- /* Release cmd in MCQ mode if abort succeeds */
- if (hba->mcq_enabled && (*ret == 0)) {
- hwq = ufshcd_mcq_req_to_hwq(hba, scsi_cmd_to_rq(lrbp->cmd));
- if (!hwq)
- return 0;
- spin_lock_irqsave(&hwq->cq_lock, flags);
- if (ufshcd_cmd_inflight(lrbp->cmd))
- ufshcd_release_scsi_cmd(hba, lrbp);
- spin_unlock_irqrestore(&hwq->cq_lock, flags);
- }
-
return *ret == 0;
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 19a198b67767d952c8f3d0cf24eb3100522a8223
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024102050-unequal-radiator-f679@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 19a198b67767d952c8f3d0cf24eb3100522a8223 Mon Sep 17 00:00:00 2001
From: Seunghwan Baek <sh8267.baek(a)samsung.com>
Date: Thu, 29 Aug 2024 18:39:13 +0900
Subject: [PATCH] scsi: ufs: core: Set SDEV_OFFLINE when UFS is shut down
There is a history of deadlock if reboot is performed at the beginning
of booting. SDEV_QUIESCE was set for all LU's scsi_devices by UFS
shutdown, and at that time the audio driver was waiting on
blk_mq_submit_bio() holding a mutex_lock while reading the fw binary.
After that, a deadlock issue occurred while audio driver shutdown was
waiting for mutex_unlock of blk_mq_submit_bio(). To solve this, set
SDEV_OFFLINE for all LUs except WLUN, so that any I/O that comes down
after a UFS shutdown will return an error.
[ 31.907781]I[0: swapper/0: 0] 1 130705007 1651079834 11289729804 0 D( 2) 3 ffffff882e208000 * init [device_shutdown]
[ 31.907793]I[0: swapper/0: 0] Mutex: 0xffffff8849a2b8b0: owner[0xffffff882e28cb00 kworker/6:0 :49]
[ 31.907806]I[0: swapper/0: 0] Call trace:
[ 31.907810]I[0: swapper/0: 0] __switch_to+0x174/0x338
[ 31.907819]I[0: swapper/0: 0] __schedule+0x5ec/0x9cc
[ 31.907826]I[0: swapper/0: 0] schedule+0x7c/0xe8
[ 31.907834]I[0: swapper/0: 0] schedule_preempt_disabled+0x24/0x40
[ 31.907842]I[0: swapper/0: 0] __mutex_lock+0x408/0xdac
[ 31.907849]I[0: swapper/0: 0] __mutex_lock_slowpath+0x14/0x24
[ 31.907858]I[0: swapper/0: 0] mutex_lock+0x40/0xec
[ 31.907866]I[0: swapper/0: 0] device_shutdown+0x108/0x280
[ 31.907875]I[0: swapper/0: 0] kernel_restart+0x4c/0x11c
[ 31.907883]I[0: swapper/0: 0] __arm64_sys_reboot+0x15c/0x280
[ 31.907890]I[0: swapper/0: 0] invoke_syscall+0x70/0x158
[ 31.907899]I[0: swapper/0: 0] el0_svc_common+0xb4/0xf4
[ 31.907909]I[0: swapper/0: 0] do_el0_svc+0x2c/0xb0
[ 31.907918]I[0: swapper/0: 0] el0_svc+0x34/0xe0
[ 31.907928]I[0: swapper/0: 0] el0t_64_sync_handler+0x68/0xb4
[ 31.907937]I[0: swapper/0: 0] el0t_64_sync+0x1a0/0x1a4
[ 31.908774]I[0: swapper/0: 0] 49 0 11960702 11236868007 0 D( 2) 6 ffffff882e28cb00 * kworker/6:0 [__bio_queue_enter]
[ 31.908783]I[0: swapper/0: 0] Call trace:
[ 31.908788]I[0: swapper/0: 0] __switch_to+0x174/0x338
[ 31.908796]I[0: swapper/0: 0] __schedule+0x5ec/0x9cc
[ 31.908803]I[0: swapper/0: 0] schedule+0x7c/0xe8
[ 31.908811]I[0: swapper/0: 0] __bio_queue_enter+0xb8/0x178
[ 31.908818]I[0: swapper/0: 0] blk_mq_submit_bio+0x194/0x67c
[ 31.908827]I[0: swapper/0: 0] __submit_bio+0xb8/0x19c
Fixes: b294ff3e3449 ("scsi: ufs: core: Enable power management for wlun")
Cc: stable(a)vger.kernel.org
Signed-off-by: Seunghwan Baek <sh8267.baek(a)samsung.com>
Link: https://lore.kernel.org/r/20240829093913.6282-2-sh8267.baek@samsung.com
Reviewed-by: Bart Van Assche <bvanassche(a)acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index f845166dc0d7..706dc81eb924 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -10197,7 +10197,9 @@ static void ufshcd_wl_shutdown(struct device *dev)
shost_for_each_device(sdev, hba->host) {
if (sdev == hba->ufs_device_wlun)
continue;
- scsi_device_quiesce(sdev);
+ mutex_lock(&sdev->state_mutex);
+ scsi_device_set_state(sdev, SDEV_OFFLINE);
+ mutex_unlock(&sdev->state_mutex);
}
__ufshcd_wl_suspend(hba, UFS_SHUTDOWN_PM);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x e972b08b91ef48488bae9789f03cfedb148667fb
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024102019-roundness-penholder-bb3f@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e972b08b91ef48488bae9789f03cfedb148667fb Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov(a)fb.com>
Date: Tue, 15 Oct 2024 10:59:46 -0700
Subject: [PATCH] blk-rq-qos: fix crash on rq_qos_wait vs. rq_qos_wake_function
race
We're seeing crashes from rq_qos_wake_function that look like this:
BUG: unable to handle page fault for address: ffffafe180a40084
#PF: supervisor write access in kernel mode
#PF: error_code(0x0002) - not-present page
PGD 100000067 P4D 100000067 PUD 10027c067 PMD 10115d067 PTE 0
Oops: Oops: 0002 [#1] PREEMPT SMP PTI
CPU: 17 UID: 0 PID: 0 Comm: swapper/17 Not tainted 6.12.0-rc3-00013-geca631b8fe80 #11
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
RIP: 0010:_raw_spin_lock_irqsave+0x1d/0x40
Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 41 54 9c 41 5c fa 65 ff 05 62 97 30 4c 31 c0 ba 01 00 00 00 <f0> 0f b1 17 75 0a 4c 89 e0 41 5c c3 cc cc cc cc 89 c6 e8 2c 0b 00
RSP: 0018:ffffafe180580ca0 EFLAGS: 00010046
RAX: 0000000000000000 RBX: ffffafe180a3f7a8 RCX: 0000000000000011
RDX: 0000000000000001 RSI: 0000000000000003 RDI: ffffafe180a40084
RBP: 0000000000000000 R08: 00000000001e7240 R09: 0000000000000011
R10: 0000000000000028 R11: 0000000000000888 R12: 0000000000000002
R13: ffffafe180a40084 R14: 0000000000000000 R15: 0000000000000003
FS: 0000000000000000(0000) GS:ffff9aaf1f280000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffafe180a40084 CR3: 000000010e428002 CR4: 0000000000770ef0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
<IRQ>
try_to_wake_up+0x5a/0x6a0
rq_qos_wake_function+0x71/0x80
__wake_up_common+0x75/0xa0
__wake_up+0x36/0x60
scale_up.part.0+0x50/0x110
wb_timer_fn+0x227/0x450
...
So rq_qos_wake_function() calls wake_up_process(data->task), which calls
try_to_wake_up(), which faults in raw_spin_lock_irqsave(&p->pi_lock).
p comes from data->task, and data comes from the waitqueue entry, which
is stored on the waiter's stack in rq_qos_wait(). Analyzing the core
dump with drgn, I found that the waiter had already woken up and moved
on to a completely unrelated code path, clobbering what was previously
data->task. Meanwhile, the waker was passing the clobbered garbage in
data->task to wake_up_process(), leading to the crash.
What's happening is that in between rq_qos_wake_function() deleting the
waitqueue entry and calling wake_up_process(), rq_qos_wait() is finding
that it already got a token and returning. The race looks like this:
rq_qos_wait() rq_qos_wake_function()
==============================================================
prepare_to_wait_exclusive()
data->got_token = true;
list_del_init(&curr->entry);
if (data.got_token)
break;
finish_wait(&rqw->wait, &data.wq);
^- returns immediately because
list_empty_careful(&wq_entry->entry)
is true
... return, go do something else ...
wake_up_process(data->task)
(NO LONGER VALID!)-^
Normally, finish_wait() is supposed to synchronize against the waker.
But, as noted above, it is returning immediately because the waitqueue
entry has already been removed from the waitqueue.
The bug is that rq_qos_wake_function() is accessing the waitqueue entry
AFTER deleting it. Note that autoremove_wake_function() wakes the waiter
and THEN deletes the waitqueue entry, which is the proper order.
Fix it by swapping the order. We also need to use
list_del_init_careful() to match the list_empty_careful() in
finish_wait().
Fixes: 38cfb5a45ee0 ("blk-wbt: improve waking of tasks")
Cc: stable(a)vger.kernel.org
Signed-off-by: Omar Sandoval <osandov(a)fb.com>
Acked-by: Tejun Heo <tj(a)kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn(a)wdc.com>
Link: https://lore.kernel.org/r/d3bee2463a67b1ee597211823bf7ad3721c26e41.17290145…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 2cfb297d9a62..058f92c4f9d5 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -219,8 +219,8 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr,
data->got_token = true;
smp_wmb();
- list_del_init(&curr->entry);
wake_up_process(data->task);
+ list_del_init_careful(&curr->entry);
return 1;
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 2c02f7375e658ae93d57a31a66f91b62754ef8f1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024102001-badly-overvalue-6662@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2c02f7375e658ae93d57a31a66f91b62754ef8f1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt(a)goodmis.org>
Date: Fri, 18 Oct 2024 21:43:00 -0400
Subject: [PATCH] fgraph: Use CPU hotplug mechanism to initialize idle shadow
stacks
The function graph infrastructure allocates a shadow stack for every task
when enabled. This includes the idle tasks. The first time the function
graph is invoked, the shadow stacks are created and never freed until the
task exits. This includes the idle tasks.
Only the idle tasks that were for online CPUs had their shadow stacks
created when function graph tracing started. If function graph tracing is
enabled and a CPU comes online, the idle task representing that CPU will
not have its shadow stack created, and all function graph tracing for that
idle task will be silently dropped.
Instead, use the CPU hotplug mechanism to allocate the idle shadow stacks.
This will include idle tasks for CPUs that come online during tracing.
This issue can be reproduced by:
# cd /sys/kernel/tracing
# echo 0 > /sys/devices/system/cpu/cpu1/online
# echo 0 > set_ftrace_pid
# echo function_graph > current_tracer
# echo 1 > options/funcgraph-proc
# echo 1 > /sys/devices/system/cpu/cpu1
# grep '<idle>' per_cpu/cpu1/trace | head
Before, nothing would show up.
After:
1) <idle>-0 | 0.811 us | __enqueue_entity();
1) <idle>-0 | 5.626 us | } /* enqueue_entity */
1) <idle>-0 | | dl_server_update_idle_time() {
1) <idle>-0 | | dl_scaled_delta_exec() {
1) <idle>-0 | 0.450 us | arch_scale_cpu_capacity();
1) <idle>-0 | 1.242 us | }
1) <idle>-0 | 1.908 us | }
1) <idle>-0 | | dl_server_start() {
1) <idle>-0 | | enqueue_dl_entity() {
1) <idle>-0 | | task_contending() {
Note, if tracing stops and restarts, the old way would then initialize
the onlined CPUs.
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Link: https://lore.kernel.org/20241018214300.6df82178@rorschach
Fixes: 868baf07b1a25 ("ftrace: Fix memory leak with function graph and cpu hotplug")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index d7d4fb403f6f..43f4e3f57438 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1160,19 +1160,13 @@ void fgraph_update_pid_func(void)
static int start_graph_tracing(void)
{
unsigned long **ret_stack_list;
- int ret, cpu;
+ int ret;
ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
if (!ret_stack_list)
return -ENOMEM;
- /* The cpu_boot init_task->ret_stack will never be freed */
- for_each_online_cpu(cpu) {
- if (!idle_task(cpu)->ret_stack)
- ftrace_graph_init_idle_task(idle_task(cpu), cpu);
- }
-
do {
ret = alloc_retstack_tasklist(ret_stack_list);
} while (ret == -EAGAIN);
@@ -1242,14 +1236,34 @@ static void ftrace_graph_disable_direct(bool disable_branch)
fgraph_direct_gops = &fgraph_stub;
}
+/* The cpu_boot init_task->ret_stack will never be freed */
+static int fgraph_cpu_init(unsigned int cpu)
+{
+ if (!idle_task(cpu)->ret_stack)
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+ return 0;
+}
+
int register_ftrace_graph(struct fgraph_ops *gops)
{
+ static bool fgraph_initialized;
int command = 0;
int ret = 0;
int i = -1;
mutex_lock(&ftrace_lock);
+ if (!fgraph_initialized) {
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph_idle_init",
+ fgraph_cpu_init, NULL);
+ if (ret < 0) {
+ pr_warn("fgraph: Error to init cpu hotplug support\n");
+ return ret;
+ }
+ fgraph_initialized = true;
+ ret = 0;
+ }
+
if (!fgraph_array[0]) {
/* The array must always have real data on it */
for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 2c02f7375e658ae93d57a31a66f91b62754ef8f1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024102000-mortician-chant-190e@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2c02f7375e658ae93d57a31a66f91b62754ef8f1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt(a)goodmis.org>
Date: Fri, 18 Oct 2024 21:43:00 -0400
Subject: [PATCH] fgraph: Use CPU hotplug mechanism to initialize idle shadow
stacks
The function graph infrastructure allocates a shadow stack for every task
when enabled. This includes the idle tasks. The first time the function
graph is invoked, the shadow stacks are created and never freed until the
task exits. This includes the idle tasks.
Only the idle tasks that were for online CPUs had their shadow stacks
created when function graph tracing started. If function graph tracing is
enabled and a CPU comes online, the idle task representing that CPU will
not have its shadow stack created, and all function graph tracing for that
idle task will be silently dropped.
Instead, use the CPU hotplug mechanism to allocate the idle shadow stacks.
This will include idle tasks for CPUs that come online during tracing.
This issue can be reproduced by:
# cd /sys/kernel/tracing
# echo 0 > /sys/devices/system/cpu/cpu1/online
# echo 0 > set_ftrace_pid
# echo function_graph > current_tracer
# echo 1 > options/funcgraph-proc
# echo 1 > /sys/devices/system/cpu/cpu1
# grep '<idle>' per_cpu/cpu1/trace | head
Before, nothing would show up.
After:
1) <idle>-0 | 0.811 us | __enqueue_entity();
1) <idle>-0 | 5.626 us | } /* enqueue_entity */
1) <idle>-0 | | dl_server_update_idle_time() {
1) <idle>-0 | | dl_scaled_delta_exec() {
1) <idle>-0 | 0.450 us | arch_scale_cpu_capacity();
1) <idle>-0 | 1.242 us | }
1) <idle>-0 | 1.908 us | }
1) <idle>-0 | | dl_server_start() {
1) <idle>-0 | | enqueue_dl_entity() {
1) <idle>-0 | | task_contending() {
Note, if tracing stops and restarts, the old way would then initialize
the onlined CPUs.
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Link: https://lore.kernel.org/20241018214300.6df82178@rorschach
Fixes: 868baf07b1a25 ("ftrace: Fix memory leak with function graph and cpu hotplug")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index d7d4fb403f6f..43f4e3f57438 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1160,19 +1160,13 @@ void fgraph_update_pid_func(void)
static int start_graph_tracing(void)
{
unsigned long **ret_stack_list;
- int ret, cpu;
+ int ret;
ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
if (!ret_stack_list)
return -ENOMEM;
- /* The cpu_boot init_task->ret_stack will never be freed */
- for_each_online_cpu(cpu) {
- if (!idle_task(cpu)->ret_stack)
- ftrace_graph_init_idle_task(idle_task(cpu), cpu);
- }
-
do {
ret = alloc_retstack_tasklist(ret_stack_list);
} while (ret == -EAGAIN);
@@ -1242,14 +1236,34 @@ static void ftrace_graph_disable_direct(bool disable_branch)
fgraph_direct_gops = &fgraph_stub;
}
+/* The cpu_boot init_task->ret_stack will never be freed */
+static int fgraph_cpu_init(unsigned int cpu)
+{
+ if (!idle_task(cpu)->ret_stack)
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+ return 0;
+}
+
int register_ftrace_graph(struct fgraph_ops *gops)
{
+ static bool fgraph_initialized;
int command = 0;
int ret = 0;
int i = -1;
mutex_lock(&ftrace_lock);
+ if (!fgraph_initialized) {
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph_idle_init",
+ fgraph_cpu_init, NULL);
+ if (ret < 0) {
+ pr_warn("fgraph: Error to init cpu hotplug support\n");
+ return ret;
+ }
+ fgraph_initialized = true;
+ ret = 0;
+ }
+
if (!fgraph_array[0]) {
/* The array must always have real data on it */
for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 2c02f7375e658ae93d57a31a66f91b62754ef8f1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024102058-headphone-embody-6747@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2c02f7375e658ae93d57a31a66f91b62754ef8f1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt(a)goodmis.org>
Date: Fri, 18 Oct 2024 21:43:00 -0400
Subject: [PATCH] fgraph: Use CPU hotplug mechanism to initialize idle shadow
stacks
The function graph infrastructure allocates a shadow stack for every task
when enabled. This includes the idle tasks. The first time the function
graph is invoked, the shadow stacks are created and never freed until the
task exits. This includes the idle tasks.
Only the idle tasks that were for online CPUs had their shadow stacks
created when function graph tracing started. If function graph tracing is
enabled and a CPU comes online, the idle task representing that CPU will
not have its shadow stack created, and all function graph tracing for that
idle task will be silently dropped.
Instead, use the CPU hotplug mechanism to allocate the idle shadow stacks.
This will include idle tasks for CPUs that come online during tracing.
This issue can be reproduced by:
# cd /sys/kernel/tracing
# echo 0 > /sys/devices/system/cpu/cpu1/online
# echo 0 > set_ftrace_pid
# echo function_graph > current_tracer
# echo 1 > options/funcgraph-proc
# echo 1 > /sys/devices/system/cpu/cpu1
# grep '<idle>' per_cpu/cpu1/trace | head
Before, nothing would show up.
After:
1) <idle>-0 | 0.811 us | __enqueue_entity();
1) <idle>-0 | 5.626 us | } /* enqueue_entity */
1) <idle>-0 | | dl_server_update_idle_time() {
1) <idle>-0 | | dl_scaled_delta_exec() {
1) <idle>-0 | 0.450 us | arch_scale_cpu_capacity();
1) <idle>-0 | 1.242 us | }
1) <idle>-0 | 1.908 us | }
1) <idle>-0 | | dl_server_start() {
1) <idle>-0 | | enqueue_dl_entity() {
1) <idle>-0 | | task_contending() {
Note, if tracing stops and restarts, the old way would then initialize
the onlined CPUs.
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Link: https://lore.kernel.org/20241018214300.6df82178@rorschach
Fixes: 868baf07b1a25 ("ftrace: Fix memory leak with function graph and cpu hotplug")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index d7d4fb403f6f..43f4e3f57438 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1160,19 +1160,13 @@ void fgraph_update_pid_func(void)
static int start_graph_tracing(void)
{
unsigned long **ret_stack_list;
- int ret, cpu;
+ int ret;
ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
if (!ret_stack_list)
return -ENOMEM;
- /* The cpu_boot init_task->ret_stack will never be freed */
- for_each_online_cpu(cpu) {
- if (!idle_task(cpu)->ret_stack)
- ftrace_graph_init_idle_task(idle_task(cpu), cpu);
- }
-
do {
ret = alloc_retstack_tasklist(ret_stack_list);
} while (ret == -EAGAIN);
@@ -1242,14 +1236,34 @@ static void ftrace_graph_disable_direct(bool disable_branch)
fgraph_direct_gops = &fgraph_stub;
}
+/* The cpu_boot init_task->ret_stack will never be freed */
+static int fgraph_cpu_init(unsigned int cpu)
+{
+ if (!idle_task(cpu)->ret_stack)
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+ return 0;
+}
+
int register_ftrace_graph(struct fgraph_ops *gops)
{
+ static bool fgraph_initialized;
int command = 0;
int ret = 0;
int i = -1;
mutex_lock(&ftrace_lock);
+ if (!fgraph_initialized) {
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph_idle_init",
+ fgraph_cpu_init, NULL);
+ if (ret < 0) {
+ pr_warn("fgraph: Error to init cpu hotplug support\n");
+ return ret;
+ }
+ fgraph_initialized = true;
+ ret = 0;
+ }
+
if (!fgraph_array[0]) {
/* The array must always have real data on it */
for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 2c02f7375e658ae93d57a31a66f91b62754ef8f1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024102057-skipper-growl-db34@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2c02f7375e658ae93d57a31a66f91b62754ef8f1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt(a)goodmis.org>
Date: Fri, 18 Oct 2024 21:43:00 -0400
Subject: [PATCH] fgraph: Use CPU hotplug mechanism to initialize idle shadow
stacks
The function graph infrastructure allocates a shadow stack for every task
when enabled. This includes the idle tasks. The first time the function
graph is invoked, the shadow stacks are created and never freed until the
task exits. This includes the idle tasks.
Only the idle tasks that were for online CPUs had their shadow stacks
created when function graph tracing started. If function graph tracing is
enabled and a CPU comes online, the idle task representing that CPU will
not have its shadow stack created, and all function graph tracing for that
idle task will be silently dropped.
Instead, use the CPU hotplug mechanism to allocate the idle shadow stacks.
This will include idle tasks for CPUs that come online during tracing.
This issue can be reproduced by:
# cd /sys/kernel/tracing
# echo 0 > /sys/devices/system/cpu/cpu1/online
# echo 0 > set_ftrace_pid
# echo function_graph > current_tracer
# echo 1 > options/funcgraph-proc
# echo 1 > /sys/devices/system/cpu/cpu1
# grep '<idle>' per_cpu/cpu1/trace | head
Before, nothing would show up.
After:
1) <idle>-0 | 0.811 us | __enqueue_entity();
1) <idle>-0 | 5.626 us | } /* enqueue_entity */
1) <idle>-0 | | dl_server_update_idle_time() {
1) <idle>-0 | | dl_scaled_delta_exec() {
1) <idle>-0 | 0.450 us | arch_scale_cpu_capacity();
1) <idle>-0 | 1.242 us | }
1) <idle>-0 | 1.908 us | }
1) <idle>-0 | | dl_server_start() {
1) <idle>-0 | | enqueue_dl_entity() {
1) <idle>-0 | | task_contending() {
Note, if tracing stops and restarts, the old way would then initialize
the onlined CPUs.
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Link: https://lore.kernel.org/20241018214300.6df82178@rorschach
Fixes: 868baf07b1a25 ("ftrace: Fix memory leak with function graph and cpu hotplug")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index d7d4fb403f6f..43f4e3f57438 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1160,19 +1160,13 @@ void fgraph_update_pid_func(void)
static int start_graph_tracing(void)
{
unsigned long **ret_stack_list;
- int ret, cpu;
+ int ret;
ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
if (!ret_stack_list)
return -ENOMEM;
- /* The cpu_boot init_task->ret_stack will never be freed */
- for_each_online_cpu(cpu) {
- if (!idle_task(cpu)->ret_stack)
- ftrace_graph_init_idle_task(idle_task(cpu), cpu);
- }
-
do {
ret = alloc_retstack_tasklist(ret_stack_list);
} while (ret == -EAGAIN);
@@ -1242,14 +1236,34 @@ static void ftrace_graph_disable_direct(bool disable_branch)
fgraph_direct_gops = &fgraph_stub;
}
+/* The cpu_boot init_task->ret_stack will never be freed */
+static int fgraph_cpu_init(unsigned int cpu)
+{
+ if (!idle_task(cpu)->ret_stack)
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+ return 0;
+}
+
int register_ftrace_graph(struct fgraph_ops *gops)
{
+ static bool fgraph_initialized;
int command = 0;
int ret = 0;
int i = -1;
mutex_lock(&ftrace_lock);
+ if (!fgraph_initialized) {
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph_idle_init",
+ fgraph_cpu_init, NULL);
+ if (ret < 0) {
+ pr_warn("fgraph: Error to init cpu hotplug support\n");
+ return ret;
+ }
+ fgraph_initialized = true;
+ ret = 0;
+ }
+
if (!fgraph_array[0]) {
/* The array must always have real data on it */
for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 2c02f7375e658ae93d57a31a66f91b62754ef8f1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024102055-nugget-delicious-edfe@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2c02f7375e658ae93d57a31a66f91b62754ef8f1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt(a)goodmis.org>
Date: Fri, 18 Oct 2024 21:43:00 -0400
Subject: [PATCH] fgraph: Use CPU hotplug mechanism to initialize idle shadow
stacks
The function graph infrastructure allocates a shadow stack for every task
when enabled. This includes the idle tasks. The first time the function
graph is invoked, the shadow stacks are created and never freed until the
task exits. This includes the idle tasks.
Only the idle tasks that were for online CPUs had their shadow stacks
created when function graph tracing started. If function graph tracing is
enabled and a CPU comes online, the idle task representing that CPU will
not have its shadow stack created, and all function graph tracing for that
idle task will be silently dropped.
Instead, use the CPU hotplug mechanism to allocate the idle shadow stacks.
This will include idle tasks for CPUs that come online during tracing.
This issue can be reproduced by:
# cd /sys/kernel/tracing
# echo 0 > /sys/devices/system/cpu/cpu1/online
# echo 0 > set_ftrace_pid
# echo function_graph > current_tracer
# echo 1 > options/funcgraph-proc
# echo 1 > /sys/devices/system/cpu/cpu1
# grep '<idle>' per_cpu/cpu1/trace | head
Before, nothing would show up.
After:
1) <idle>-0 | 0.811 us | __enqueue_entity();
1) <idle>-0 | 5.626 us | } /* enqueue_entity */
1) <idle>-0 | | dl_server_update_idle_time() {
1) <idle>-0 | | dl_scaled_delta_exec() {
1) <idle>-0 | 0.450 us | arch_scale_cpu_capacity();
1) <idle>-0 | 1.242 us | }
1) <idle>-0 | 1.908 us | }
1) <idle>-0 | | dl_server_start() {
1) <idle>-0 | | enqueue_dl_entity() {
1) <idle>-0 | | task_contending() {
Note, if tracing stops and restarts, the old way would then initialize
the onlined CPUs.
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Link: https://lore.kernel.org/20241018214300.6df82178@rorschach
Fixes: 868baf07b1a25 ("ftrace: Fix memory leak with function graph and cpu hotplug")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index d7d4fb403f6f..43f4e3f57438 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1160,19 +1160,13 @@ void fgraph_update_pid_func(void)
static int start_graph_tracing(void)
{
unsigned long **ret_stack_list;
- int ret, cpu;
+ int ret;
ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
if (!ret_stack_list)
return -ENOMEM;
- /* The cpu_boot init_task->ret_stack will never be freed */
- for_each_online_cpu(cpu) {
- if (!idle_task(cpu)->ret_stack)
- ftrace_graph_init_idle_task(idle_task(cpu), cpu);
- }
-
do {
ret = alloc_retstack_tasklist(ret_stack_list);
} while (ret == -EAGAIN);
@@ -1242,14 +1236,34 @@ static void ftrace_graph_disable_direct(bool disable_branch)
fgraph_direct_gops = &fgraph_stub;
}
+/* The cpu_boot init_task->ret_stack will never be freed */
+static int fgraph_cpu_init(unsigned int cpu)
+{
+ if (!idle_task(cpu)->ret_stack)
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+ return 0;
+}
+
int register_ftrace_graph(struct fgraph_ops *gops)
{
+ static bool fgraph_initialized;
int command = 0;
int ret = 0;
int i = -1;
mutex_lock(&ftrace_lock);
+ if (!fgraph_initialized) {
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph_idle_init",
+ fgraph_cpu_init, NULL);
+ if (ret < 0) {
+ pr_warn("fgraph: Error to init cpu hotplug support\n");
+ return ret;
+ }
+ fgraph_initialized = true;
+ ret = 0;
+ }
+
if (!fgraph_array[0]) {
/* The array must always have real data on it */
for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 2c02f7375e658ae93d57a31a66f91b62754ef8f1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024102054-nineteen-exemplary-3f78@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2c02f7375e658ae93d57a31a66f91b62754ef8f1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt(a)goodmis.org>
Date: Fri, 18 Oct 2024 21:43:00 -0400
Subject: [PATCH] fgraph: Use CPU hotplug mechanism to initialize idle shadow
stacks
The function graph infrastructure allocates a shadow stack for every task
when enabled. This includes the idle tasks. The first time the function
graph is invoked, the shadow stacks are created and never freed until the
task exits. This includes the idle tasks.
Only the idle tasks that were for online CPUs had their shadow stacks
created when function graph tracing started. If function graph tracing is
enabled and a CPU comes online, the idle task representing that CPU will
not have its shadow stack created, and all function graph tracing for that
idle task will be silently dropped.
Instead, use the CPU hotplug mechanism to allocate the idle shadow stacks.
This will include idle tasks for CPUs that come online during tracing.
This issue can be reproduced by:
# cd /sys/kernel/tracing
# echo 0 > /sys/devices/system/cpu/cpu1/online
# echo 0 > set_ftrace_pid
# echo function_graph > current_tracer
# echo 1 > options/funcgraph-proc
# echo 1 > /sys/devices/system/cpu/cpu1
# grep '<idle>' per_cpu/cpu1/trace | head
Before, nothing would show up.
After:
1) <idle>-0 | 0.811 us | __enqueue_entity();
1) <idle>-0 | 5.626 us | } /* enqueue_entity */
1) <idle>-0 | | dl_server_update_idle_time() {
1) <idle>-0 | | dl_scaled_delta_exec() {
1) <idle>-0 | 0.450 us | arch_scale_cpu_capacity();
1) <idle>-0 | 1.242 us | }
1) <idle>-0 | 1.908 us | }
1) <idle>-0 | | dl_server_start() {
1) <idle>-0 | | enqueue_dl_entity() {
1) <idle>-0 | | task_contending() {
Note, if tracing stops and restarts, the old way would then initialize
the onlined CPUs.
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Link: https://lore.kernel.org/20241018214300.6df82178@rorschach
Fixes: 868baf07b1a25 ("ftrace: Fix memory leak with function graph and cpu hotplug")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index d7d4fb403f6f..43f4e3f57438 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1160,19 +1160,13 @@ void fgraph_update_pid_func(void)
static int start_graph_tracing(void)
{
unsigned long **ret_stack_list;
- int ret, cpu;
+ int ret;
ret_stack_list = kmalloc(SHADOW_STACK_SIZE, GFP_KERNEL);
if (!ret_stack_list)
return -ENOMEM;
- /* The cpu_boot init_task->ret_stack will never be freed */
- for_each_online_cpu(cpu) {
- if (!idle_task(cpu)->ret_stack)
- ftrace_graph_init_idle_task(idle_task(cpu), cpu);
- }
-
do {
ret = alloc_retstack_tasklist(ret_stack_list);
} while (ret == -EAGAIN);
@@ -1242,14 +1236,34 @@ static void ftrace_graph_disable_direct(bool disable_branch)
fgraph_direct_gops = &fgraph_stub;
}
+/* The cpu_boot init_task->ret_stack will never be freed */
+static int fgraph_cpu_init(unsigned int cpu)
+{
+ if (!idle_task(cpu)->ret_stack)
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+ return 0;
+}
+
int register_ftrace_graph(struct fgraph_ops *gops)
{
+ static bool fgraph_initialized;
int command = 0;
int ret = 0;
int i = -1;
mutex_lock(&ftrace_lock);
+ if (!fgraph_initialized) {
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "fgraph_idle_init",
+ fgraph_cpu_init, NULL);
+ if (ret < 0) {
+ pr_warn("fgraph: Error to init cpu hotplug support\n");
+ return ret;
+ }
+ fgraph_initialized = true;
+ ret = 0;
+ }
+
if (!fgraph_array[0]) {
/* The array must always have real data on it */
for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
This patch series is to fix bugs for below APIs:
devm_phy_put()
devm_of_phy_provider_unregister()
devm_phy_destroy()
phy_get()
of_phy_get()
devm_of_phy_get_by_index()
And simplify API of_phy_simple_xlate().
Signed-off-by: Zijun Hu <quic_zijuhu(a)quicinc.com>
---
Zijun Hu (6):
phy: core: Fix API devm_phy_put() can not release the phy
phy: core: Fix API devm_of_phy_provider_unregister() can not unregister the phy provider
phy: core: Fix API devm_phy_destroy() can not destroy the phy
phy: core: Add missing of_node_put() for an error handling path of _of_phy_get()
phy: core: Add missing of_node_put() in of_phy_provider_lookup()
phy: core: Simplify API of_phy_simple_xlate() implementation
drivers/phy/phy-core.c | 39 ++++++++++++++++++---------------------
1 file changed, 18 insertions(+), 21 deletions(-)
---
base-commit: d8f9d6d826fc15780451802796bb88ec52978f17
change-id: 20241020-phy_core_fix-e3ad65db98f7
Best regards,
--
Zijun Hu <quic_zijuhu(a)quicinc.com>
Some page flags (page->flags) were converted to page types
(page->page_types). A recent example is PG_hugetlb.
From the exclusive writer's perspective, e.g., a thread doing
__folio_set_hugetlb(), there is a difference between the page flag and
type APIs: the former allows the same non-atomic operation to be
repeated whereas the latter does not. For example, calling
__folio_set_hugetlb() twice triggers VM_BUG_ON_FOLIO(), since the
second call expects the type (PG_hugetlb) not to be set previously.
Using add_hugetlb_folio() as an example, it calls
__folio_set_hugetlb() in the following error-handling path. And when
that happens, it triggers the aforementioned VM_BUG_ON_FOLIO().
if (folio_test_hugetlb(folio)) {
rc = hugetlb_vmemmap_restore_folio(h, folio);
if (rc) {
spin_lock_irq(&hugetlb_lock);
add_hugetlb_folio(h, folio, false);
...
It is possible to make hugeTLB comply with the new requirements from
the page type API. However, a straightforward fix would be to just
allow the same page type to be set or cleared again inside the API,
to avoid any changes to its callers.
Fixes: d99e3140a4d3 ("mm: turn folio_test_hugetlb into a PageType")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Cc: <stable(a)vger.kernel.org>
---
include/linux/page-flags.h | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ccf3c78faefc..e80665bc51fa 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -977,12 +977,16 @@ static __always_inline bool folio_test_##fname(const struct folio *folio) \
} \
static __always_inline void __folio_set_##fname(struct folio *folio) \
{ \
+ if (folio_test_##fname(folio)) \
+ return; \
VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \
folio); \
folio->page.page_type = (unsigned int)PGTY_##lname << 24; \
} \
static __always_inline void __folio_clear_##fname(struct folio *folio) \
{ \
+ if (folio->page.page_type == UINT_MAX) \
+ return; \
VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \
folio->page.page_type = UINT_MAX; \
}
@@ -995,11 +999,15 @@ static __always_inline int Page##uname(const struct page *page) \
} \
static __always_inline void __SetPage##uname(struct page *page) \
{ \
+ if (Page##uname(page)) \
+ return; \
VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \
page->page_type = (unsigned int)PGTY_##lname << 24; \
} \
static __always_inline void __ClearPage##uname(struct page *page) \
{ \
+ if (page->page_type == UINT_MAX) \
+ return; \
VM_BUG_ON_PAGE(!Page##uname(page), page); \
page->page_type = UINT_MAX; \
}
--
2.47.0.rc1.288.g06298d1525-goog
tpm2_sessions_init() does not ignore the result of
tpm2_create_null_primary(). Address this by returning -ENODEV to the
caller. Given that upper layers cannot help healing the situation
further, deal with the TPM error here by
Cc: stable(a)vger.kernel.org # v6.10+
Fixes: d2add27cf2b8 ("tpm: Add NULL primary creation")
Signed-off-by: Jarkko Sakkinen <jarkko(a)kernel.org>
---
v6:
- Address:
https://lore.kernel.org/linux-integrity/69c893e7-6b87-4daa-80db-44d1120e80f…
as TPM RC is taken care of at the call site. Add also the missing
documentation for the return values.
v5:
- Do not print klog messages on error, as tpm2_save_context() already
takes care of this.
v4:
- Fixed up stable version.
v3:
- Handle TPM and POSIX error separately and return -ENODEV always back
to the caller.
v2:
- Refined the commit message.
---
drivers/char/tpm/tpm2-sessions.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c
index 511c67061728..253639767c1e 100644
--- a/drivers/char/tpm/tpm2-sessions.c
+++ b/drivers/char/tpm/tpm2-sessions.c
@@ -1347,6 +1347,11 @@ static int tpm2_create_null_primary(struct tpm_chip *chip)
*
* Derive and context save the null primary and allocate memory in the
* struct tpm_chip for the authorizations.
+ *
+ * Return:
+ * * 0 - OK
+ * * -errno - A system error
+ * * TPM_RC - A TPM error
*/
int tpm2_sessions_init(struct tpm_chip *chip)
{
@@ -1354,7 +1359,7 @@ int tpm2_sessions_init(struct tpm_chip *chip)
rc = tpm2_create_null_primary(chip);
if (rc)
- dev_err(&chip->dev, "TPM: security failed (NULL seed derivation): %d\n", rc);
+ return rc;
chip->auth = kmalloc(sizeof(*chip->auth), GFP_KERNEL);
if (!chip->auth)
--
2.47.0
Returning an abort to the guest for an unsupported MMIO access is a
documented feature of the KVM UAPI. Nevertheless, it's clear that this
plumbing has seen limited testing, since userspace can trivially cause a
WARN in the MMIO return:
WARNING: CPU: 0 PID: 30558 at arch/arm64/include/asm/kvm_emulate.h:536 kvm_handle_mmio_return+0x46c/0x5c4 arch/arm64/include/asm/kvm_emulate.h:536
Call trace:
kvm_handle_mmio_return+0x46c/0x5c4 arch/arm64/include/asm/kvm_emulate.h:536
kvm_arch_vcpu_ioctl_run+0x98/0x15b4 arch/arm64/kvm/arm.c:1133
kvm_vcpu_ioctl+0x75c/0xa78 virt/kvm/kvm_main.c:4487
__do_sys_ioctl fs/ioctl.c:51 [inline]
__se_sys_ioctl fs/ioctl.c:893 [inline]
__arm64_sys_ioctl+0x14c/0x1c8 fs/ioctl.c:893
__invoke_syscall arch/arm64/kernel/syscall.c:35 [inline]
invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:49
el0_svc_common+0x1e0/0x23c arch/arm64/kernel/syscall.c:132
do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:151
el0_svc+0x38/0x68 arch/arm64/kernel/entry-common.c:712
el0t_64_sync_handler+0x90/0xfc arch/arm64/kernel/entry-common.c:730
el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598
The splat is complaining that KVM is advancing PC while an exception is
pending, i.e. that KVM is retiring the MMIO instruction despite a
pending external abort. Womp womp.
Fix the glaring UAPI bug by skipping over all the MMIO emulation in
case there is a pending synchronous exception. Note that while userspace
is capable of pending an asynchronous exception (SError, IRQ, or FIQ),
it is still safe to retire the MMIO instruction in this case as (1) they
are by definition asynchronous, and (2) KVM relies on hardware support
for pending/delivering these exceptions instead of the software state
machine for advancing PC.
Cc: stable(a)vger.kernel.org
Fixes: da345174ceca ("KVM: arm/arm64: Allow user injection of external data aborts")
Reported-by: Alexander Potapenko <glider(a)google.com>
Signed-off-by: Oliver Upton <oliver.upton(a)linux.dev>
---
arch/arm64/include/asm/kvm_emulate.h | 25 +++++++++++++++++++++++++
arch/arm64/kvm/mmio.c | 7 +++++--
2 files changed, 30 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index a601a9305b10..1b229099f684 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -544,6 +544,31 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu)
vcpu_set_flag((v), e); \
} while (0)
+static inline bool kvm_pending_sync_exception(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION))
+ return false;
+
+ if (vcpu_el1_is_32bit(vcpu)) {
+ switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) {
+ case unpack_vcpu_flag(EXCEPT_AA32_UND):
+ case unpack_vcpu_flag(EXCEPT_AA32_IABT):
+ case unpack_vcpu_flag(EXCEPT_AA32_DABT):
+ return true;
+ default:
+ return false;
+ }
+ } else {
+ switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) {
+ case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC):
+ case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC):
+ return true;
+ default:
+ return false;
+ }
+ }
+}
+
#define __build_check_all_or_none(r, bits) \
BUILD_BUG_ON(((r) & (bits)) && ((r) & (bits)) != (bits))
diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c
index cd6b7b83e2c3..0155ba665717 100644
--- a/arch/arm64/kvm/mmio.c
+++ b/arch/arm64/kvm/mmio.c
@@ -84,8 +84,11 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu)
unsigned int len;
int mask;
- /* Detect an already handled MMIO return */
- if (unlikely(!vcpu->mmio_needed))
+ /*
+ * Detect if the MMIO return was already handled or if userspace aborted
+ * the MMIO access.
+ */
+ if (unlikely(!vcpu->mmio_needed || kvm_pending_sync_exception(vcpu)))
return 1;
vcpu->mmio_needed = 0;
--
2.47.0.rc1.288.g06298d1525-goog
During the aborting of a command, the software receives a command
completion event for the command ring stopped, with the TRB pointing
to the next TRB after the aborted command.
If the command we abort is located just before the Link TRB in the
command ring, then during the 'command ring stopped' completion event,
the xHC gives the Link TRB in the event's cmd DMA, which causes a
mismatch in handling command completion event.
To handle this situation, an additional check has been added to ignore
the mismatch error and continue the operation.
Cc: stable(a)vger.kernel.org
Signed-off-by: Faisal Hassan <quic_faisalh(a)quicinc.com>
---
drivers/usb/host/xhci-ring.c | 38 +++++++++++++++++++++++++++++++++---
1 file changed, 35 insertions(+), 3 deletions(-)
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index b2950c35c740..43926c378df9 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -126,6 +126,32 @@ static void inc_td_cnt(struct urb *urb)
urb_priv->num_tds_done++;
}
+/*
+ * Return true if the DMA is pointing to a Link TRB in the ring;
+ * otherwise, return false.
+ */
+static bool is_dma_link_trb(struct xhci_ring *ring, dma_addr_t dma)
+{
+ struct xhci_segment *seg;
+ union xhci_trb *trb;
+ dma_addr_t trb_dma;
+ int i;
+
+ seg = ring->first_seg;
+ do {
+ for (i = 0; i < TRBS_PER_SEGMENT; i++) {
+ trb = &seg->trbs[i];
+ trb_dma = seg->dma + (i * sizeof(union xhci_trb));
+
+ if (trb_is_link(trb) && trb_dma == dma)
+ return true;
+ }
+ seg = seg->next;
+ } while (seg != ring->first_seg);
+
+ return false;
+}
+
static void trb_to_noop(union xhci_trb *trb, u32 noop_type)
{
if (trb_is_link(trb)) {
@@ -1718,13 +1744,21 @@ static void handle_cmd_completion(struct xhci_hcd *xhci,
trace_xhci_handle_command(xhci->cmd_ring, &cmd_trb->generic);
+ cmd_comp_code = GET_COMP_CODE(le32_to_cpu(event->status));
cmd_dequeue_dma = xhci_trb_virt_to_dma(xhci->cmd_ring->deq_seg,
cmd_trb);
/*
* Check whether the completion event is for our internal kept
* command.
+ * For the 'command ring stopped' completion event, there is a
+ * risk of a mismatch in dequeue pointers if we abort the command
+ * just before the link TRB in the command ring. In this scenario,
+ * the cmd_dma in the event would point to a link TRB, while the
+ * software dequeue pointer circles back to the start.
*/
- if (!cmd_dequeue_dma || cmd_dma != (u64)cmd_dequeue_dma) {
+ if ((!cmd_dequeue_dma || cmd_dma != (u64)cmd_dequeue_dma) &&
+ !(cmd_comp_code == COMP_COMMAND_RING_STOPPED &&
+ is_dma_link_trb(xhci->cmd_ring, cmd_dma))) {
xhci_warn(xhci,
"ERROR mismatched command completion event\n");
return;
@@ -1734,8 +1768,6 @@ static void handle_cmd_completion(struct xhci_hcd *xhci,
cancel_delayed_work(&xhci->cmd_timer);
- cmd_comp_code = GET_COMP_CODE(le32_to_cpu(event->status));
-
/* If CMD ring stopped we own the trbs between enqueue and dequeue */
if (cmd_comp_code == COMP_COMMAND_RING_STOPPED) {
complete_all(&xhci->cmd_ring_stop_completion);
--
2.17.1
Since Linux 6.11 we support AT_EMPTY_PATH and NULL path for fstatat and
statx in "some circumstances" mostly for performance and allowing
seccomp audition. But to make the API easier to be documented and used,
we should just treat AT_EMPTY_PATH and NULL as is AT_EMPTY_PATH and
empty string even if there are no performance or seccomp benefits.
Cc: Miao Wang <shankerwangmiao(a)gmail.com>
Cc: linux-fsdevel(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Cc: stable(a)vger.kernel.org
Xi Ruoyao (2):
vfs: support fstatat(..., NULL, AT_EMPTY_PATH | AT_NO_AUTOMOUNT, ...)
vfs: Make sure {statx,fstatat}(..., AT_EMPTY_PATH | ..., NULL, ...)
behave as (..., AT_EMPTY_PATH | ..., "", ...)
fs/stat.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
--
2.46.2
When an i915 PMU counter is enabled and the driver is then unbound, the
PMU will be unregistered via perf_pmu_unregister(), however the event
will still be alive. i915 currently tries to deal with this situation
by:
a) Marking the pmu as "closed" and shortcut the calls from perf
b) Taking a reference from i915, that is put back when the event
is destroyed.
c) Setting event_init to NULL to avoid any further event
(a) is ugly, but may be left as is since it protects not trying to
access the HW that is now gone. Unless a pmu driver can call
perf_pmu_unregister() and not receive any more calls, it's a necessary
ugliness.
(b) doesn't really work: when the event is destroyed and the i915 ref is
put it may free the i915 object, that contains the pmu, not only the
event. After event->destroy() callback, perf still expects the pmu
object to be alive.
Instead of pigging back on the event->destroy() to take and put the
device reference, implement the new get()/put() on the pmu object for
that purpose.
(c) is only done to have a flag to avoid some function entrypoints when
pmu is unregistered.
Cc: stable(a)vger.kernel.org # 5.11+
Signed-off-by: Lucas De Marchi <lucas.demarchi(a)intel.com>
---
drivers/gpu/drm/i915/i915_pmu.c | 36 ++++++++++++++++++++-------------
1 file changed, 22 insertions(+), 14 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 4d05d98f51b8e..dc9f753369170 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -515,15 +515,6 @@ static enum hrtimer_restart i915_sample(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
}
-static void i915_pmu_event_destroy(struct perf_event *event)
-{
- struct i915_pmu *pmu = event_to_pmu(event);
- struct drm_i915_private *i915 = pmu_to_i915(pmu);
-
- drm_WARN_ON(&i915->drm, event->parent);
-
- drm_dev_put(&i915->drm);
-}
static int
engine_event_status(struct intel_engine_cs *engine,
@@ -629,11 +620,6 @@ static int i915_pmu_event_init(struct perf_event *event)
if (ret)
return ret;
- if (!event->parent) {
- drm_dev_get(&i915->drm);
- event->destroy = i915_pmu_event_destroy;
- }
-
return 0;
}
@@ -872,6 +858,24 @@ static int i915_pmu_event_event_idx(struct perf_event *event)
return 0;
}
+static struct pmu *i915_pmu_get(struct pmu *base)
+{
+ struct i915_pmu *pmu = container_of(base, struct i915_pmu, base);
+ struct drm_i915_private *i915 = pmu_to_i915(pmu);
+
+ drm_dev_get(&i915->drm);
+
+ return base;
+}
+
+static void i915_pmu_put(struct pmu *base)
+{
+ struct i915_pmu *pmu = container_of(base, struct i915_pmu, base);
+ struct drm_i915_private *i915 = pmu_to_i915(pmu);
+
+ drm_dev_put(&i915->drm);
+}
+
struct i915_str_attribute {
struct device_attribute attr;
const char *str;
@@ -1154,6 +1158,8 @@ static void free_pmu(struct drm_device *dev, void *res)
struct i915_pmu *pmu = res;
struct drm_i915_private *i915 = pmu_to_i915(pmu);
+ perf_pmu_free(&pmu->base);
+
free_event_attributes(pmu);
kfree(pmu->base.attr_groups);
if (IS_DGFX(i915))
@@ -1299,6 +1305,8 @@ void i915_pmu_register(struct drm_i915_private *i915)
pmu->base.stop = i915_pmu_event_stop;
pmu->base.read = i915_pmu_event_read;
pmu->base.event_idx = i915_pmu_event_event_idx;
+ pmu->base.get = i915_pmu_get;
+ pmu->base.put = i915_pmu_put;
ret = perf_pmu_register(&pmu->base, pmu->name, -1);
if (ret)
--
2.47.0
[ Upstream commit 0885ef4705607936fc36a38fd74356e1c465b023 ]
I found a regression on mm-unstable during my swap stress test, using
tmpfs to compile linux. The test OOM very soon after the make spawns many
cc processes.
It bisects down to this change: 33dfe9204f29b415bbc0abb1a50642d1ba94f5e9
(mm/gup: clear the LRU flag of a page before adding to LRU batch)
Yu Zhao propose the fix: "I think this is one of the potential side
effects -- Huge mentioned earlier about isolate_lru_folios():"
I test that with it the swap stress test no longer OOM.
Link: https://lore.kernel.org/r/CAOUHufYi9h0kz5uW3LHHS3ZrVwEq-kKp8S6N-MZUmErNAXoX…
Link: https://lkml.kernel.org/r/20240905-lru-flag-v2-1-8a2d9046c594@kernel.org
Fixes: 33dfe9204f29 ("mm/gup: clear the LRU flag of a page before adding to LRU batch")
Signed-off-by: Chris Li <chrisl(a)kernel.org>
Suggested-by: Yu Zhao <yuzhao(a)google.com>
Suggested-by: Hugh Dickins <hughd(a)google.com>
Closes: https://lore.kernel.org/all/CAF8kJuNP5iTj2p07QgHSGOJsiUfYpJ2f4R1Q5-3BN9JiD9…
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/vmscan.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd489c1af2289..a8d61a8b68944 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4300,7 +4300,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
}
/* ineligible */
- if (zone > sc->reclaim_idx) {
+ if (!folio_test_lru(folio) || zone > sc->reclaim_idx) {
gen = folio_inc_gen(lruvec, folio, false);
list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
---
base-commit: 8e24a758d14c0b1cd42ab0aea980a1030eea811f
change-id: 20241015-stable-oom-fix-a6ab273b1817
Best regards,
--
Chris Li <chrisl(a)kernel.org>
The patch titled
Subject: Revert "selftests/mm: replace atomic_bool with pthread_barrier_t"
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
revert-selftests-mm-replace-atomic_bool-with-pthread_barrier_t.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Edward Liaw <edliaw(a)google.com>
Subject: Revert "selftests/mm: replace atomic_bool with pthread_barrier_t"
Date: Fri, 18 Oct 2024 17:17:23 +0000
This reverts commit e61ef21e27e8deed8c474e9f47f4aa7bc37e138c.
uffd_poll_thread may be called by other tests that do not initialize the
pthread_barrier, so this approach is not correct. This will revert to
using atomic_bool instead.
Link: https://lkml.kernel.org/r/20241018171734.2315053-3-edliaw@google.com
Fixes: e61ef21e27e8 ("selftests/mm: replace atomic_bool with pthread_barrier_t")
Signed-off-by: Edward Liaw <edliaw(a)google.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/mm/uffd-common.c | 5 ++---
tools/testing/selftests/mm/uffd-common.h | 3 ++-
tools/testing/selftests/mm/uffd-unit-tests.c | 14 ++++++--------
3 files changed, 10 insertions(+), 12 deletions(-)
--- a/tools/testing/selftests/mm/uffd-common.c~revert-selftests-mm-replace-atomic_bool-with-pthread_barrier_t
+++ a/tools/testing/selftests/mm/uffd-common.c
@@ -18,7 +18,7 @@ bool test_uffdio_wp = true;
unsigned long long *count_verify;
uffd_test_ops_t *uffd_test_ops;
uffd_test_case_ops_t *uffd_test_case_ops;
-pthread_barrier_t ready_for_fork;
+atomic_bool ready_for_fork;
static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
{
@@ -519,8 +519,7 @@ void *uffd_poll_thread(void *arg)
pollfd[1].fd = pipefd[cpu*2];
pollfd[1].events = POLLIN;
- /* Ready for parent thread to fork */
- pthread_barrier_wait(&ready_for_fork);
+ ready_for_fork = true;
for (;;) {
ret = poll(pollfd, 2, -1);
--- a/tools/testing/selftests/mm/uffd-common.h~revert-selftests-mm-replace-atomic_bool-with-pthread_barrier_t
+++ a/tools/testing/selftests/mm/uffd-common.h
@@ -33,6 +33,7 @@
#include <inttypes.h>
#include <stdint.h>
#include <sys/random.h>
+#include <stdatomic.h>
#include "../kselftest.h"
#include "vm_util.h"
@@ -104,7 +105,7 @@ extern bool map_shared;
extern bool test_uffdio_wp;
extern unsigned long long *count_verify;
extern volatile bool test_uffdio_copy_eexist;
-extern pthread_barrier_t ready_for_fork;
+extern atomic_bool ready_for_fork;
extern uffd_test_ops_t anon_uffd_test_ops;
extern uffd_test_ops_t shmem_uffd_test_ops;
--- a/tools/testing/selftests/mm/uffd-unit-tests.c~revert-selftests-mm-replace-atomic_bool-with-pthread_barrier_t
+++ a/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -774,7 +774,7 @@ static void uffd_sigbus_test_common(bool
char c;
struct uffd_args args = { 0 };
- pthread_barrier_init(&ready_for_fork, NULL, 2);
+ ready_for_fork = false;
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
@@ -791,9 +791,8 @@ static void uffd_sigbus_test_common(bool
if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
err("uffd_poll_thread create");
- /* Wait for child thread to start before forking */
- pthread_barrier_wait(&ready_for_fork);
- pthread_barrier_destroy(&ready_for_fork);
+ while (!ready_for_fork)
+ ; /* Wait for the poll_thread to start executing before forking */
pid = fork();
if (pid < 0)
@@ -834,7 +833,7 @@ static void uffd_events_test_common(bool
char c;
struct uffd_args args = { 0 };
- pthread_barrier_init(&ready_for_fork, NULL, 2);
+ ready_for_fork = false;
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
if (uffd_register(uffd, area_dst, nr_pages * page_size,
@@ -845,9 +844,8 @@ static void uffd_events_test_common(bool
if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
err("uffd_poll_thread create");
- /* Wait for child thread to start before forking */
- pthread_barrier_wait(&ready_for_fork);
- pthread_barrier_destroy(&ready_for_fork);
+ while (!ready_for_fork)
+ ; /* Wait for the poll_thread to start executing before forking */
pid = fork();
if (pid < 0)
_
Patches currently in -mm which might be from edliaw(a)google.com are
revert-selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm.patch
revert-selftests-mm-replace-atomic_bool-with-pthread_barrier_t.patch
selftests-mm-fix-deadlock-for-fork-after-pthread_create-with-atomic_bool.patch
The patch titled
Subject: Revert "selftests/mm: fix deadlock for fork after pthread_create on ARM"
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
revert-selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Edward Liaw <edliaw(a)google.com>
Subject: Revert "selftests/mm: fix deadlock for fork after pthread_create on ARM"
Date: Fri, 18 Oct 2024 17:17:22 +0000
Patch series "selftests/mm: revert pthread_barrier change"
On Android arm, pthread_create followed by a fork caused a deadlock in
the case where the fork required work to be completed by the created
thread.
The previous patches incorrectly assumed that the parent would
always initialize the pthread_barrier for the child thread. This
reverts the change and replaces the fix for wp-fork-with-event with the
original use of atomic_bool.
This patch (of 3):
This reverts commit e142cc87ac4ec618f2ccf5f68aedcd6e28a59d9d.
fork_event_consumer may be called by other tests that do not initialize
the pthread_barrier, so this approach is not correct. The subsequent
patch will revert to using atomic_bool instead.
Link: https://lkml.kernel.org/r/20241018171734.2315053-1-edliaw@google.com
Link: https://lkml.kernel.org/r/20241018171734.2315053-2-edliaw@google.com
Fixes: e142cc87ac4e ("fix deadlock for fork after pthread_create on ARM")
Signed-off-by: Edward Liaw <edliaw(a)google.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/mm/uffd-unit-tests.c | 7 -------
1 file changed, 7 deletions(-)
--- a/tools/testing/selftests/mm/uffd-unit-tests.c~revert-selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm
+++ a/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -241,9 +241,6 @@ static void *fork_event_consumer(void *d
fork_event_args *args = data;
struct uffd_msg msg = { 0 };
- /* Ready for parent thread to fork */
- pthread_barrier_wait(&ready_for_fork);
-
/* Read until a full msg received */
while (uffd_read_msg(args->parent_uffd, &msg));
@@ -311,12 +308,8 @@ static int pagemap_test_fork(int uffd, b
/* Prepare a thread to resolve EVENT_FORK */
if (with_event) {
- pthread_barrier_init(&ready_for_fork, NULL, 2);
if (pthread_create(&thread, NULL, fork_event_consumer, &args))
err("pthread_create()");
- /* Wait for child thread to start before forking */
- pthread_barrier_wait(&ready_for_fork);
- pthread_barrier_destroy(&ready_for_fork);
}
child = fork();
_
Patches currently in -mm which might be from edliaw(a)google.com are
revert-selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm.patch
revert-selftests-mm-replace-atomic_bool-with-pthread_barrier_t.patch
selftests-mm-fix-deadlock-for-fork-after-pthread_create-with-atomic_bool.patch
There is a race between laundromat handling of revoked delegations
and a client sending free_stateid operation. Laundromat thread
finds that delegation has expired and needs to be revoked so it
marks the delegation stid revoked and it puts it on a reaper list
but then it unlock the state lock and the actual delegation revocation
happens without the lock. Once the stid is marked revoked a racing
free_stateid processing thread does the following (1) it calls
list_del_init() which removes it from the reaper list and (2) frees
the delegation stid structure. The laundromat thread ends up not
calling the revoke_delegation() function for this particular delegation
but that means it will no release the lock lease that exists on
the file.
Now, a new open for this file comes in and ends up finding that
lease list isn't empty and calls nfsd_breaker_owns_lease() which ends
up trying to derefence a freed delegation stateid. Leading to the
followint use-after-free KASAN warning:
kernel: ==================================================================
kernel: BUG: KASAN: slab-use-after-free in nfsd_breaker_owns_lease+0x140/0x160 [nfsd]
kernel: Read of size 8 at addr ffff0000e73cd0c8 by task nfsd/6205
kernel:
kernel: CPU: 2 UID: 0 PID: 6205 Comm: nfsd Kdump: loaded Not tainted 6.11.0-rc7+ #9
kernel: Hardware name: Apple Inc. Apple Virtualization Generic Platform, BIOS 2069.0.0.0.0 08/03/2024
kernel: Call trace:
kernel: dump_backtrace+0x98/0x120
kernel: show_stack+0x1c/0x30
kernel: dump_stack_lvl+0x80/0xe8
kernel: print_address_description.constprop.0+0x84/0x390
kernel: print_report+0xa4/0x268
kernel: kasan_report+0xb4/0xf8
kernel: __asan_report_load8_noabort+0x1c/0x28
kernel: nfsd_breaker_owns_lease+0x140/0x160 [nfsd]
kernel: nfsd_file_do_acquire+0xb3c/0x11d0 [nfsd]
kernel: nfsd_file_acquire_opened+0x84/0x110 [nfsd]
kernel: nfs4_get_vfs_file+0x634/0x958 [nfsd]
kernel: nfsd4_process_open2+0xa40/0x1a40 [nfsd]
kernel: nfsd4_open+0xa08/0xe80 [nfsd]
kernel: nfsd4_proc_compound+0xb8c/0x2130 [nfsd]
kernel: nfsd_dispatch+0x22c/0x718 [nfsd]
kernel: svc_process_common+0x8e8/0x1960 [sunrpc]
kernel: svc_process+0x3d4/0x7e0 [sunrpc]
kernel: svc_handle_xprt+0x828/0xe10 [sunrpc]
kernel: svc_recv+0x2cc/0x6a8 [sunrpc]
kernel: nfsd+0x270/0x400 [nfsd]
kernel: kthread+0x288/0x310
kernel: ret_from_fork+0x10/0x20
This patch proposes a fixed that's based on adding 2 new additional
stid's sc_status values that help coordinate between the laundromat
and other operations (nfsd4_free_stateid() and nfsd4_delegreturn()).
First to make sure, that once the stid is marked revoked, it is not
removed by the nfsd4_free_stateid(), the laundromat take a reference
on the stateid. Then, coordinating whether the stid has been put
on the cl_revoked list or we are processing FREE_STATEID and need to
make sure to remove it from the list, each check that state and act
accordingly. If laundromat has added to the cl_revoke list before
the arrival of FREE_STATEID, then nfsd4_free_stateid() knows to remove
it from the list. If nfsd4_free_stateid() finds that operations arrived
before laundromat has placed it on cl_revoke list, it marks the state
freed and then laundromat will no longer add it to the list.
Also, for nfsd4_delegreturn() when looking for the specified stid,
we need to access stid that are marked removed or freeable, it means
the laundromat has started processing it but hasn't finished and this
delegreturn needs to return nfserr_deleg_revoked and not
nfserr_bad_stateid. The latter will not trigger a FREE_STATEID and the
lack of it will leave this stid on the cl_revoked list indefinitely.
Fixes: 2d4a532d385f ("nfsd: ensure that clp->cl_revoked list is
protected by clp->cl_lock")
CC: stable(a)vger.kernel.org
Signed-off-by: Olga Kornievskaia <okorniev(a)redhat.com>
--- v3. (1) adds refcount to nfsd4_revoke_states() (2) adds comments
to revoke_delegation(), adds the WARN_ON_ONCE to make sure stid
state is what is expected and changes unlock placement.
---
fs/nfsd/nfs4state.c | 46 +++++++++++++++++++++++++++++++++++++--------
fs/nfsd/state.h | 2 ++
2 files changed, 40 insertions(+), 8 deletions(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 7905ab9d8bc6..28e9b52b01fd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1351,21 +1351,47 @@ static void destroy_delegation(struct nfs4_delegation *dp)
destroy_unhashed_deleg(dp);
}
+/**
+ * revoke_delegation - perform nfs4 delegation structure cleanup
+ * @dp: pointer to the delegation
+ *
+ * This function assumes that it's called either from the administrative
+ * interface (nfsd4_revoke_states()) that's revoking a specific delegation
+ * stateid or it's called from a laundromat thread (nfsd4_landromat()) that
+ * determined that this specific state has expired and needs to be revoked
+ * (both mark state with the appropriate stid sc_status mode). It is also
+ * assumed that a reference was take on the @dp state.
+ *
+ * If this function finds that the @dp state is SC_STATUS_FREED it means
+ * that a FREE_STATEID operation for this stateid has been processed and
+ * we can proceed to removing it from recalled list. However, if @dp state
+ * isn't marked SC_STATUS_FREED, it means we need place it on the cl_revoked
+ * list and wait for the FREE_STATEID to arrive from the client. At the same
+ * time, we need to mark it as SC_STATUS_FREEABLE to indicate to the
+ * nfsd4_free_stateid() function that this stateid has already been added
+ * to the cl_revoked list and that nfsd4_free_stateid() is now responsible
+ * for removing it from the list. Inspection of where the delegation state
+ * in the revocation process is protected by the clp->cl_lock.
+ */
static void revoke_delegation(struct nfs4_delegation *dp)
{
struct nfs4_client *clp = dp->dl_stid.sc_client;
WARN_ON(!list_empty(&dp->dl_recall_lru));
+ WARN_ON_ONCE(!(dp->dl_stid.sc_status &
+ (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)));
trace_nfsd_stid_revoke(&dp->dl_stid);
- if (dp->dl_stid.sc_status &
- (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)) {
- spin_lock(&clp->cl_lock);
- refcount_inc(&dp->dl_stid.sc_count);
- list_add(&dp->dl_recall_lru, &clp->cl_revoked);
- spin_unlock(&clp->cl_lock);
+ spin_lock(&clp->cl_lock);
+ if (dp->dl_stid.sc_status & SC_STATUS_FREED) {
+ list_del_init(&dp->dl_recall_lru);
+ goto out;
}
+ list_add(&dp->dl_recall_lru, &clp->cl_revoked);
+ dp->dl_stid.sc_status |= SC_STATUS_FREEABLE;
+out:
+ spin_unlock(&clp->cl_lock);
destroy_unhashed_deleg(dp);
}
@@ -1772,6 +1798,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb)
mutex_unlock(&stp->st_mutex);
break;
case SC_TYPE_DELEG:
+ refcount_inc(&stid->sc_count);
dp = delegstateid(stid);
spin_lock(&state_lock);
if (!unhash_delegation_locked(
@@ -6606,6 +6633,7 @@ nfs4_laundromat(struct nfsd_net *nn)
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
if (!state_expired(<, dp->dl_time))
break;
+ refcount_inc(&dp->dl_stid.sc_count);
unhash_delegation_locked(dp, SC_STATUS_REVOKED);
list_add(&dp->dl_recall_lru, &reaplist);
}
@@ -7218,7 +7246,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
s->sc_status |= SC_STATUS_CLOSED;
spin_unlock(&s->sc_lock);
dp = delegstateid(s);
- list_del_init(&dp->dl_recall_lru);
+ if (s->sc_status & SC_STATUS_FREEABLE)
+ list_del_init(&dp->dl_recall_lru);
+ s->sc_status |= SC_STATUS_FREED;
spin_unlock(&cl->cl_lock);
nfs4_put_stid(s);
ret = nfs_ok;
@@ -7548,7 +7578,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
return status;
- status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, 0, &s, nn);
+ status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, SC_STATUS_REVOKED|SC_STATUS_FREEABLE, &s, nn);
if (status)
goto out;
dp = delegstateid(s);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 6351e6eca7cc..cc00d6b64b88 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -114,6 +114,8 @@ struct nfs4_stid {
/* For a deleg stateid kept around only to process free_stateid's: */
#define SC_STATUS_REVOKED BIT(1)
#define SC_STATUS_ADMIN_REVOKED BIT(2)
+#define SC_STATUS_FREEABLE BIT(3)
+#define SC_STATUS_FREED BIT(4)
unsigned short sc_status;
struct list_head sc_cp_list;
--
2.43.5
There is a race between laundromat handling of revoked delegations
and a client sending free_stateid operation. Laundromat thread
finds that delegation has expired and needs to be revoked so it
marks the delegation stid revoked and it puts it on a reaper list
but then it unlock the state lock and the actual delegation revocation
happens without the lock. Once the stid is marked revoked a racing
free_stateid processing thread does the following (1) it calls
list_del_init() which removes it from the reaper list and (2) frees
the delegation stid structure. The laundromat thread ends up not
calling the revoke_delegation() function for this particular delegation
but that means it will no release the lock lease that exists on
the file.
Now, a new open for this file comes in and ends up finding that
lease list isn't empty and calls nfsd_breaker_owns_lease() which ends
up trying to derefence a freed delegation stateid. Leading to the
followint use-after-free KASAN warning:
kernel: ==================================================================
kernel: BUG: KASAN: slab-use-after-free in nfsd_breaker_owns_lease+0x140/0x160 [nfsd]
kernel: Read of size 8 at addr ffff0000e73cd0c8 by task nfsd/6205
kernel:
kernel: CPU: 2 UID: 0 PID: 6205 Comm: nfsd Kdump: loaded Not tainted 6.11.0-rc7+ #9
kernel: Hardware name: Apple Inc. Apple Virtualization Generic Platform, BIOS 2069.0.0.0.0 08/03/2024
kernel: Call trace:
kernel: dump_backtrace+0x98/0x120
kernel: show_stack+0x1c/0x30
kernel: dump_stack_lvl+0x80/0xe8
kernel: print_address_description.constprop.0+0x84/0x390
kernel: print_report+0xa4/0x268
kernel: kasan_report+0xb4/0xf8
kernel: __asan_report_load8_noabort+0x1c/0x28
kernel: nfsd_breaker_owns_lease+0x140/0x160 [nfsd]
kernel: leases_conflict+0x68/0x370
kernel: __break_lease+0x204/0xc38
kernel: nfsd_open_break_lease+0x8c/0xf0 [nfsd]
kernel: nfsd_file_do_acquire+0xb3c/0x11d0 [nfsd]
kernel: nfsd_file_acquire_opened+0x84/0x110 [nfsd]
kernel: nfs4_get_vfs_file+0x634/0x958 [nfsd]
kernel: nfsd4_process_open2+0xa40/0x1a40 [nfsd]
kernel: nfsd4_open+0xa08/0xe80 [nfsd]
kernel: nfsd4_proc_compound+0xb8c/0x2130 [nfsd]
kernel: nfsd_dispatch+0x22c/0x718 [nfsd]
kernel: svc_process_common+0x8e8/0x1960 [sunrpc]
kernel: svc_process+0x3d4/0x7e0 [sunrpc]
kernel: svc_handle_xprt+0x828/0xe10 [sunrpc]
kernel: svc_recv+0x2cc/0x6a8 [sunrpc]
kernel: nfsd+0x270/0x400 [nfsd]
kernel: kthread+0x288/0x310
kernel: ret_from_fork+0x10/0x20
This patch proposes a fix that's based on adding 2 new additional
stid's sc_status values that help coordinate between the laundromat
and other operations (nfsd4_free_stateid() and nfsd4_delegreturn()).
First to make sure, that once the stid is marked revoked, it is not
removed by the nfsd4_free_stateid(), the laundromat take a reference
on the stateid. Then, coordinating whether the stid has been put
on the cl_revoked list or we are processing FREE_STATEID and need to
make sure to remove it from the list, each check that state and act
accordingly. If laundromat has added to the cl_revoke list before
the arrival of FREE_STATEID, then nfsd4_free_stateid() knows to remove
it from the list. If nfsd4_free_stateid() finds that operations arrived
before laundromat has placed it on cl_revoke list, it marks the state
freed and then laundromat will no longer add it to the list.
Also, for nfsd4_delegreturn() when looking for the specified stid,
we need to access stid that are marked removed or freeable, it means
the laundromat has started processing it but hasn't finished and this
delegreturn needs to return nfserr_deleg_revoked and not
nfserr_bad_stateid. The latter will not trigger a FREE_STATEID and the
lack of it will leave this stid on the cl_revoked list indefinitely.
Fixes: 2d4a532d385f ("nfsd: ensure that clp->cl_revoked list is
protected by clp->cl_lock")
CC: stable(a)vger.kernel.org
Signed-off-by: Olga Kornievskaia <okorniev(a)redhat.com>
---
fs/nfsd/nfs4state.c | 15 ++++++++++++---
fs/nfsd/state.h | 2 ++
2 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ac1859c7cc9d..cb989802e896 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1370,10 +1370,16 @@ static void revoke_delegation(struct nfs4_delegation *dp)
if (dp->dl_stid.sc_status &
(SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)) {
spin_lock(&clp->cl_lock);
- refcount_inc(&dp->dl_stid.sc_count);
+ if (dp->dl_stid.sc_status & SC_STATUS_FREED) {
+ list_del_init(&dp->dl_recall_lru);
+ spin_unlock(&clp->cl_lock);
+ goto out;
+ }
list_add(&dp->dl_recall_lru, &clp->cl_revoked);
+ dp->dl_stid.sc_status |= SC_STATUS_FREEABLE;
spin_unlock(&clp->cl_lock);
}
+out:
destroy_unhashed_deleg(dp);
}
@@ -6545,6 +6551,7 @@ nfs4_laundromat(struct nfsd_net *nn)
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
if (!state_expired(<, dp->dl_time))
break;
+ refcount_inc(&dp->dl_stid.sc_count);
unhash_delegation_locked(dp, SC_STATUS_REVOKED);
list_add(&dp->dl_recall_lru, &reaplist);
}
@@ -7156,7 +7163,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (s->sc_status & SC_STATUS_REVOKED) {
spin_unlock(&s->sc_lock);
dp = delegstateid(s);
- list_del_init(&dp->dl_recall_lru);
+ if (s->sc_status & SC_STATUS_FREEABLE)
+ list_del_init(&dp->dl_recall_lru);
+ s->sc_status |= SC_STATUS_FREED;
spin_unlock(&cl->cl_lock);
nfs4_put_stid(s);
ret = nfs_ok;
@@ -7486,7 +7495,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
return status;
- status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, 0, &s, nn);
+ status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, SC_STATUS_REVOKED|SC_STATUS_FREEABLE, &s, nn);
if (status)
goto out;
dp = delegstateid(s);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 79c743c01a47..35b3564c065f 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -114,6 +114,8 @@ struct nfs4_stid {
/* For a deleg stateid kept around only to process free_stateid's: */
#define SC_STATUS_REVOKED BIT(1)
#define SC_STATUS_ADMIN_REVOKED BIT(2)
+#define SC_STATUS_FREEABLE BIT(3)
+#define SC_STATUS_FREED BIT(4)
unsigned short sc_status;
struct list_head sc_cp_list;
--
2.43.5
From: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Commit ea7e2d5e49c0 ("mm: call the security_mmap_file() LSM hook in
remap_file_pages()") fixed a security issue, it added an LSM check when
trying to remap file pages, so that LSMs have the opportunity to evaluate
such action like for other memory operations such as mmap() and mprotect().
However, that commit called security_mmap_file() inside the mmap_lock lock,
while the other calls do it before taking the lock, after commit
8b3ec6814c83 ("take security_mmap_file() outside of ->mmap_sem").
This caused lock inversion issue with IMA which was taking the mmap_lock
and i_mutex lock in the opposite way when the remap_file_pages() system
call was called.
Solve the issue by splitting the critical region in remap_file_pages() in
two regions: the first takes a read lock of mmap_lock and retrieves the VMA
and the file associated, and calculate the 'prot' and 'flags' variable; the
second takes a write lock on mmap_lock, checks that the VMA flags and the
VMA file descriptor are the same as the ones obtained in the first critical
region (otherwise the system call fails), and calls do_mmap().
In between, after releasing the read lock and taking the write lock, call
security_mmap_file(), and solve the lock inversion issue.
Cc: stable(a)vger.kernel.org
Fixes: ea7e2d5e49c0 ("mm: call the security_mmap_file() LSM hook in remap_file_pages()")
Reported-by: syzbot+91ae49e1c1a2634d20c0(a)syzkaller.appspotmail.com
Closes: https://lore.kernel.org/linux-security-module/66f7b10e.050a0220.46d20.0036.…
Reviewed-by: Roberto Sassu <roberto.sassu(a)huawei.com> (Calculate prot and flags earlier)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
---
mm/mmap.c | 62 ++++++++++++++++++++++++++++++++++++++++---------------
1 file changed, 45 insertions(+), 17 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 9c0fb43064b5..762944427e03 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1640,6 +1640,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long populate = 0;
unsigned long ret = -EINVAL;
struct file *file;
+ vm_flags_t vm_flags;
pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
current->comm, current->pid);
@@ -1656,12 +1657,53 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (pgoff + (size >> PAGE_SHIFT) < pgoff)
return ret;
- if (mmap_write_lock_killable(mm))
+ if (mmap_read_lock_killable(mm))
+ return -EINTR;
+
+ vma = vma_lookup(mm, start);
+
+ if (!vma || !(vma->vm_flags & VM_SHARED)) {
+ mmap_read_unlock(mm);
+ return -EINVAL;
+ }
+
+ prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
+ prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
+ prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
+
+ flags &= MAP_NONBLOCK;
+ flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+
+ /* Save vm_flags used to calculate prot and flags, and recheck later. */
+ vm_flags = vma->vm_flags;
+ file = get_file(vma->vm_file);
+
+ mmap_read_unlock(mm);
+
+ ret = security_mmap_file(file, prot, flags);
+ if (ret) {
+ fput(file);
+ return ret;
+ }
+
+ ret = -EINVAL;
+
+ if (mmap_write_lock_killable(mm)) {
+ fput(file);
return -EINTR;
+ }
vma = vma_lookup(mm, start);
- if (!vma || !(vma->vm_flags & VM_SHARED))
+ if (!vma)
+ goto out;
+
+ if (vma->vm_flags != vm_flags)
+ goto out;
+
+ if (vma->vm_file != file)
goto out;
if (start + size > vma->vm_end) {
@@ -1689,25 +1731,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
goto out;
}
- prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
- prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
- prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
-
- flags &= MAP_NONBLOCK;
- flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
- if (vma->vm_flags & VM_LOCKED)
- flags |= MAP_LOCKED;
-
- file = get_file(vma->vm_file);
- ret = security_mmap_file(vma->vm_file, prot, flags);
- if (ret)
- goto out_fput;
ret = do_mmap(vma->vm_file, start, size,
prot, flags, 0, pgoff, &populate, NULL);
-out_fput:
- fput(file);
out:
mmap_write_unlock(mm);
+ fput(file);
if (populate)
mm_populate(ret, populate);
if (!IS_ERR_VALUE(ret))
--
2.34.1
If some remap_pfn_range() calls succeeded before one failed, we still have
buffer pages mapped into the userspace page tables when we drop the buffer
reference with comedi_buf_map_put(bm). The userspace mappings are only
cleaned up later in the mmap error path.
Fix it by explicitly flushing all mappings in our VMA on the error path.
See commit 79a61cc3fc04 ("mm: avoid leaving partial pfn mappings around in
error case").
Cc: stable(a)vger.kernel.org
Fixes: ed9eccbe8970 ("Staging: add comedi core")
Signed-off-by: Jann Horn <jannh(a)google.com>
---
Note: compile-tested only; I don't actually have comedi hardware, and I
don't know anything about comedi.
---
Changes in v3:
- gate zapping ptes on CONFIG_MMU (Intel kernel test robot)
- Link to v2: https://lore.kernel.org/r/20241015-comedi-tlb-v2-1-cafb0e27dd9a@google.com
Changes in v2:
- only do the zapping in the pfnmap path (Ian Abbott)
- use zap_vma_ptes() instead of zap_page_range_single() (Ian Abbott)
- Link to v1: https://lore.kernel.org/r/20241014-comedi-tlb-v1-1-4b699144b438@google.com
---
drivers/comedi/comedi_fops.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/drivers/comedi/comedi_fops.c b/drivers/comedi/comedi_fops.c
index 1b481731df96..b9df9b19d4bd 100644
--- a/drivers/comedi/comedi_fops.c
+++ b/drivers/comedi/comedi_fops.c
@@ -2407,6 +2407,18 @@ static int comedi_mmap(struct file *file, struct vm_area_struct *vma)
start += PAGE_SIZE;
}
+
+#ifdef CONFIG_MMU
+ /*
+ * Leaving behind a partial mapping of a buffer we're about to
+ * drop is unsafe, see remap_pfn_range_notrack().
+ * We need to zap the range here ourselves instead of relying
+ * on the automatic zapping in remap_pfn_range() because we call
+ * remap_pfn_range() in a loop.
+ */
+ if (retval)
+ zap_vma_ptes(vma, vma->vm_start, size);
+#endif
}
if (retval == 0) {
---
base-commit: 6485cf5ea253d40d507cd71253c9568c5470cd27
change-id: 20241014-comedi-tlb-400246505961
--
Jann Horn <jannh(a)google.com>
In psnet_open_pf_bar() and snet_open_vf_bar() a string later passed to
pcim_iomap_regions() is placed on the stack. Neither
pcim_iomap_regions() nor the functions it calls copy that string.
Should the string later ever be used, this, consequently, causes
undefined behavior since the stack frame will by then have disappeared.
Fix the bug by allocating the strings on the heap through
devm_kasprintf().
Cc: stable(a)vger.kernel.org # v6.3
Fixes: 51a8f9d7f587 ("virtio: vdpa: new SolidNET DPU driver.")
Reported-by: Christophe JAILLET <christophe.jaillet(a)wanadoo.fr>
Closes: https://lore.kernel.org/all/74e9109a-ac59-49e2-9b1d-d825c9c9f891@wanadoo.fr/
Suggested-by: Andy Shevchenko <andy(a)kernel.org>
Signed-off-by: Philipp Stanner <pstanner(a)redhat.com>
---
drivers/vdpa/solidrun/snet_main.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/drivers/vdpa/solidrun/snet_main.c b/drivers/vdpa/solidrun/snet_main.c
index 99428a04068d..c8b74980dbd1 100644
--- a/drivers/vdpa/solidrun/snet_main.c
+++ b/drivers/vdpa/solidrun/snet_main.c
@@ -555,7 +555,7 @@ static const struct vdpa_config_ops snet_config_ops = {
static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet)
{
- char name[50];
+ char *name;
int ret, i, mask = 0;
/* We don't know which BAR will be used to communicate..
* We will map every bar with len > 0.
@@ -573,7 +573,10 @@ static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet)
return -ENODEV;
}
- snprintf(name, sizeof(name), "psnet[%s]-bars", pci_name(pdev));
+ name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "psnet[%s]-bars", pci_name(pdev));
+ if (!name)
+ return -ENOMEM;
+
ret = pcim_iomap_regions(pdev, mask, name);
if (ret) {
SNET_ERR(pdev, "Failed to request and map PCI BARs\n");
@@ -590,10 +593,13 @@ static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet)
static int snet_open_vf_bar(struct pci_dev *pdev, struct snet *snet)
{
- char name[50];
+ char *name;
int ret;
- snprintf(name, sizeof(name), "snet[%s]-bar", pci_name(pdev));
+ name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "snet[%s]-bars", pci_name(pdev));
+ if (!name)
+ return -ENOMEM;
+
/* Request and map BAR */
ret = pcim_iomap_regions(pdev, BIT(snet->psnet->cfg.vf_bar), name);
if (ret) {
--
2.46.1
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x e8061f06185be0a06a73760d6526b8b0feadfe52
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101827-implosion-twilight-c8e1@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e8061f06185be0a06a73760d6526b8b0feadfe52 Mon Sep 17 00:00:00 2001
From: Nico Boehr <nrb(a)linux.ibm.com>
Date: Tue, 17 Sep 2024 17:18:33 +0200
Subject: [PATCH] KVM: s390: gaccess: Check if guest address is in memslot
Previously, access_guest_page() did not check whether the given guest
address is inside of a memslot. This is not a problem, since
kvm_write_guest_page/kvm_read_guest_page return -EFAULT in this case.
However, -EFAULT is also returned when copy_to/from_user fails.
When emulating a guest instruction, the address being outside a memslot
usually means that an addressing exception should be injected into the
guest.
Failure in copy_to/from_user however indicates that something is wrong
in userspace and hence should be handled there.
To be able to distinguish these two cases, return PGM_ADDRESSING in
access_guest_page() when the guest address is outside guest memory. In
access_guest_real(), populate vcpu->arch.pgm.code such that
kvm_s390_inject_prog_cond() can be used in the caller for injecting into
the guest (if applicable).
Since this adds a new return value to access_guest_page(), we need to make
sure that other callers are not confused by the new positive return value.
There are the following users of access_guest_page():
- access_guest_with_key() does the checking itself (in
guest_range_to_gpas()), so this case should never happen. Even if, the
handling is set up properly.
- access_guest_real() just passes the return code to its callers, which
are:
- read_guest_real() - see below
- write_guest_real() - see below
There are the following users of read_guest_real():
- ar_translation() in gaccess.c which already returns PGM_*
- setup_apcb10(), setup_apcb00(), setup_apcb11() in vsie.c which always
return -EFAULT on read_guest_read() nonzero return - no change
- shadow_crycb(), handle_stfle() always present this as validity, this
could be handled better but doesn't change current behaviour - no change
There are the following users of write_guest_real():
- kvm_s390_store_status_unloaded() always returns -EFAULT on
write_guest_real() failure.
Fixes: 2293897805c2 ("KVM: s390: add architecture compliant guest access functions")
Cc: stable(a)vger.kernel.org
Signed-off-by: Nico Boehr <nrb(a)linux.ibm.com>
Reviewed-by: Heiko Carstens <hca(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20240917151904.74314-2-nrb@linux.ibm.com
Acked-by: Janosch Frank <frankja(a)linux.ibm.com>
Signed-off-by: Heiko Carstens <hca(a)linux.ibm.com>
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index e65f597e3044..a688351f4ab5 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -828,6 +828,8 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
const gfn_t gfn = gpa_to_gfn(gpa);
int rc;
+ if (!gfn_to_memslot(kvm, gfn))
+ return PGM_ADDRESSING;
if (mode == GACC_STORE)
rc = kvm_write_guest_page(kvm, gfn, data, offset, len);
else
@@ -985,6 +987,8 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
gra += fragment_len;
data += fragment_len;
}
+ if (rc > 0)
+ vcpu->arch.pgm.code = rc;
return rc;
}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index b320d12aa049..3fde45a151f2 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -405,11 +405,12 @@ int read_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data,
* @len: number of bytes to copy
*
* Copy @len bytes from @data (kernel space) to @gra (guest real address).
- * It is up to the caller to ensure that the entire guest memory range is
- * valid memory before calling this function.
* Guest low address and key protection are not checked.
*
- * Returns zero on success or -EFAULT on error.
+ * Returns zero on success, -EFAULT when copying from @data failed, or
+ * PGM_ADRESSING in case @gra is outside a memslot. In this case, pgm check info
+ * is also stored to allow injecting into the guest (if applicable) using
+ * kvm_s390_inject_prog_cond().
*
* If an error occurs data may have been copied partially to guest memory.
*/
@@ -428,11 +429,12 @@ int write_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
* @len: number of bytes to copy
*
* Copy @len bytes from @gra (guest real address) to @data (kernel space).
- * It is up to the caller to ensure that the entire guest memory range is
- * valid memory before calling this function.
* Guest key protection is not checked.
*
- * Returns zero on success or -EFAULT on error.
+ * Returns zero on success, -EFAULT when copying to @data failed, or
+ * PGM_ADRESSING in case @gra is outside a memslot. In this case, pgm check info
+ * is also stored to allow injecting into the guest (if applicable) using
+ * kvm_s390_inject_prog_cond().
*
* If an error occurs data may have been copied partially to kernel space.
*/
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x e8061f06185be0a06a73760d6526b8b0feadfe52
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101826-gracious-singer-816f@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e8061f06185be0a06a73760d6526b8b0feadfe52 Mon Sep 17 00:00:00 2001
From: Nico Boehr <nrb(a)linux.ibm.com>
Date: Tue, 17 Sep 2024 17:18:33 +0200
Subject: [PATCH] KVM: s390: gaccess: Check if guest address is in memslot
Previously, access_guest_page() did not check whether the given guest
address is inside of a memslot. This is not a problem, since
kvm_write_guest_page/kvm_read_guest_page return -EFAULT in this case.
However, -EFAULT is also returned when copy_to/from_user fails.
When emulating a guest instruction, the address being outside a memslot
usually means that an addressing exception should be injected into the
guest.
Failure in copy_to/from_user however indicates that something is wrong
in userspace and hence should be handled there.
To be able to distinguish these two cases, return PGM_ADDRESSING in
access_guest_page() when the guest address is outside guest memory. In
access_guest_real(), populate vcpu->arch.pgm.code such that
kvm_s390_inject_prog_cond() can be used in the caller for injecting into
the guest (if applicable).
Since this adds a new return value to access_guest_page(), we need to make
sure that other callers are not confused by the new positive return value.
There are the following users of access_guest_page():
- access_guest_with_key() does the checking itself (in
guest_range_to_gpas()), so this case should never happen. Even if, the
handling is set up properly.
- access_guest_real() just passes the return code to its callers, which
are:
- read_guest_real() - see below
- write_guest_real() - see below
There are the following users of read_guest_real():
- ar_translation() in gaccess.c which already returns PGM_*
- setup_apcb10(), setup_apcb00(), setup_apcb11() in vsie.c which always
return -EFAULT on read_guest_read() nonzero return - no change
- shadow_crycb(), handle_stfle() always present this as validity, this
could be handled better but doesn't change current behaviour - no change
There are the following users of write_guest_real():
- kvm_s390_store_status_unloaded() always returns -EFAULT on
write_guest_real() failure.
Fixes: 2293897805c2 ("KVM: s390: add architecture compliant guest access functions")
Cc: stable(a)vger.kernel.org
Signed-off-by: Nico Boehr <nrb(a)linux.ibm.com>
Reviewed-by: Heiko Carstens <hca(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20240917151904.74314-2-nrb@linux.ibm.com
Acked-by: Janosch Frank <frankja(a)linux.ibm.com>
Signed-off-by: Heiko Carstens <hca(a)linux.ibm.com>
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index e65f597e3044..a688351f4ab5 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -828,6 +828,8 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
const gfn_t gfn = gpa_to_gfn(gpa);
int rc;
+ if (!gfn_to_memslot(kvm, gfn))
+ return PGM_ADDRESSING;
if (mode == GACC_STORE)
rc = kvm_write_guest_page(kvm, gfn, data, offset, len);
else
@@ -985,6 +987,8 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
gra += fragment_len;
data += fragment_len;
}
+ if (rc > 0)
+ vcpu->arch.pgm.code = rc;
return rc;
}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index b320d12aa049..3fde45a151f2 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -405,11 +405,12 @@ int read_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data,
* @len: number of bytes to copy
*
* Copy @len bytes from @data (kernel space) to @gra (guest real address).
- * It is up to the caller to ensure that the entire guest memory range is
- * valid memory before calling this function.
* Guest low address and key protection are not checked.
*
- * Returns zero on success or -EFAULT on error.
+ * Returns zero on success, -EFAULT when copying from @data failed, or
+ * PGM_ADRESSING in case @gra is outside a memslot. In this case, pgm check info
+ * is also stored to allow injecting into the guest (if applicable) using
+ * kvm_s390_inject_prog_cond().
*
* If an error occurs data may have been copied partially to guest memory.
*/
@@ -428,11 +429,12 @@ int write_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
* @len: number of bytes to copy
*
* Copy @len bytes from @gra (guest real address) to @data (kernel space).
- * It is up to the caller to ensure that the entire guest memory range is
- * valid memory before calling this function.
* Guest key protection is not checked.
*
- * Returns zero on success or -EFAULT on error.
+ * Returns zero on success, -EFAULT when copying to @data failed, or
+ * PGM_ADRESSING in case @gra is outside a memslot. In this case, pgm check info
+ * is also stored to allow injecting into the guest (if applicable) using
+ * kvm_s390_inject_prog_cond().
*
* If an error occurs data may have been copied partially to kernel space.
*/
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x e8061f06185be0a06a73760d6526b8b0feadfe52
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101824-departure-oversight-aa1e@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e8061f06185be0a06a73760d6526b8b0feadfe52 Mon Sep 17 00:00:00 2001
From: Nico Boehr <nrb(a)linux.ibm.com>
Date: Tue, 17 Sep 2024 17:18:33 +0200
Subject: [PATCH] KVM: s390: gaccess: Check if guest address is in memslot
Previously, access_guest_page() did not check whether the given guest
address is inside of a memslot. This is not a problem, since
kvm_write_guest_page/kvm_read_guest_page return -EFAULT in this case.
However, -EFAULT is also returned when copy_to/from_user fails.
When emulating a guest instruction, the address being outside a memslot
usually means that an addressing exception should be injected into the
guest.
Failure in copy_to/from_user however indicates that something is wrong
in userspace and hence should be handled there.
To be able to distinguish these two cases, return PGM_ADDRESSING in
access_guest_page() when the guest address is outside guest memory. In
access_guest_real(), populate vcpu->arch.pgm.code such that
kvm_s390_inject_prog_cond() can be used in the caller for injecting into
the guest (if applicable).
Since this adds a new return value to access_guest_page(), we need to make
sure that other callers are not confused by the new positive return value.
There are the following users of access_guest_page():
- access_guest_with_key() does the checking itself (in
guest_range_to_gpas()), so this case should never happen. Even if, the
handling is set up properly.
- access_guest_real() just passes the return code to its callers, which
are:
- read_guest_real() - see below
- write_guest_real() - see below
There are the following users of read_guest_real():
- ar_translation() in gaccess.c which already returns PGM_*
- setup_apcb10(), setup_apcb00(), setup_apcb11() in vsie.c which always
return -EFAULT on read_guest_read() nonzero return - no change
- shadow_crycb(), handle_stfle() always present this as validity, this
could be handled better but doesn't change current behaviour - no change
There are the following users of write_guest_real():
- kvm_s390_store_status_unloaded() always returns -EFAULT on
write_guest_real() failure.
Fixes: 2293897805c2 ("KVM: s390: add architecture compliant guest access functions")
Cc: stable(a)vger.kernel.org
Signed-off-by: Nico Boehr <nrb(a)linux.ibm.com>
Reviewed-by: Heiko Carstens <hca(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20240917151904.74314-2-nrb@linux.ibm.com
Acked-by: Janosch Frank <frankja(a)linux.ibm.com>
Signed-off-by: Heiko Carstens <hca(a)linux.ibm.com>
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index e65f597e3044..a688351f4ab5 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -828,6 +828,8 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
const gfn_t gfn = gpa_to_gfn(gpa);
int rc;
+ if (!gfn_to_memslot(kvm, gfn))
+ return PGM_ADDRESSING;
if (mode == GACC_STORE)
rc = kvm_write_guest_page(kvm, gfn, data, offset, len);
else
@@ -985,6 +987,8 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
gra += fragment_len;
data += fragment_len;
}
+ if (rc > 0)
+ vcpu->arch.pgm.code = rc;
return rc;
}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index b320d12aa049..3fde45a151f2 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -405,11 +405,12 @@ int read_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data,
* @len: number of bytes to copy
*
* Copy @len bytes from @data (kernel space) to @gra (guest real address).
- * It is up to the caller to ensure that the entire guest memory range is
- * valid memory before calling this function.
* Guest low address and key protection are not checked.
*
- * Returns zero on success or -EFAULT on error.
+ * Returns zero on success, -EFAULT when copying from @data failed, or
+ * PGM_ADRESSING in case @gra is outside a memslot. In this case, pgm check info
+ * is also stored to allow injecting into the guest (if applicable) using
+ * kvm_s390_inject_prog_cond().
*
* If an error occurs data may have been copied partially to guest memory.
*/
@@ -428,11 +429,12 @@ int write_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
* @len: number of bytes to copy
*
* Copy @len bytes from @gra (guest real address) to @data (kernel space).
- * It is up to the caller to ensure that the entire guest memory range is
- * valid memory before calling this function.
* Guest key protection is not checked.
*
- * Returns zero on success or -EFAULT on error.
+ * Returns zero on success, -EFAULT when copying to @data failed, or
+ * PGM_ADRESSING in case @gra is outside a memslot. In this case, pgm check info
+ * is also stored to allow injecting into the guest (if applicable) using
+ * kvm_s390_inject_prog_cond().
*
* If an error occurs data may have been copied partially to kernel space.
*/
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x e8061f06185be0a06a73760d6526b8b0feadfe52
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101823-tractor-twitter-a318@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e8061f06185be0a06a73760d6526b8b0feadfe52 Mon Sep 17 00:00:00 2001
From: Nico Boehr <nrb(a)linux.ibm.com>
Date: Tue, 17 Sep 2024 17:18:33 +0200
Subject: [PATCH] KVM: s390: gaccess: Check if guest address is in memslot
Previously, access_guest_page() did not check whether the given guest
address is inside of a memslot. This is not a problem, since
kvm_write_guest_page/kvm_read_guest_page return -EFAULT in this case.
However, -EFAULT is also returned when copy_to/from_user fails.
When emulating a guest instruction, the address being outside a memslot
usually means that an addressing exception should be injected into the
guest.
Failure in copy_to/from_user however indicates that something is wrong
in userspace and hence should be handled there.
To be able to distinguish these two cases, return PGM_ADDRESSING in
access_guest_page() when the guest address is outside guest memory. In
access_guest_real(), populate vcpu->arch.pgm.code such that
kvm_s390_inject_prog_cond() can be used in the caller for injecting into
the guest (if applicable).
Since this adds a new return value to access_guest_page(), we need to make
sure that other callers are not confused by the new positive return value.
There are the following users of access_guest_page():
- access_guest_with_key() does the checking itself (in
guest_range_to_gpas()), so this case should never happen. Even if, the
handling is set up properly.
- access_guest_real() just passes the return code to its callers, which
are:
- read_guest_real() - see below
- write_guest_real() - see below
There are the following users of read_guest_real():
- ar_translation() in gaccess.c which already returns PGM_*
- setup_apcb10(), setup_apcb00(), setup_apcb11() in vsie.c which always
return -EFAULT on read_guest_read() nonzero return - no change
- shadow_crycb(), handle_stfle() always present this as validity, this
could be handled better but doesn't change current behaviour - no change
There are the following users of write_guest_real():
- kvm_s390_store_status_unloaded() always returns -EFAULT on
write_guest_real() failure.
Fixes: 2293897805c2 ("KVM: s390: add architecture compliant guest access functions")
Cc: stable(a)vger.kernel.org
Signed-off-by: Nico Boehr <nrb(a)linux.ibm.com>
Reviewed-by: Heiko Carstens <hca(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20240917151904.74314-2-nrb@linux.ibm.com
Acked-by: Janosch Frank <frankja(a)linux.ibm.com>
Signed-off-by: Heiko Carstens <hca(a)linux.ibm.com>
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index e65f597e3044..a688351f4ab5 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -828,6 +828,8 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
const gfn_t gfn = gpa_to_gfn(gpa);
int rc;
+ if (!gfn_to_memslot(kvm, gfn))
+ return PGM_ADDRESSING;
if (mode == GACC_STORE)
rc = kvm_write_guest_page(kvm, gfn, data, offset, len);
else
@@ -985,6 +987,8 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
gra += fragment_len;
data += fragment_len;
}
+ if (rc > 0)
+ vcpu->arch.pgm.code = rc;
return rc;
}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index b320d12aa049..3fde45a151f2 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -405,11 +405,12 @@ int read_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data,
* @len: number of bytes to copy
*
* Copy @len bytes from @data (kernel space) to @gra (guest real address).
- * It is up to the caller to ensure that the entire guest memory range is
- * valid memory before calling this function.
* Guest low address and key protection are not checked.
*
- * Returns zero on success or -EFAULT on error.
+ * Returns zero on success, -EFAULT when copying from @data failed, or
+ * PGM_ADRESSING in case @gra is outside a memslot. In this case, pgm check info
+ * is also stored to allow injecting into the guest (if applicable) using
+ * kvm_s390_inject_prog_cond().
*
* If an error occurs data may have been copied partially to guest memory.
*/
@@ -428,11 +429,12 @@ int write_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
* @len: number of bytes to copy
*
* Copy @len bytes from @gra (guest real address) to @data (kernel space).
- * It is up to the caller to ensure that the entire guest memory range is
- * valid memory before calling this function.
* Guest key protection is not checked.
*
- * Returns zero on success or -EFAULT on error.
+ * Returns zero on success, -EFAULT when copying to @data failed, or
+ * PGM_ADRESSING in case @gra is outside a memslot. In this case, pgm check info
+ * is also stored to allow injecting into the guest (if applicable) using
+ * kvm_s390_inject_prog_cond().
*
* If an error occurs data may have been copied partially to kernel space.
*/
Currently, the rproc "atomic_t power" variable is incremented during:
a. WPSS rproc auto boot.
b. AHB power on for ath11k.
During AHB power off (rmmod ath11k_ahb.ko), rproc_shutdown fails
to unload the WPSS firmware because the rproc->power value is '2',
causing the atomic_dec_and_test(&rproc->power) condition to fail.
Consequently, during AHB power on (insmod ath11k_ahb.ko),
QMI_WLANFW_HOST_CAP_REQ_V01 fails due to the host and firmware QMI
states being out of sync.
Fixes: 300ed425dfa9 ("remoteproc: qcom_q6v5_pas: Add SC7280 ADSP, CDSP & WPSS")
Cc: stable(a)vger.kernel.org
Signed-off-by: Balaji Pothunoori <quic_bpothuno(a)quicinc.com>
---
v2: updated commit text.
added Fixes/cc:stable tags.
drivers/remoteproc/qcom_q6v5_pas.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c
index ef82835e98a4..05963d7924df 100644
--- a/drivers/remoteproc/qcom_q6v5_pas.c
+++ b/drivers/remoteproc/qcom_q6v5_pas.c
@@ -1344,7 +1344,7 @@ static const struct adsp_data sc7280_wpss_resource = {
.crash_reason_smem = 626,
.firmware_name = "wpss.mdt",
.pas_id = 6,
- .auto_boot = true,
+ .auto_boot = false,
.proxy_pd_names = (char*[]){
"cx",
"mx",
--
2.34.1
commit f011c9cf04c06f16b24f583d313d3c012e589e50 upstream.
The submit queue polling threads are userland threads that just never
exit to the userland. When creating the thread with IORING_SETUP_SQ_AFF,
the affinity of the poller thread is set to the cpu specified in
sq_thread_cpu. However, this CPU can be outside of the cpuset defined
by the cgroup cpuset controller. This violates the rules defined by the
cpuset controller and is a potential issue for realtime applications.
In b7ed6d8ffd6 we fixed the default affinity of the poller thread, in
case no explicit pinning is required by inheriting the one of the
creating task. In case of explicit pinning, the check is more
complicated, as also a cpu outside of the parent cpumask is allowed.
We implemented this by using cpuset_cpus_allowed (that has support for
cgroup cpusets) and testing if the requested cpu is in the set.
Fixes: 37d1e2e3642e ("io_uring: move SQPOLL thread io-wq forked worker")
Signed-off-by: Felix Moessbauer <felix.moessbauer(a)siemens.com>
Link: https://lore.kernel.org/r/20240909150036.55921-1-felix.moessbauer@siemens.c…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
io_uring/io_uring.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 8ed2c65529714..6b6fd244233f8 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -56,6 +56,7 @@
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
+#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
@@ -8746,10 +8747,12 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
return 0;
if (p->flags & IORING_SETUP_SQ_AFF) {
+ struct cpumask allowed_mask;
int cpu = p->sq_thread_cpu;
ret = -EINVAL;
- if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+ cpuset_cpus_allowed(current, &allowed_mask);
+ if (!cpumask_test_cpu(cpu, &allowed_mask))
goto err_sqpoll;
sqd->sq_cpu = cpu;
} else {
--
2.39.5
Svacer reports a NULL-pointer dereference in rtl8xxxu_probe().
After having been compared to a NULL value, pointer hw is passed as
1st parameter in call to ieee80211_free_hw(), where it is dereferenced.
The problem is present in 5.10 stable release and can be fixed by the
following upstream patch that can be cleanly applied to 5.10 branch.
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x bea07fd63192b61209d48cbb81ef474cc3ee4c62
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101840-army-handstand-92f8@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bea07fd63192b61209d48cbb81ef474cc3ee4c62 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Date: Mon, 7 Oct 2024 16:28:32 +0100
Subject: [PATCH] maple_tree: correct tree corruption on spanning store
Patch series "maple_tree: correct tree corruption on spanning store", v3.
There has been a nasty yet subtle maple tree corruption bug that appears
to have been in existence since the inception of the algorithm.
This bug seems far more likely to happen since commit f8d112a4e657
("mm/mmap: avoid zeroing vma tree in mmap_region()"), which is the point
at which reports started to be submitted concerning this bug.
We were made definitely aware of the bug thanks to the kind efforts of
Bert Karwatzki who helped enormously in my being able to track this down
and identify the cause of it.
The bug arises when an attempt is made to perform a spanning store across
two leaf nodes, where the right leaf node is the rightmost child of the
shared parent, AND the store completely consumes the right-mode node.
This results in mas_wr_spanning_store() mitakenly duplicating the new and
existing entries at the maximum pivot within the range, and thus maple
tree corruption.
The fix patch corrects this by detecting this scenario and disallowing the
mistaken duplicate copy.
The fix patch commit message goes into great detail as to how this occurs.
This series also includes a test which reliably reproduces the issue, and
asserts that the fix works correctly.
Bert has kindly tested the fix and confirmed it resolved his issues. Also
Mikhail Gavrilov kindly reported what appears to be precisely the same
bug, which this fix should also resolve.
This patch (of 2):
There has been a subtle bug present in the maple tree implementation from
its inception.
This arises from how stores are performed - when a store occurs, it will
overwrite overlapping ranges and adjust the tree as necessary to
accommodate this.
A range may always ultimately span two leaf nodes. In this instance we
walk the two leaf nodes, determine which elements are not overwritten to
the left and to the right of the start and end of the ranges respectively
and then rebalance the tree to contain these entries and the newly
inserted one.
This kind of store is dubbed a 'spanning store' and is implemented by
mas_wr_spanning_store().
In order to reach this stage, mas_store_gfp() invokes
mas_wr_preallocate(), mas_wr_store_type() and mas_wr_walk() in turn to
walk the tree and update the object (mas) to traverse to the location
where the write should be performed, determining its store type.
When a spanning store is required, this function returns false stopping at
the parent node which contains the target range, and mas_wr_store_type()
marks the mas->store_type as wr_spanning_store to denote this fact.
When we go to perform the store in mas_wr_spanning_store(), we first
determine the elements AFTER the END of the range we wish to store (that
is, to the right of the entry to be inserted) - we do this by walking to
the NEXT pivot in the tree (i.e. r_mas.last + 1), starting at the node we
have just determined contains the range over which we intend to write.
We then turn our attention to the entries to the left of the entry we are
inserting, whose state is represented by l_mas, and copy these into a 'big
node', which is a special node which contains enough slots to contain two
leaf node's worth of data.
We then copy the entry we wish to store immediately after this - the copy
and the insertion of the new entry is performed by mas_store_b_node().
After this we copy the elements to the right of the end of the range which
we are inserting, if we have not exceeded the length of the node (i.e.
r_mas.offset <= r_mas.end).
Herein lies the bug - under very specific circumstances, this logic can
break and corrupt the maple tree.
Consider the following tree:
Height
0 Root Node
/ \
pivot = 0xffff / \ pivot = ULONG_MAX
/ \
1 A [-----] ...
/ \
pivot = 0x4fff / \ pivot = 0xffff
/ \
2 (LEAVES) B [-----] [-----] C
^--- Last pivot 0xffff.
Now imagine we wish to store an entry in the range [0x4000, 0xffff] (note
that all ranges expressed in maple tree code are inclusive):
1. mas_store_gfp() descends the tree, finds node A at <=0xffff, then
determines that this is a spanning store across nodes B and C. The mas
state is set such that the current node from which we traverse further
is node A.
2. In mas_wr_spanning_store() we try to find elements to the right of pivot
0xffff by searching for an index of 0x10000:
- mas_wr_walk_index() invokes mas_wr_walk_descend() and
mas_wr_node_walk() in turn.
- mas_wr_node_walk() loops over entries in node A until EITHER it
finds an entry whose pivot equals or exceeds 0x10000 OR it
reaches the final entry.
- Since no entry has a pivot equal to or exceeding 0x10000, pivot
0xffff is selected, leading to node C.
- mas_wr_walk_traverse() resets the mas state to traverse node C. We
loop around and invoke mas_wr_walk_descend() and mas_wr_node_walk()
in turn once again.
- Again, we reach the last entry in node C, which has a pivot of
0xffff.
3. We then copy the elements to the left of 0x4000 in node B to the big
node via mas_store_b_node(), and insert the new [0x4000, 0xffff] entry
too.
4. We determine whether we have any entries to copy from the right of the
end of the range via - and with r_mas set up at the entry at pivot
0xffff, r_mas.offset <= r_mas.end, and then we DUPLICATE the entry at
pivot 0xffff.
5. BUG! The maple tree is corrupted with a duplicate entry.
This requires a very specific set of circumstances - we must be spanning
the last element in a leaf node, which is the last element in the parent
node.
spanning store across two leaf nodes with a range that ends at that shared
pivot.
A potential solution to this problem would simply be to reset the walk
each time we traverse r_mas, however given the rarity of this situation it
seems that would be rather inefficient.
Instead, this patch detects if the right hand node is populated, i.e. has
anything we need to copy.
We do so by only copying elements from the right of the entry being
inserted when the maximum value present exceeds the last, rather than
basing this on offset position.
The patch also updates some comments and eliminates the unused bool return
value in mas_wr_walk_index().
The work performed in commit f8d112a4e657 ("mm/mmap: avoid zeroing vma
tree in mmap_region()") seems to have made the probability of this event
much more likely, which is the point at which reports started to be
submitted concerning this bug.
The motivation for this change arose from Bert Karwatzki's report of
encountering mm instability after the release of kernel v6.12-rc1 which,
after the use of CONFIG_DEBUG_VM_MAPLE_TREE and similar configuration
options, was identified as maple tree corruption.
After Bert very generously provided his time and ability to reproduce this
event consistently, I was able to finally identify that the issue
discussed in this commit message was occurring for him.
Link: https://lkml.kernel.org/r/cover.1728314402.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/48b349a2a0f7c76e18772712d0997a5e12ab0a3b.17283144…
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Reported-by: Bert Karwatzki <spasswolf(a)web.de>
Closes: https://lore.kernel.org/all/20241001023402.3374-1-spasswolf@web.de/
Tested-by: Bert Karwatzki <spasswolf(a)web.de>
Reported-by: Mikhail Gavrilov <mikhail.v.gavrilov(a)gmail.com>
Closes: https://lore.kernel.org/all/CABXGCsOPwuoNOqSMmAvWO2Fz4TEmPnjFj-b7iF+XFRu1h7…
Acked-by: Vlastimil Babka <vbabka(a)suse.cz>
Reviewed-by: Liam R. Howlett <Liam.Howlett(a)Oracle.com>
Tested-by: Mikhail Gavrilov <mikhail.v.gavrilov(a)gmail.com>
Reviewed-by: Wei Yang <richard.weiyang(a)gmail.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Sidhartha Kumar <sidhartha.kumar(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index ce7c7a7a8258..3619301dda2e 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -2196,6 +2196,8 @@ static inline void mas_node_or_none(struct ma_state *mas,
/*
* mas_wr_node_walk() - Find the correct offset for the index in the @mas.
+ * If @mas->index cannot be found within the containing
+ * node, we traverse to the last entry in the node.
* @wr_mas: The maple write state
*
* Uses mas_slot_locked() and does not need to worry about dead nodes.
@@ -3532,7 +3534,7 @@ static bool mas_wr_walk(struct ma_wr_state *wr_mas)
return true;
}
-static bool mas_wr_walk_index(struct ma_wr_state *wr_mas)
+static void mas_wr_walk_index(struct ma_wr_state *wr_mas)
{
struct ma_state *mas = wr_mas->mas;
@@ -3541,11 +3543,9 @@ static bool mas_wr_walk_index(struct ma_wr_state *wr_mas)
wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
mas->offset);
if (ma_is_leaf(wr_mas->type))
- return true;
+ return;
mas_wr_walk_traverse(wr_mas);
-
}
- return true;
}
/*
* mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs.
@@ -3765,8 +3765,8 @@ static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas)
memset(&b_node, 0, sizeof(struct maple_big_node));
/* Copy l_mas and store the value in b_node. */
mas_store_b_node(&l_wr_mas, &b_node, l_mas.end);
- /* Copy r_mas into b_node. */
- if (r_mas.offset <= r_mas.end)
+ /* Copy r_mas into b_node if there is anything to copy. */
+ if (r_mas.max > r_mas.last)
mas_mab_cp(&r_mas, r_mas.offset, r_mas.end,
&b_node, b_node.b_end + 1);
else
Avoid xHC host from processing a cancelled URB by always turning
cancelled URB TDs into no-op TRBs before queuing a 'Set TR Deq' command.
If the command fails then xHC will start processing the cancelled TD
instead of skipping it once endpoint is restarted, causing issues like
Babble error.
This is not a complete solution as a failed 'Set TR Deq' command does not
guarantee xHC TRB caches are cleared.
Fixes: 4db356924a50 ("xhci: turn cancelled td cleanup to its own function")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
---
drivers/usb/host/xhci-ring.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 4d664ba53fe9..7dedf31bbddd 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -1023,7 +1023,7 @@ static int xhci_invalidate_cancelled_tds(struct xhci_virt_ep *ep)
td_to_noop(xhci, ring, cached_td, false);
cached_td->cancel_status = TD_CLEARED;
}
-
+ td_to_noop(xhci, ring, td, false);
td->cancel_status = TD_CLEARING_CACHE;
cached_td = td;
break;
--
2.25.1
The PLL checks are comparing 64 bit integers with 32 bit
ones, as reported by Coverity. Depending on the values of
the variables, this may underflow.
Fix it ensuring that both sides of the expression are u64.
Fixes: 852b50aeed15 ("media: On Semi AR0521 sensor driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei(a)kernel.org>
Acked-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
---
drivers/media/i2c/ar0521.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/media/i2c/ar0521.c b/drivers/media/i2c/ar0521.c
index fc27238dd4d3..24873149096c 100644
--- a/drivers/media/i2c/ar0521.c
+++ b/drivers/media/i2c/ar0521.c
@@ -255,10 +255,10 @@ static u32 calc_pll(struct ar0521_dev *sensor, u32 freq, u16 *pre_ptr, u16 *mult
continue; /* Minimum value */
if (new_mult > 254)
break; /* Maximum, larger pre won't work either */
- if (sensor->extclk_freq * (u64)new_mult < AR0521_PLL_MIN *
+ if (sensor->extclk_freq * (u64)new_mult < (u64)AR0521_PLL_MIN *
new_pre)
continue;
- if (sensor->extclk_freq * (u64)new_mult > AR0521_PLL_MAX *
+ if (sensor->extclk_freq * (u64)new_mult > (u64)AR0521_PLL_MAX *
new_pre)
break; /* Larger pre won't work either */
new_pll = div64_round_up(sensor->extclk_freq * (u64)new_mult,
--
2.47.0
Svacer reports possible dereference of a NULL-pointer in
amd_iommu_probe_finalize(). The problem is present in 5.10 stable release
and can be fixed by the following upstream patch. In order to apply this
patch, the incoming changes had to be manually accepted. This action was
necessary due to some differences in the code of amd_iommu_probe_finalize()
of the upstream version and 5.10 version of the kernel.
A commit adding back the stopping of tx on port shutdown failed to add
back the locking which had also been removed by commit e83766334f96
("tty: serial: qcom_geni_serial: No need to stop tx/rx on UART
shutdown").
Holding the port lock is needed to serialise against the console code,
which may update the interrupt enable register and access the port
state.
Fixes: d8aca2f96813 ("tty: serial: qcom-geni-serial: stop operations in progress at shutdown")
Fixes: 947cc4ecc06c ("serial: qcom-geni: fix soft lockup on sw flow control and suspend")
Cc: stable(a)vger.kernel.org # 6.3
Cc: Bartosz Golaszewski <bartosz.golaszewski(a)linaro.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/tty/serial/qcom_geni_serial.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c
index 9ea6bd09e665..b6a8729cee6d 100644
--- a/drivers/tty/serial/qcom_geni_serial.c
+++ b/drivers/tty/serial/qcom_geni_serial.c
@@ -1096,10 +1096,12 @@ static void qcom_geni_serial_shutdown(struct uart_port *uport)
{
disable_irq(uport->irq);
+ uart_port_lock_irq(uport);
qcom_geni_serial_stop_tx(uport);
qcom_geni_serial_stop_rx(uport);
qcom_geni_serial_cancel_tx_cmd(uport);
+ uart_port_unlock_irq(uport);
}
static void qcom_geni_serial_flush_buffer(struct uart_port *uport)
--
2.45.2
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x bea07fd63192b61209d48cbb81ef474cc3ee4c62
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101818-ducky-dallying-2814@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bea07fd63192b61209d48cbb81ef474cc3ee4c62 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Date: Mon, 7 Oct 2024 16:28:32 +0100
Subject: [PATCH] maple_tree: correct tree corruption on spanning store
Patch series "maple_tree: correct tree corruption on spanning store", v3.
There has been a nasty yet subtle maple tree corruption bug that appears
to have been in existence since the inception of the algorithm.
This bug seems far more likely to happen since commit f8d112a4e657
("mm/mmap: avoid zeroing vma tree in mmap_region()"), which is the point
at which reports started to be submitted concerning this bug.
We were made definitely aware of the bug thanks to the kind efforts of
Bert Karwatzki who helped enormously in my being able to track this down
and identify the cause of it.
The bug arises when an attempt is made to perform a spanning store across
two leaf nodes, where the right leaf node is the rightmost child of the
shared parent, AND the store completely consumes the right-mode node.
This results in mas_wr_spanning_store() mitakenly duplicating the new and
existing entries at the maximum pivot within the range, and thus maple
tree corruption.
The fix patch corrects this by detecting this scenario and disallowing the
mistaken duplicate copy.
The fix patch commit message goes into great detail as to how this occurs.
This series also includes a test which reliably reproduces the issue, and
asserts that the fix works correctly.
Bert has kindly tested the fix and confirmed it resolved his issues. Also
Mikhail Gavrilov kindly reported what appears to be precisely the same
bug, which this fix should also resolve.
This patch (of 2):
There has been a subtle bug present in the maple tree implementation from
its inception.
This arises from how stores are performed - when a store occurs, it will
overwrite overlapping ranges and adjust the tree as necessary to
accommodate this.
A range may always ultimately span two leaf nodes. In this instance we
walk the two leaf nodes, determine which elements are not overwritten to
the left and to the right of the start and end of the ranges respectively
and then rebalance the tree to contain these entries and the newly
inserted one.
This kind of store is dubbed a 'spanning store' and is implemented by
mas_wr_spanning_store().
In order to reach this stage, mas_store_gfp() invokes
mas_wr_preallocate(), mas_wr_store_type() and mas_wr_walk() in turn to
walk the tree and update the object (mas) to traverse to the location
where the write should be performed, determining its store type.
When a spanning store is required, this function returns false stopping at
the parent node which contains the target range, and mas_wr_store_type()
marks the mas->store_type as wr_spanning_store to denote this fact.
When we go to perform the store in mas_wr_spanning_store(), we first
determine the elements AFTER the END of the range we wish to store (that
is, to the right of the entry to be inserted) - we do this by walking to
the NEXT pivot in the tree (i.e. r_mas.last + 1), starting at the node we
have just determined contains the range over which we intend to write.
We then turn our attention to the entries to the left of the entry we are
inserting, whose state is represented by l_mas, and copy these into a 'big
node', which is a special node which contains enough slots to contain two
leaf node's worth of data.
We then copy the entry we wish to store immediately after this - the copy
and the insertion of the new entry is performed by mas_store_b_node().
After this we copy the elements to the right of the end of the range which
we are inserting, if we have not exceeded the length of the node (i.e.
r_mas.offset <= r_mas.end).
Herein lies the bug - under very specific circumstances, this logic can
break and corrupt the maple tree.
Consider the following tree:
Height
0 Root Node
/ \
pivot = 0xffff / \ pivot = ULONG_MAX
/ \
1 A [-----] ...
/ \
pivot = 0x4fff / \ pivot = 0xffff
/ \
2 (LEAVES) B [-----] [-----] C
^--- Last pivot 0xffff.
Now imagine we wish to store an entry in the range [0x4000, 0xffff] (note
that all ranges expressed in maple tree code are inclusive):
1. mas_store_gfp() descends the tree, finds node A at <=0xffff, then
determines that this is a spanning store across nodes B and C. The mas
state is set such that the current node from which we traverse further
is node A.
2. In mas_wr_spanning_store() we try to find elements to the right of pivot
0xffff by searching for an index of 0x10000:
- mas_wr_walk_index() invokes mas_wr_walk_descend() and
mas_wr_node_walk() in turn.
- mas_wr_node_walk() loops over entries in node A until EITHER it
finds an entry whose pivot equals or exceeds 0x10000 OR it
reaches the final entry.
- Since no entry has a pivot equal to or exceeding 0x10000, pivot
0xffff is selected, leading to node C.
- mas_wr_walk_traverse() resets the mas state to traverse node C. We
loop around and invoke mas_wr_walk_descend() and mas_wr_node_walk()
in turn once again.
- Again, we reach the last entry in node C, which has a pivot of
0xffff.
3. We then copy the elements to the left of 0x4000 in node B to the big
node via mas_store_b_node(), and insert the new [0x4000, 0xffff] entry
too.
4. We determine whether we have any entries to copy from the right of the
end of the range via - and with r_mas set up at the entry at pivot
0xffff, r_mas.offset <= r_mas.end, and then we DUPLICATE the entry at
pivot 0xffff.
5. BUG! The maple tree is corrupted with a duplicate entry.
This requires a very specific set of circumstances - we must be spanning
the last element in a leaf node, which is the last element in the parent
node.
spanning store across two leaf nodes with a range that ends at that shared
pivot.
A potential solution to this problem would simply be to reset the walk
each time we traverse r_mas, however given the rarity of this situation it
seems that would be rather inefficient.
Instead, this patch detects if the right hand node is populated, i.e. has
anything we need to copy.
We do so by only copying elements from the right of the entry being
inserted when the maximum value present exceeds the last, rather than
basing this on offset position.
The patch also updates some comments and eliminates the unused bool return
value in mas_wr_walk_index().
The work performed in commit f8d112a4e657 ("mm/mmap: avoid zeroing vma
tree in mmap_region()") seems to have made the probability of this event
much more likely, which is the point at which reports started to be
submitted concerning this bug.
The motivation for this change arose from Bert Karwatzki's report of
encountering mm instability after the release of kernel v6.12-rc1 which,
after the use of CONFIG_DEBUG_VM_MAPLE_TREE and similar configuration
options, was identified as maple tree corruption.
After Bert very generously provided his time and ability to reproduce this
event consistently, I was able to finally identify that the issue
discussed in this commit message was occurring for him.
Link: https://lkml.kernel.org/r/cover.1728314402.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/48b349a2a0f7c76e18772712d0997a5e12ab0a3b.17283144…
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Reported-by: Bert Karwatzki <spasswolf(a)web.de>
Closes: https://lore.kernel.org/all/20241001023402.3374-1-spasswolf@web.de/
Tested-by: Bert Karwatzki <spasswolf(a)web.de>
Reported-by: Mikhail Gavrilov <mikhail.v.gavrilov(a)gmail.com>
Closes: https://lore.kernel.org/all/CABXGCsOPwuoNOqSMmAvWO2Fz4TEmPnjFj-b7iF+XFRu1h7…
Acked-by: Vlastimil Babka <vbabka(a)suse.cz>
Reviewed-by: Liam R. Howlett <Liam.Howlett(a)Oracle.com>
Tested-by: Mikhail Gavrilov <mikhail.v.gavrilov(a)gmail.com>
Reviewed-by: Wei Yang <richard.weiyang(a)gmail.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Sidhartha Kumar <sidhartha.kumar(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index ce7c7a7a8258..3619301dda2e 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -2196,6 +2196,8 @@ static inline void mas_node_or_none(struct ma_state *mas,
/*
* mas_wr_node_walk() - Find the correct offset for the index in the @mas.
+ * If @mas->index cannot be found within the containing
+ * node, we traverse to the last entry in the node.
* @wr_mas: The maple write state
*
* Uses mas_slot_locked() and does not need to worry about dead nodes.
@@ -3532,7 +3534,7 @@ static bool mas_wr_walk(struct ma_wr_state *wr_mas)
return true;
}
-static bool mas_wr_walk_index(struct ma_wr_state *wr_mas)
+static void mas_wr_walk_index(struct ma_wr_state *wr_mas)
{
struct ma_state *mas = wr_mas->mas;
@@ -3541,11 +3543,9 @@ static bool mas_wr_walk_index(struct ma_wr_state *wr_mas)
wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
mas->offset);
if (ma_is_leaf(wr_mas->type))
- return true;
+ return;
mas_wr_walk_traverse(wr_mas);
-
}
- return true;
}
/*
* mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs.
@@ -3765,8 +3765,8 @@ static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas)
memset(&b_node, 0, sizeof(struct maple_big_node));
/* Copy l_mas and store the value in b_node. */
mas_store_b_node(&l_wr_mas, &b_node, l_mas.end);
- /* Copy r_mas into b_node. */
- if (r_mas.offset <= r_mas.end)
+ /* Copy r_mas into b_node if there is anything to copy. */
+ if (r_mas.max > r_mas.last)
mas_mab_cp(&r_mas, r_mas.offset, r_mas.end,
&b_node, b_node.b_end + 1);
else
Svacer reports redundant comparison in cdns_xfer_msg(). The problem is
present in 5.10 stable release and can be fixed by the following
upstream patch that can be cleanly applied to 5.10 stable branch.
From: Johannes Berg <johannes.berg(a)intel.com>
[ Upstream commit 31db78a4923ef5e2008f2eed321811ca79e7f71b ]
When ieee80211_key_link() is called by ieee80211_gtk_rekey_add()
but returns 0 due to KRACK protection (identical key reinstall),
ieee80211_gtk_rekey_add() will still return a pointer into the
key, in a potential use-after-free. This normally doesn't happen
since it's only called by iwlwifi in case of WoWLAN rekey offload
which has its own KRACK protection, but still better to fix, do
that by returning an error code and converting that to success on
the cfg80211 boundary only, leaving the error for bad callers of
ieee80211_gtk_rekey_add().
Reported-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Fixes: fdf7cb4185b6 ("mac80211: accept key reinstall without changing anything")
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
[Sherry: bp to fix CVE-2023-52530, resolved minor conflicts in
net/mac80211/cfg.c because of context change due to missing commit
23a5f0af6ff4 ("wifi: mac80211: remove cipher scheme support")
ccdde7c74ffd ("wifi: mac80211: properly implement MLO key handling")]
Signed-off-by: Sherry Yang <sherry.yang(a)oracle.com>
---
net/mac80211/cfg.c | 3 +++
net/mac80211/key.c | 2 +-
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index f652982a106b..c54b3be62c0a 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -511,6 +511,9 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
sta->cipher_scheme = cs;
err = ieee80211_key_link(key, sdata, sta);
+ /* KRACK protection, shouldn't happen but just silently accept key */
+ if (err == -EALREADY)
+ err = 0;
out_unlock:
mutex_unlock(&local->sta_mtx);
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index f695fc80088b..7b427e39831b 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -843,7 +843,7 @@ int ieee80211_key_link(struct ieee80211_key *key,
*/
if (ieee80211_key_identical(sdata, old_key, key)) {
ieee80211_key_free_unused(key);
- ret = 0;
+ ret = -EALREADY;
goto out;
}
--
2.46.0
[ Upstream commit 7c2fd76048e95dd267055b5f5e0a48e6e7c81fd9 ]
On an NVMe namespace that does not support metadata, it is possible to
send an IO command with metadata through io-passthru. This allows issues
like [1] to trigger in the completion code path.
nvme_map_user_request() doesn't check if the namespace supports metadata
before sending it forward. It also allows admin commands with metadata to
be processed as it ignores metadata when bdev == NULL and may report
success.
Reject an IO command with metadata when the NVMe namespace doesn't
support it and reject an admin command if it has metadata.
[1] https://lore.kernel.org/all/mb61pcylvnym8.fsf@amazon.com/
Suggested-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Sagi Grimberg <sagi(a)grimberg.me>
Reviewed-by: Anuj Gupta <anuj20.g(a)samsung.com>
Signed-off-by: Keith Busch <kbusch(a)kernel.org>
[ Minor changes to make it work on 6.1 ]
Signed-off-by: Puranjay Mohan <pjy(a)amazon.com>
---
drivers/nvme/host/ioctl.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index b3e322e4ade38..a02873792890e 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -3,6 +3,7 @@
* Copyright (c) 2011-2014, Intel Corporation.
* Copyright (c) 2017-2021 Christoph Hellwig.
*/
+#include <linux/blk-integrity.h>
#include <linux/ptrace.h> /* for force_successful_syscall_return */
#include <linux/nvme_ioctl.h>
#include <linux/io_uring.h>
@@ -95,10 +96,15 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
struct request_queue *q = req->q;
struct nvme_ns *ns = q->queuedata;
struct block_device *bdev = ns ? ns->disk->part0 : NULL;
+ bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk);
+ bool has_metadata = meta_buffer && meta_len;
struct bio *bio = NULL;
void *meta = NULL;
int ret;
+ if (has_metadata && !supports_metadata)
+ return -EINVAL;
+
if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
struct iov_iter iter;
@@ -122,7 +128,7 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
if (bdev)
bio_set_dev(bio, bdev);
- if (bdev && meta_buffer && meta_len) {
+ if (has_metadata) {
meta = nvme_add_user_metadata(req, meta_buffer, meta_len,
meta_seed);
if (IS_ERR(meta)) {
--
2.40.1
Upstream commit c2368b19807a ("net: devlink: introduce "unregistering"
mark and use it during devlinks iteration") in v6.0 introduced a race
when unregistering a devlink instance that can result in RCU stalls and
in the system completely locking up. Exact details and reproducer can be
found here [1]. The bug was inadvertently fixed in v6.3 by upstream
commit d77278196441 ("devlink: bump the instance index directly when
iterating").
This patchset fixes the bug by backporting the second commit and a
related dependency from v6.3 to v6.1.y while adjusting them to the
devlink file structure in v6.1.y (net/devlink/{core.c,devl_internal.h}
-> net/devlink/leftover.c).
Tested by running the devlink tests under
tools/testing/selftests/drivers/net/netdevsim/ and the reproducer
mentioned in [1].
[1] https://lore.kernel.org/stable/20241001112035.973187-1-idosch@nvidia.com/
Jakub Kicinski (2):
devlink: drop the filter argument from devlinks_xa_find_get
devlink: bump the instance index directly when iterating
net/devlink/leftover.c | 40 ++++++++++------------------------------
1 file changed, 10 insertions(+), 30 deletions(-)
--
2.47.0
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 532b53cebe58f34ce1c0f34d866f5c0e335c53c6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101412-prowling-snowflake-9fe0@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
532b53cebe58 ("secretmem: disable memfd_secret() if arch cannot set direct map")
f7c5b1aab5ef ("mm/secretmem: remove reduntant return value")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 532b53cebe58f34ce1c0f34d866f5c0e335c53c6 Mon Sep 17 00:00:00 2001
From: Patrick Roy <roypat(a)amazon.co.uk>
Date: Tue, 1 Oct 2024 09:00:41 +0100
Subject: [PATCH] secretmem: disable memfd_secret() if arch cannot set direct
map
Return -ENOSYS from memfd_secret() syscall if !can_set_direct_map(). This
is the case for example on some arm64 configurations, where marking 4k
PTEs in the direct map not present can only be done if the direct map is
set up at 4k granularity in the first place (as ARM's break-before-make
semantics do not easily allow breaking apart large/gigantic pages).
More precisely, on arm64 systems with !can_set_direct_map(),
set_direct_map_invalid_noflush() is a no-op, however it returns success
(0) instead of an error. This means that memfd_secret will seemingly
"work" (e.g. syscall succeeds, you can mmap the fd and fault in pages),
but it does not actually achieve its goal of removing its memory from the
direct map.
Note that with this patch, memfd_secret() will start erroring on systems
where can_set_direct_map() returns false (arm64 with
CONFIG_RODATA_FULL_DEFAULT_ENABLED=n, CONFIG_DEBUG_PAGEALLOC=n and
CONFIG_KFENCE=n), but that still seems better than the current silent
failure. Since CONFIG_RODATA_FULL_DEFAULT_ENABLED defaults to 'y', most
arm64 systems actually have a working memfd_secret() and aren't be
affected.
From going through the iterations of the original memfd_secret patch
series, it seems that disabling the syscall in these scenarios was the
intended behavior [1] (preferred over having
set_direct_map_invalid_noflush return an error as that would result in
SIGBUSes at page-fault time), however the check for it got dropped between
v16 [2] and v17 [3], when secretmem moved away from CMA allocations.
[1]: https://lore.kernel.org/lkml/20201124164930.GK8537@kernel.org/
[2]: https://lore.kernel.org/lkml/20210121122723.3446-11-rppt@kernel.org/#t
[3]: https://lore.kernel.org/lkml/20201125092208.12544-10-rppt@kernel.org/
Link: https://lkml.kernel.org/r/20241001080056.784735-1-roypat@amazon.co.uk
Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas")
Signed-off-by: Patrick Roy <roypat(a)amazon.co.uk>
Reviewed-by: Mike Rapoport (Microsoft) <rppt(a)kernel.org>
Cc: Alexander Graf <graf(a)amazon.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: James Gowans <jgowans(a)amazon.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 3afb5ad701e1..399552814fd0 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -238,7 +238,7 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
/* make sure local flags do not confict with global fcntl.h */
BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC);
- if (!secretmem_enable)
+ if (!secretmem_enable || !can_set_direct_map())
return -ENOSYS;
if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC))
@@ -280,7 +280,7 @@ static struct file_system_type secretmem_fs = {
static int __init secretmem_init(void)
{
- if (!secretmem_enable)
+ if (!secretmem_enable || !can_set_direct_map())
return 0;
secretmem_mnt = kern_mount(&secretmem_fs);
The patch below does not apply to the 6.11-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.11.y
git checkout FETCH_HEAD
git cherry-pick -x 2b0f922323ccfa76219bcaacd35cd50aeaa1359
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101841-keep-coma-4963@gregkh' --subject-prefix 'PATCH 6.11.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2b0f922323ccfa76219bcaacd35cd50aeaa13592 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david(a)redhat.com>
Date: Fri, 11 Oct 2024 12:24:45 +0200
Subject: [PATCH] mm: don't install PMD mappings when THPs are disabled by the
hw/process/vma
We (or rather, readahead logic :) ) might be allocating a THP in the
pagecache and then try mapping it into a process that explicitly disabled
THP: we might end up installing PMD mappings.
This is a problem for s390x KVM, which explicitly remaps all PMD-mapped
THPs to be PTE-mapped in s390_enable_sie()->thp_split_mm(), before
starting the VM.
For example, starting a VM backed on a file system with large folios
supported makes the VM crash when the VM tries accessing such a mapping
using KVM.
Is it also a problem when the HW disabled THP using
TRANSPARENT_HUGEPAGE_UNSUPPORTED? At least on x86 this would be the case
without X86_FEATURE_PSE.
In the future, we might be able to do better on s390x and only disallow
PMD mappings -- what s390x and likely TRANSPARENT_HUGEPAGE_UNSUPPORTED
really wants. For now, fix it by essentially performing the same check as
would be done in __thp_vma_allowable_orders() or in shmem code, where this
works as expected, and disallow PMD mappings, making us fallback to PTE
mappings.
Link: https://lkml.kernel.org/r/20241011102445.934409-3-david@redhat.com
Fixes: 793917d997df ("mm/readahead: Add large folio readahead")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reported-by: Leo Fu <bfu(a)redhat.com>
Tested-by: Thomas Huth <thuth(a)redhat.com>
Cc: Thomas Huth <thuth(a)redhat.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Cc: Janosch Frank <frankja(a)linux.ibm.com>
Cc: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/memory.c b/mm/memory.c
index c0869a962ddd..30feedabc932 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4920,6 +4920,15 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
pmd_t entry;
vm_fault_t ret = VM_FAULT_FALLBACK;
+ /*
+ * It is too late to allocate a small folio, we already have a large
+ * folio in the pagecache: especially s390 KVM cannot tolerate any
+ * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
+ * PMD mappings if THPs are disabled.
+ */
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
+ return ret;
+
if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
return ret;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 963756aac1f011d904ddd9548ae82286d3a91f96
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101848-lucid-mountain-2cdf@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 963756aac1f011d904ddd9548ae82286d3a91f96 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Date: Fri, 11 Oct 2024 12:24:44 +0200
Subject: [PATCH] mm: huge_memory: add vma_thp_disabled() and
thp_disabled_by_hw()
Patch series "mm: don't install PMD mappings when THPs are disabled by the
hw/process/vma".
During testing, it was found that we can get PMD mappings in processes
where THP (and more precisely, PMD mappings) are supposed to be disabled.
While it works as expected for anon+shmem, the pagecache is the
problematic bit.
For s390 KVM this currently means that a VM backed by a file located on
filesystem with large folio support can crash when KVM tries accessing the
problematic page, because the readahead logic might decide to use a
PMD-sized THP and faulting it into the page tables will install a PMD
mapping, something that s390 KVM cannot tolerate.
This might also be a problem with HW that does not support PMD mappings,
but I did not try reproducing it.
Fix it by respecting the ways to disable THPs when deciding whether we can
install a PMD mapping. khugepaged should already be taking care of not
collapsing if THPs are effectively disabled for the hw/process/vma.
This patch (of 2):
Add vma_thp_disabled() and thp_disabled_by_hw() helpers to be shared by
shmem_allowable_huge_orders() and __thp_vma_allowable_orders().
[david(a)redhat.com: rename to vma_thp_disabled(), split out thp_disabled_by_hw() ]
Link: https://lkml.kernel.org/r/20241011102445.934409-2-david@redhat.com
Fixes: 793917d997df ("mm/readahead: Add large folio readahead")
Signed-off-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reported-by: Leo Fu <bfu(a)redhat.com>
Tested-by: Thomas Huth <thuth(a)redhat.com>
Reviewed-by: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Boqiao Fu <bfu(a)redhat.com>
Cc: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Cc: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Janosch Frank <frankja(a)linux.ibm.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 67d0ab3c3bba..ef5b80e48599 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -322,6 +322,24 @@ struct thpsize {
(transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
+static inline bool vma_thp_disabled(struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ /*
+ * Explicitly disabled through madvise or prctl, or some
+ * architectures may disable THP for some mappings, for
+ * example, s390 kvm.
+ */
+ return (vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags);
+}
+
+static inline bool thp_disabled_by_hw(void)
+{
+ /* If the hardware/firmware marked hugepage support disabled. */
+ return transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED);
+}
+
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags);
unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 87b49ecc7b1e..2fb328880b50 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -109,18 +109,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
if (!vma->vm_mm) /* vdso */
return 0;
- /*
- * Explicitly disabled through madvise or prctl, or some
- * architectures may disable THP for some mappings, for
- * example, s390 kvm.
- * */
- if ((vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
- return 0;
- /*
- * If the hardware/firmware marked hugepage support disabled.
- */
- if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
return 0;
/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
diff --git a/mm/shmem.c b/mm/shmem.c
index 4f11b5506363..c5adb987b23c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1664,12 +1664,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
loff_t i_size;
int order;
- if (vma && ((vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
- return 0;
-
- /* If the hardware/firmware marked hugepage support disabled. */
- if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
+ if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
return 0;
global_huge = shmem_huge_global_enabled(inode, index, write_end,
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 963756aac1f011d904ddd9548ae82286d3a91f96
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101842-flatness-osmosis-b08e@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 963756aac1f011d904ddd9548ae82286d3a91f96 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Date: Fri, 11 Oct 2024 12:24:44 +0200
Subject: [PATCH] mm: huge_memory: add vma_thp_disabled() and
thp_disabled_by_hw()
Patch series "mm: don't install PMD mappings when THPs are disabled by the
hw/process/vma".
During testing, it was found that we can get PMD mappings in processes
where THP (and more precisely, PMD mappings) are supposed to be disabled.
While it works as expected for anon+shmem, the pagecache is the
problematic bit.
For s390 KVM this currently means that a VM backed by a file located on
filesystem with large folio support can crash when KVM tries accessing the
problematic page, because the readahead logic might decide to use a
PMD-sized THP and faulting it into the page tables will install a PMD
mapping, something that s390 KVM cannot tolerate.
This might also be a problem with HW that does not support PMD mappings,
but I did not try reproducing it.
Fix it by respecting the ways to disable THPs when deciding whether we can
install a PMD mapping. khugepaged should already be taking care of not
collapsing if THPs are effectively disabled for the hw/process/vma.
This patch (of 2):
Add vma_thp_disabled() and thp_disabled_by_hw() helpers to be shared by
shmem_allowable_huge_orders() and __thp_vma_allowable_orders().
[david(a)redhat.com: rename to vma_thp_disabled(), split out thp_disabled_by_hw() ]
Link: https://lkml.kernel.org/r/20241011102445.934409-2-david@redhat.com
Fixes: 793917d997df ("mm/readahead: Add large folio readahead")
Signed-off-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reported-by: Leo Fu <bfu(a)redhat.com>
Tested-by: Thomas Huth <thuth(a)redhat.com>
Reviewed-by: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Boqiao Fu <bfu(a)redhat.com>
Cc: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Cc: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Janosch Frank <frankja(a)linux.ibm.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 67d0ab3c3bba..ef5b80e48599 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -322,6 +322,24 @@ struct thpsize {
(transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
+static inline bool vma_thp_disabled(struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ /*
+ * Explicitly disabled through madvise or prctl, or some
+ * architectures may disable THP for some mappings, for
+ * example, s390 kvm.
+ */
+ return (vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags);
+}
+
+static inline bool thp_disabled_by_hw(void)
+{
+ /* If the hardware/firmware marked hugepage support disabled. */
+ return transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED);
+}
+
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags);
unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 87b49ecc7b1e..2fb328880b50 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -109,18 +109,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
if (!vma->vm_mm) /* vdso */
return 0;
- /*
- * Explicitly disabled through madvise or prctl, or some
- * architectures may disable THP for some mappings, for
- * example, s390 kvm.
- * */
- if ((vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
- return 0;
- /*
- * If the hardware/firmware marked hugepage support disabled.
- */
- if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
return 0;
/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
diff --git a/mm/shmem.c b/mm/shmem.c
index 4f11b5506363..c5adb987b23c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1664,12 +1664,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
loff_t i_size;
int order;
- if (vma && ((vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
- return 0;
-
- /* If the hardware/firmware marked hugepage support disabled. */
- if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
+ if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
return 0;
global_huge = shmem_huge_global_enabled(inode, index, write_end,
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 7528c4fb1237512ee18049f852f014eba80bbe8d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101858-rewire-vocation-c981@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7528c4fb1237512ee18049f852f014eba80bbe8d Mon Sep 17 00:00:00 2001
From: Liu Shixin <liushixin2(a)huawei.com>
Date: Tue, 15 Oct 2024 09:45:21 +0800
Subject: [PATCH] mm/swapfile: skip HugeTLB pages for unuse_vma
I got a bad pud error and lost a 1GB HugeTLB when calling swapoff. The
problem can be reproduced by the following steps:
1. Allocate an anonymous 1GB HugeTLB and some other anonymous memory.
2. Swapout the above anonymous memory.
3. run swapoff and we will get a bad pud error in kernel message:
mm/pgtable-generic.c:42: bad pud 00000000743d215d(84000001400000e7)
We can tell that pud_clear_bad is called by pud_none_or_clear_bad in
unuse_pud_range() by ftrace. And therefore the HugeTLB pages will never
be freed because we lost it from page table. We can skip HugeTLB pages
for unuse_vma to fix it.
Link: https://lkml.kernel.org/r/20241015014521.570237-1-liushixin2@huawei.com
Fixes: 0fe6e20b9c4c ("hugetlb, rmap: add reverse mapping for hugepage")
Signed-off-by: Liu Shixin <liushixin2(a)huawei.com>
Acked-by: Muchun Song <muchun.song(a)linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index eb782fcd5627..b0915f3fab31 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2313,7 +2313,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
mmap_read_lock(mm);
for_each_vma(vmi, vma) {
- if (vma->anon_vma) {
+ if (vma->anon_vma && !is_vm_hugetlb_page(vma)) {
ret = unuse_vma(vma, type);
if (ret)
break;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 37f0b47c5143c2957909ced44fc09ffb118c99f7
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101803-cage-smokiness-cb8b@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 37f0b47c5143c2957909ced44fc09ffb118c99f7 Mon Sep 17 00:00:00 2001
From: Yang Shi <yang(a)os.amperecomputing.com>
Date: Fri, 11 Oct 2024 18:17:02 -0700
Subject: [PATCH] mm: khugepaged: fix the arguments order in
khugepaged_collapse_file trace point
The "addr" and "is_shmem" arguments have different order in TP_PROTO and
TP_ARGS. This resulted in the incorrect trace result:
text-hugepage-644429 [276] 392092.878683: mm_khugepaged_collapse_file:
mm=0xffff20025d52c440, hpage_pfn=0x200678c00, index=512, addr=1, is_shmem=0,
filename=text-hugepage, nr=512, result=failed
The value of "addr" is wrong because it was treated as bool value, the
type of is_shmem.
Fix the order in TP_PROTO to keep "addr" is before "is_shmem" since the
original patch review suggested this order to achieve best packing.
And use "lx" for "addr" instead of "ld" in TP_printk because address is
typically shown in hex.
After the fix, the trace result looks correct:
text-hugepage-7291 [004] 128.627251: mm_khugepaged_collapse_file:
mm=0xffff0001328f9500, hpage_pfn=0x20016ea00, index=512, addr=0x400000,
is_shmem=0, filename=text-hugepage, nr=512, result=failed
Link: https://lkml.kernel.org/r/20241012011702.1084846-1-yang@os.amperecomputing.…
Fixes: 4c9473e87e75 ("mm/khugepaged: add tracepoint to collapse_file()")
Signed-off-by: Yang Shi <yang(a)os.amperecomputing.com>
Cc: Gautam Menghani <gautammenghani201(a)gmail.com>
Cc: Steven Rostedt (Google) <rostedt(a)goodmis.org>
Cc: <stable(a)vger.kernel.org> [6.2+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index b5f5369b6300..9d5c00b0285c 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -208,7 +208,7 @@ TRACE_EVENT(mm_khugepaged_scan_file,
TRACE_EVENT(mm_khugepaged_collapse_file,
TP_PROTO(struct mm_struct *mm, struct folio *new_folio, pgoff_t index,
- bool is_shmem, unsigned long addr, struct file *file,
+ unsigned long addr, bool is_shmem, struct file *file,
int nr, int result),
TP_ARGS(mm, new_folio, index, addr, is_shmem, file, nr, result),
TP_STRUCT__entry(
@@ -233,7 +233,7 @@ TRACE_EVENT(mm_khugepaged_collapse_file,
__entry->result = result;
),
- TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%ld, is_shmem=%d, filename=%s, nr=%d, result=%s",
+ TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%lx, is_shmem=%d, filename=%s, nr=%d, result=%s",
__entry->mm,
__entry->hpfn,
__entry->index,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f9c39898eaff..a420eff92011 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2227,7 +2227,7 @@ rollback:
folio_put(new_folio);
out:
VM_BUG_ON(!list_empty(&pagelist));
- trace_mm_khugepaged_collapse_file(mm, new_folio, index, is_shmem, addr, file, HPAGE_PMD_NR, result);
+ trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result);
return result;
}
Hi Greg, Sasha,
Could you please help to backport the upstream commit
80e9963fb3b5509dfcabe9652d56bf4b35542055 ("irqchip/gic-v3-its: Fix VSYNC
referencing an unmapped VPE on GIC v4.1") to
* 5.10
* 5.15
* 6.1
* 6.6
trees? It can be applied and built (with arm64's defconfig) cleanly on
top of the mentioned stable branches.
Thanks,
Zenghui
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 5afca7e996c42aed1b4a42d4712817601ba42aff
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101827-regulate-lining-6c3e@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5afca7e996c42aed1b4a42d4712817601ba42aff Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni(a)redhat.com>
Date: Mon, 14 Oct 2024 16:06:01 +0200
Subject: [PATCH] selftests: mptcp: join: test for prohibited MPC to port-based
endp
Explicitly verify that MPC connection attempts towards a port-based
signal endpoint fail with a reset.
Note that this new test is a bit different from the other ones, not
using 'run_tests'. It is then needed to add the capture capability, and
the picking the right port which have been extracted into three new
helpers. The info about the capture can also be printed from a single
point, which simplifies the exit paths in do_transfer().
The 'Fixes' tag here below is the same as the one from the previous
commit: this patch here is not fixing anything wrong in the selftests,
but it validates the previous fix for an issue introduced by this commit
ID.
Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port")
Cc: stable(a)vger.kernel.org
Co-developed-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
Reviewed-by: Mat Martineau <martineau(a)kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Link: https://patch.msgid.link/20241014-net-mptcp-mpc-port-endp-v2-2-7faea8e6b6ae…
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index e8d0a01b4144..c07e2bd3a315 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -23,6 +23,7 @@ tmpfile=""
cout=""
err=""
capout=""
+cappid=""
ns1=""
ns2=""
iptables="iptables"
@@ -887,6 +888,44 @@ check_cestab()
fi
}
+cond_start_capture()
+{
+ local ns="$1"
+
+ :> "$capout"
+
+ if $capture; then
+ local capuser capfile
+ if [ -z $SUDO_USER ]; then
+ capuser=""
+ else
+ capuser="-Z $SUDO_USER"
+ fi
+
+ capfile=$(printf "mp_join-%02u-%s.pcap" "$MPTCP_LIB_TEST_COUNTER" "$ns")
+
+ echo "Capturing traffic for test $MPTCP_LIB_TEST_COUNTER into $capfile"
+ ip netns exec "$ns" tcpdump -i any -s 65535 -B 32768 $capuser -w "$capfile" > "$capout" 2>&1 &
+ cappid=$!
+
+ sleep 1
+ fi
+}
+
+cond_stop_capture()
+{
+ if $capture; then
+ sleep 1
+ kill $cappid
+ cat "$capout"
+ fi
+}
+
+get_port()
+{
+ echo "$((10000 + MPTCP_LIB_TEST_COUNTER - 1))"
+}
+
do_transfer()
{
local listener_ns="$1"
@@ -894,33 +933,17 @@ do_transfer()
local cl_proto="$3"
local srv_proto="$4"
local connect_addr="$5"
+ local port
- local port=$((10000 + MPTCP_LIB_TEST_COUNTER - 1))
- local cappid
local FAILING_LINKS=${FAILING_LINKS:-""}
local fastclose=${fastclose:-""}
local speed=${speed:-"fast"}
+ port=$(get_port)
:> "$cout"
:> "$sout"
- :> "$capout"
- if $capture; then
- local capuser
- if [ -z $SUDO_USER ] ; then
- capuser=""
- else
- capuser="-Z $SUDO_USER"
- fi
-
- capfile=$(printf "mp_join-%02u-%s.pcap" "$MPTCP_LIB_TEST_COUNTER" "${listener_ns}")
-
- echo "Capturing traffic for test $MPTCP_LIB_TEST_COUNTER into $capfile"
- ip netns exec ${listener_ns} tcpdump -i any -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 &
- cappid=$!
-
- sleep 1
- fi
+ cond_start_capture ${listener_ns}
NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \
nstat -n
@@ -1007,10 +1030,7 @@ do_transfer()
wait $spid
local rets=$?
- if $capture; then
- sleep 1
- kill $cappid
- fi
+ cond_stop_capture
NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \
nstat | grep Tcp > /tmp/${listener_ns}.out
@@ -1026,7 +1046,6 @@ do_transfer()
ip netns exec ${connector_ns} ss -Menita 1>&2 -o "dport = :$port"
cat /tmp/${connector_ns}.out
- cat "$capout"
return 1
fi
@@ -1043,13 +1062,7 @@ do_transfer()
fi
rets=$?
- if [ $retc -eq 0 ] && [ $rets -eq 0 ];then
- cat "$capout"
- return 0
- fi
-
- cat "$capout"
- return 1
+ [ $retc -eq 0 ] && [ $rets -eq 0 ]
}
make_file()
@@ -2873,6 +2886,32 @@ verify_listener_events()
fail_test
}
+chk_mpc_endp_attempt()
+{
+ local retl=$1
+ local attempts=$2
+
+ print_check "Connect"
+
+ if [ ${retl} = 124 ]; then
+ fail_test "timeout on connect"
+ elif [ ${retl} = 0 ]; then
+ fail_test "unexpected successful connect"
+ else
+ print_ok
+
+ print_check "Attempts"
+ count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPCapableEndpAttempt")
+ if [ -z "$count" ]; then
+ print_skip
+ elif [ "$count" != "$attempts" ]; then
+ fail_test "got ${count} MPC attempt[s] on port-based endpoint, expected ${attempts}"
+ else
+ print_ok
+ fi
+ fi
+}
+
add_addr_ports_tests()
{
# signal address with port
@@ -2963,6 +3002,22 @@ add_addr_ports_tests()
chk_join_nr 2 2 2
chk_add_nr 2 2 2
fi
+
+ if reset "port-based signal endpoint must not accept mpc"; then
+ local port retl count
+ port=$(get_port)
+
+ cond_start_capture ${ns1}
+ pm_nl_add_endpoint ${ns1} 10.0.2.1 flags signal port ${port}
+ mptcp_lib_wait_local_port_listen ${ns1} ${port}
+
+ timeout 1 ip netns exec ${ns2} \
+ ./mptcp_connect -t ${timeout_poll} -p $port -s MPTCP 10.0.2.1 >/dev/null 2>&1
+ retl=$?
+ cond_stop_capture
+
+ chk_mpc_endp_attempt ${retl} 1
+ fi
}
syncookies_tests()
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 5afca7e996c42aed1b4a42d4712817601ba42aff
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101826-outshine-powdered-a548@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5afca7e996c42aed1b4a42d4712817601ba42aff Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni(a)redhat.com>
Date: Mon, 14 Oct 2024 16:06:01 +0200
Subject: [PATCH] selftests: mptcp: join: test for prohibited MPC to port-based
endp
Explicitly verify that MPC connection attempts towards a port-based
signal endpoint fail with a reset.
Note that this new test is a bit different from the other ones, not
using 'run_tests'. It is then needed to add the capture capability, and
the picking the right port which have been extracted into three new
helpers. The info about the capture can also be printed from a single
point, which simplifies the exit paths in do_transfer().
The 'Fixes' tag here below is the same as the one from the previous
commit: this patch here is not fixing anything wrong in the selftests,
but it validates the previous fix for an issue introduced by this commit
ID.
Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port")
Cc: stable(a)vger.kernel.org
Co-developed-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
Reviewed-by: Mat Martineau <martineau(a)kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Link: https://patch.msgid.link/20241014-net-mptcp-mpc-port-endp-v2-2-7faea8e6b6ae…
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index e8d0a01b4144..c07e2bd3a315 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -23,6 +23,7 @@ tmpfile=""
cout=""
err=""
capout=""
+cappid=""
ns1=""
ns2=""
iptables="iptables"
@@ -887,6 +888,44 @@ check_cestab()
fi
}
+cond_start_capture()
+{
+ local ns="$1"
+
+ :> "$capout"
+
+ if $capture; then
+ local capuser capfile
+ if [ -z $SUDO_USER ]; then
+ capuser=""
+ else
+ capuser="-Z $SUDO_USER"
+ fi
+
+ capfile=$(printf "mp_join-%02u-%s.pcap" "$MPTCP_LIB_TEST_COUNTER" "$ns")
+
+ echo "Capturing traffic for test $MPTCP_LIB_TEST_COUNTER into $capfile"
+ ip netns exec "$ns" tcpdump -i any -s 65535 -B 32768 $capuser -w "$capfile" > "$capout" 2>&1 &
+ cappid=$!
+
+ sleep 1
+ fi
+}
+
+cond_stop_capture()
+{
+ if $capture; then
+ sleep 1
+ kill $cappid
+ cat "$capout"
+ fi
+}
+
+get_port()
+{
+ echo "$((10000 + MPTCP_LIB_TEST_COUNTER - 1))"
+}
+
do_transfer()
{
local listener_ns="$1"
@@ -894,33 +933,17 @@ do_transfer()
local cl_proto="$3"
local srv_proto="$4"
local connect_addr="$5"
+ local port
- local port=$((10000 + MPTCP_LIB_TEST_COUNTER - 1))
- local cappid
local FAILING_LINKS=${FAILING_LINKS:-""}
local fastclose=${fastclose:-""}
local speed=${speed:-"fast"}
+ port=$(get_port)
:> "$cout"
:> "$sout"
- :> "$capout"
- if $capture; then
- local capuser
- if [ -z $SUDO_USER ] ; then
- capuser=""
- else
- capuser="-Z $SUDO_USER"
- fi
-
- capfile=$(printf "mp_join-%02u-%s.pcap" "$MPTCP_LIB_TEST_COUNTER" "${listener_ns}")
-
- echo "Capturing traffic for test $MPTCP_LIB_TEST_COUNTER into $capfile"
- ip netns exec ${listener_ns} tcpdump -i any -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 &
- cappid=$!
-
- sleep 1
- fi
+ cond_start_capture ${listener_ns}
NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \
nstat -n
@@ -1007,10 +1030,7 @@ do_transfer()
wait $spid
local rets=$?
- if $capture; then
- sleep 1
- kill $cappid
- fi
+ cond_stop_capture
NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \
nstat | grep Tcp > /tmp/${listener_ns}.out
@@ -1026,7 +1046,6 @@ do_transfer()
ip netns exec ${connector_ns} ss -Menita 1>&2 -o "dport = :$port"
cat /tmp/${connector_ns}.out
- cat "$capout"
return 1
fi
@@ -1043,13 +1062,7 @@ do_transfer()
fi
rets=$?
- if [ $retc -eq 0 ] && [ $rets -eq 0 ];then
- cat "$capout"
- return 0
- fi
-
- cat "$capout"
- return 1
+ [ $retc -eq 0 ] && [ $rets -eq 0 ]
}
make_file()
@@ -2873,6 +2886,32 @@ verify_listener_events()
fail_test
}
+chk_mpc_endp_attempt()
+{
+ local retl=$1
+ local attempts=$2
+
+ print_check "Connect"
+
+ if [ ${retl} = 124 ]; then
+ fail_test "timeout on connect"
+ elif [ ${retl} = 0 ]; then
+ fail_test "unexpected successful connect"
+ else
+ print_ok
+
+ print_check "Attempts"
+ count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPCapableEndpAttempt")
+ if [ -z "$count" ]; then
+ print_skip
+ elif [ "$count" != "$attempts" ]; then
+ fail_test "got ${count} MPC attempt[s] on port-based endpoint, expected ${attempts}"
+ else
+ print_ok
+ fi
+ fi
+}
+
add_addr_ports_tests()
{
# signal address with port
@@ -2963,6 +3002,22 @@ add_addr_ports_tests()
chk_join_nr 2 2 2
chk_add_nr 2 2 2
fi
+
+ if reset "port-based signal endpoint must not accept mpc"; then
+ local port retl count
+ port=$(get_port)
+
+ cond_start_capture ${ns1}
+ pm_nl_add_endpoint ${ns1} 10.0.2.1 flags signal port ${port}
+ mptcp_lib_wait_local_port_listen ${ns1} ${port}
+
+ timeout 1 ip netns exec ${ns2} \
+ ./mptcp_connect -t ${timeout_poll} -p $port -s MPTCP 10.0.2.1 >/dev/null 2>&1
+ retl=$?
+ cond_stop_capture
+
+ chk_mpc_endp_attempt ${retl} 1
+ fi
}
syncookies_tests()
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 5afca7e996c42aed1b4a42d4712817601ba42aff
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101826-algorithm-figure-3cf3@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5afca7e996c42aed1b4a42d4712817601ba42aff Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni(a)redhat.com>
Date: Mon, 14 Oct 2024 16:06:01 +0200
Subject: [PATCH] selftests: mptcp: join: test for prohibited MPC to port-based
endp
Explicitly verify that MPC connection attempts towards a port-based
signal endpoint fail with a reset.
Note that this new test is a bit different from the other ones, not
using 'run_tests'. It is then needed to add the capture capability, and
the picking the right port which have been extracted into three new
helpers. The info about the capture can also be printed from a single
point, which simplifies the exit paths in do_transfer().
The 'Fixes' tag here below is the same as the one from the previous
commit: this patch here is not fixing anything wrong in the selftests,
but it validates the previous fix for an issue introduced by this commit
ID.
Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port")
Cc: stable(a)vger.kernel.org
Co-developed-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
Reviewed-by: Mat Martineau <martineau(a)kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Link: https://patch.msgid.link/20241014-net-mptcp-mpc-port-endp-v2-2-7faea8e6b6ae…
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index e8d0a01b4144..c07e2bd3a315 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -23,6 +23,7 @@ tmpfile=""
cout=""
err=""
capout=""
+cappid=""
ns1=""
ns2=""
iptables="iptables"
@@ -887,6 +888,44 @@ check_cestab()
fi
}
+cond_start_capture()
+{
+ local ns="$1"
+
+ :> "$capout"
+
+ if $capture; then
+ local capuser capfile
+ if [ -z $SUDO_USER ]; then
+ capuser=""
+ else
+ capuser="-Z $SUDO_USER"
+ fi
+
+ capfile=$(printf "mp_join-%02u-%s.pcap" "$MPTCP_LIB_TEST_COUNTER" "$ns")
+
+ echo "Capturing traffic for test $MPTCP_LIB_TEST_COUNTER into $capfile"
+ ip netns exec "$ns" tcpdump -i any -s 65535 -B 32768 $capuser -w "$capfile" > "$capout" 2>&1 &
+ cappid=$!
+
+ sleep 1
+ fi
+}
+
+cond_stop_capture()
+{
+ if $capture; then
+ sleep 1
+ kill $cappid
+ cat "$capout"
+ fi
+}
+
+get_port()
+{
+ echo "$((10000 + MPTCP_LIB_TEST_COUNTER - 1))"
+}
+
do_transfer()
{
local listener_ns="$1"
@@ -894,33 +933,17 @@ do_transfer()
local cl_proto="$3"
local srv_proto="$4"
local connect_addr="$5"
+ local port
- local port=$((10000 + MPTCP_LIB_TEST_COUNTER - 1))
- local cappid
local FAILING_LINKS=${FAILING_LINKS:-""}
local fastclose=${fastclose:-""}
local speed=${speed:-"fast"}
+ port=$(get_port)
:> "$cout"
:> "$sout"
- :> "$capout"
- if $capture; then
- local capuser
- if [ -z $SUDO_USER ] ; then
- capuser=""
- else
- capuser="-Z $SUDO_USER"
- fi
-
- capfile=$(printf "mp_join-%02u-%s.pcap" "$MPTCP_LIB_TEST_COUNTER" "${listener_ns}")
-
- echo "Capturing traffic for test $MPTCP_LIB_TEST_COUNTER into $capfile"
- ip netns exec ${listener_ns} tcpdump -i any -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 &
- cappid=$!
-
- sleep 1
- fi
+ cond_start_capture ${listener_ns}
NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \
nstat -n
@@ -1007,10 +1030,7 @@ do_transfer()
wait $spid
local rets=$?
- if $capture; then
- sleep 1
- kill $cappid
- fi
+ cond_stop_capture
NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \
nstat | grep Tcp > /tmp/${listener_ns}.out
@@ -1026,7 +1046,6 @@ do_transfer()
ip netns exec ${connector_ns} ss -Menita 1>&2 -o "dport = :$port"
cat /tmp/${connector_ns}.out
- cat "$capout"
return 1
fi
@@ -1043,13 +1062,7 @@ do_transfer()
fi
rets=$?
- if [ $retc -eq 0 ] && [ $rets -eq 0 ];then
- cat "$capout"
- return 0
- fi
-
- cat "$capout"
- return 1
+ [ $retc -eq 0 ] && [ $rets -eq 0 ]
}
make_file()
@@ -2873,6 +2886,32 @@ verify_listener_events()
fail_test
}
+chk_mpc_endp_attempt()
+{
+ local retl=$1
+ local attempts=$2
+
+ print_check "Connect"
+
+ if [ ${retl} = 124 ]; then
+ fail_test "timeout on connect"
+ elif [ ${retl} = 0 ]; then
+ fail_test "unexpected successful connect"
+ else
+ print_ok
+
+ print_check "Attempts"
+ count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPCapableEndpAttempt")
+ if [ -z "$count" ]; then
+ print_skip
+ elif [ "$count" != "$attempts" ]; then
+ fail_test "got ${count} MPC attempt[s] on port-based endpoint, expected ${attempts}"
+ else
+ print_ok
+ fi
+ fi
+}
+
add_addr_ports_tests()
{
# signal address with port
@@ -2963,6 +3002,22 @@ add_addr_ports_tests()
chk_join_nr 2 2 2
chk_add_nr 2 2 2
fi
+
+ if reset "port-based signal endpoint must not accept mpc"; then
+ local port retl count
+ port=$(get_port)
+
+ cond_start_capture ${ns1}
+ pm_nl_add_endpoint ${ns1} 10.0.2.1 flags signal port ${port}
+ mptcp_lib_wait_local_port_listen ${ns1} ${port}
+
+ timeout 1 ip netns exec ${ns2} \
+ ./mptcp_connect -t ${timeout_poll} -p $port -s MPTCP 10.0.2.1 >/dev/null 2>&1
+ retl=$?
+ cond_stop_capture
+
+ chk_mpc_endp_attempt ${retl} 1
+ fi
}
syncookies_tests()
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 13f8f1e05f1dc36dbba6cba0ae03354c0dafcde7
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101825-oaf-glaucoma-1d13@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 13f8f1e05f1dc36dbba6cba0ae03354c0dafcde7 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland(a)arm.com>
Date: Tue, 8 Oct 2024 16:58:48 +0100
Subject: [PATCH] arm64: probes: Fix uprobes for big-endian kernels
The arm64 uprobes code is broken for big-endian kernels as it doesn't
convert the in-memory instruction encoding (which is always
little-endian) into the kernel's native endianness before analyzing and
simulating instructions. This may result in a few distinct problems:
* The kernel may may erroneously reject probing an instruction which can
safely be probed.
* The kernel may erroneously erroneously permit stepping an
instruction out-of-line when that instruction cannot be stepped
out-of-line safely.
* The kernel may erroneously simulate instruction incorrectly dur to
interpretting the byte-swapped encoding.
The endianness mismatch isn't caught by the compiler or sparse because:
* The arch_uprobe::{insn,ixol} fields are encoded as arrays of u8, so
the compiler and sparse have no idea these contain a little-endian
32-bit value. The core uprobes code populates these with a memcpy()
which similarly does not handle endianness.
* While the uprobe_opcode_t type is an alias for __le32, both
arch_uprobe_analyze_insn() and arch_uprobe_skip_sstep() cast from u8[]
to the similarly-named probe_opcode_t, which is an alias for u32.
Hence there is no endianness conversion warning.
Fix this by changing the arch_uprobe::{insn,ixol} fields to __le32 and
adding the appropriate __le32_to_cpu() conversions prior to consuming
the instruction encoding. The core uprobes copies these fields as opaque
ranges of bytes, and so is unaffected by this change.
At the same time, remove MAX_UINSN_BYTES and consistently use
AARCH64_INSN_SIZE for clarity.
Tested with the following:
| #include <stdio.h>
| #include <stdbool.h>
|
| #define noinline __attribute__((noinline))
|
| static noinline void *adrp_self(void)
| {
| void *addr;
|
| asm volatile(
| " adrp %x0, adrp_self\n"
| " add %x0, %x0, :lo12:adrp_self\n"
| : "=r" (addr));
| }
|
|
| int main(int argc, char *argv)
| {
| void *ptr = adrp_self();
| bool equal = (ptr == adrp_self);
|
| printf("adrp_self => %p\n"
| "adrp_self() => %p\n"
| "%s\n",
| adrp_self, ptr, equal ? "EQUAL" : "NOT EQUAL");
|
| return 0;
| }
.... where the adrp_self() function was compiled to:
| 00000000004007e0 <adrp_self>:
| 4007e0: 90000000 adrp x0, 400000 <__ehdr_start>
| 4007e4: 911f8000 add x0, x0, #0x7e0
| 4007e8: d65f03c0 ret
Before this patch, the ADRP is not recognized, and is assumed to be
steppable, resulting in corruption of the result:
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0x4007e0
| EQUAL
| # echo 'p /root/adrp-self:0x007e0' > /sys/kernel/tracing/uprobe_events
| # echo 1 > /sys/kernel/tracing/events/uprobes/enable
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0xffffffffff7e0
| NOT EQUAL
After this patch, the ADRP is correctly recognized and simulated:
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0x4007e0
| EQUAL
| #
| # echo 'p /root/adrp-self:0x007e0' > /sys/kernel/tracing/uprobe_events
| # echo 1 > /sys/kernel/tracing/events/uprobes/enable
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0x4007e0
| EQUAL
Fixes: 9842ceae9fa8 ("arm64: Add uprobe support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mark Rutland <mark.rutland(a)arm.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Will Deacon <will(a)kernel.org>
Link: https://lore.kernel.org/r/20241008155851.801546-4-mark.rutland@arm.com
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/arch/arm64/include/asm/uprobes.h b/arch/arm64/include/asm/uprobes.h
index 2b09495499c6..014b02897f8e 100644
--- a/arch/arm64/include/asm/uprobes.h
+++ b/arch/arm64/include/asm/uprobes.h
@@ -10,11 +10,9 @@
#include <asm/insn.h>
#include <asm/probes.h>
-#define MAX_UINSN_BYTES AARCH64_INSN_SIZE
-
#define UPROBE_SWBP_INSN cpu_to_le32(BRK64_OPCODE_UPROBES)
#define UPROBE_SWBP_INSN_SIZE AARCH64_INSN_SIZE
-#define UPROBE_XOL_SLOT_BYTES MAX_UINSN_BYTES
+#define UPROBE_XOL_SLOT_BYTES AARCH64_INSN_SIZE
typedef __le32 uprobe_opcode_t;
@@ -23,8 +21,8 @@ struct arch_uprobe_task {
struct arch_uprobe {
union {
- u8 insn[MAX_UINSN_BYTES];
- u8 ixol[MAX_UINSN_BYTES];
+ __le32 insn;
+ __le32 ixol;
};
struct arch_probe_insn api;
bool simulate;
diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c
index d49aef2657cd..a2f137a595fc 100644
--- a/arch/arm64/kernel/probes/uprobes.c
+++ b/arch/arm64/kernel/probes/uprobes.c
@@ -42,7 +42,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE))
return -EINVAL;
- insn = *(probe_opcode_t *)(&auprobe->insn[0]);
+ insn = le32_to_cpu(auprobe->insn);
switch (arm_probe_decode_insn(insn, &auprobe->api)) {
case INSN_REJECTED:
@@ -108,7 +108,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
if (!auprobe->simulate)
return false;
- insn = *(probe_opcode_t *)(&auprobe->insn[0]);
+ insn = le32_to_cpu(auprobe->insn);
addr = instruction_pointer(regs);
if (auprobe->api.handler)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 13f8f1e05f1dc36dbba6cba0ae03354c0dafcde7
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101822-deplored-dictator-689d@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 13f8f1e05f1dc36dbba6cba0ae03354c0dafcde7 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland(a)arm.com>
Date: Tue, 8 Oct 2024 16:58:48 +0100
Subject: [PATCH] arm64: probes: Fix uprobes for big-endian kernels
The arm64 uprobes code is broken for big-endian kernels as it doesn't
convert the in-memory instruction encoding (which is always
little-endian) into the kernel's native endianness before analyzing and
simulating instructions. This may result in a few distinct problems:
* The kernel may may erroneously reject probing an instruction which can
safely be probed.
* The kernel may erroneously erroneously permit stepping an
instruction out-of-line when that instruction cannot be stepped
out-of-line safely.
* The kernel may erroneously simulate instruction incorrectly dur to
interpretting the byte-swapped encoding.
The endianness mismatch isn't caught by the compiler or sparse because:
* The arch_uprobe::{insn,ixol} fields are encoded as arrays of u8, so
the compiler and sparse have no idea these contain a little-endian
32-bit value. The core uprobes code populates these with a memcpy()
which similarly does not handle endianness.
* While the uprobe_opcode_t type is an alias for __le32, both
arch_uprobe_analyze_insn() and arch_uprobe_skip_sstep() cast from u8[]
to the similarly-named probe_opcode_t, which is an alias for u32.
Hence there is no endianness conversion warning.
Fix this by changing the arch_uprobe::{insn,ixol} fields to __le32 and
adding the appropriate __le32_to_cpu() conversions prior to consuming
the instruction encoding. The core uprobes copies these fields as opaque
ranges of bytes, and so is unaffected by this change.
At the same time, remove MAX_UINSN_BYTES and consistently use
AARCH64_INSN_SIZE for clarity.
Tested with the following:
| #include <stdio.h>
| #include <stdbool.h>
|
| #define noinline __attribute__((noinline))
|
| static noinline void *adrp_self(void)
| {
| void *addr;
|
| asm volatile(
| " adrp %x0, adrp_self\n"
| " add %x0, %x0, :lo12:adrp_self\n"
| : "=r" (addr));
| }
|
|
| int main(int argc, char *argv)
| {
| void *ptr = adrp_self();
| bool equal = (ptr == adrp_self);
|
| printf("adrp_self => %p\n"
| "adrp_self() => %p\n"
| "%s\n",
| adrp_self, ptr, equal ? "EQUAL" : "NOT EQUAL");
|
| return 0;
| }
.... where the adrp_self() function was compiled to:
| 00000000004007e0 <adrp_self>:
| 4007e0: 90000000 adrp x0, 400000 <__ehdr_start>
| 4007e4: 911f8000 add x0, x0, #0x7e0
| 4007e8: d65f03c0 ret
Before this patch, the ADRP is not recognized, and is assumed to be
steppable, resulting in corruption of the result:
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0x4007e0
| EQUAL
| # echo 'p /root/adrp-self:0x007e0' > /sys/kernel/tracing/uprobe_events
| # echo 1 > /sys/kernel/tracing/events/uprobes/enable
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0xffffffffff7e0
| NOT EQUAL
After this patch, the ADRP is correctly recognized and simulated:
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0x4007e0
| EQUAL
| #
| # echo 'p /root/adrp-self:0x007e0' > /sys/kernel/tracing/uprobe_events
| # echo 1 > /sys/kernel/tracing/events/uprobes/enable
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0x4007e0
| EQUAL
Fixes: 9842ceae9fa8 ("arm64: Add uprobe support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mark Rutland <mark.rutland(a)arm.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Will Deacon <will(a)kernel.org>
Link: https://lore.kernel.org/r/20241008155851.801546-4-mark.rutland@arm.com
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/arch/arm64/include/asm/uprobes.h b/arch/arm64/include/asm/uprobes.h
index 2b09495499c6..014b02897f8e 100644
--- a/arch/arm64/include/asm/uprobes.h
+++ b/arch/arm64/include/asm/uprobes.h
@@ -10,11 +10,9 @@
#include <asm/insn.h>
#include <asm/probes.h>
-#define MAX_UINSN_BYTES AARCH64_INSN_SIZE
-
#define UPROBE_SWBP_INSN cpu_to_le32(BRK64_OPCODE_UPROBES)
#define UPROBE_SWBP_INSN_SIZE AARCH64_INSN_SIZE
-#define UPROBE_XOL_SLOT_BYTES MAX_UINSN_BYTES
+#define UPROBE_XOL_SLOT_BYTES AARCH64_INSN_SIZE
typedef __le32 uprobe_opcode_t;
@@ -23,8 +21,8 @@ struct arch_uprobe_task {
struct arch_uprobe {
union {
- u8 insn[MAX_UINSN_BYTES];
- u8 ixol[MAX_UINSN_BYTES];
+ __le32 insn;
+ __le32 ixol;
};
struct arch_probe_insn api;
bool simulate;
diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c
index d49aef2657cd..a2f137a595fc 100644
--- a/arch/arm64/kernel/probes/uprobes.c
+++ b/arch/arm64/kernel/probes/uprobes.c
@@ -42,7 +42,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE))
return -EINVAL;
- insn = *(probe_opcode_t *)(&auprobe->insn[0]);
+ insn = le32_to_cpu(auprobe->insn);
switch (arm_probe_decode_insn(insn, &auprobe->api)) {
case INSN_REJECTED:
@@ -108,7 +108,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
if (!auprobe->simulate)
return false;
- insn = *(probe_opcode_t *)(&auprobe->insn[0]);
+ insn = le32_to_cpu(auprobe->insn);
addr = instruction_pointer(regs);
if (auprobe->api.handler)
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 13f8f1e05f1dc36dbba6cba0ae03354c0dafcde7
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101820-delirious-wrongful-e7f1@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 13f8f1e05f1dc36dbba6cba0ae03354c0dafcde7 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland(a)arm.com>
Date: Tue, 8 Oct 2024 16:58:48 +0100
Subject: [PATCH] arm64: probes: Fix uprobes for big-endian kernels
The arm64 uprobes code is broken for big-endian kernels as it doesn't
convert the in-memory instruction encoding (which is always
little-endian) into the kernel's native endianness before analyzing and
simulating instructions. This may result in a few distinct problems:
* The kernel may may erroneously reject probing an instruction which can
safely be probed.
* The kernel may erroneously erroneously permit stepping an
instruction out-of-line when that instruction cannot be stepped
out-of-line safely.
* The kernel may erroneously simulate instruction incorrectly dur to
interpretting the byte-swapped encoding.
The endianness mismatch isn't caught by the compiler or sparse because:
* The arch_uprobe::{insn,ixol} fields are encoded as arrays of u8, so
the compiler and sparse have no idea these contain a little-endian
32-bit value. The core uprobes code populates these with a memcpy()
which similarly does not handle endianness.
* While the uprobe_opcode_t type is an alias for __le32, both
arch_uprobe_analyze_insn() and arch_uprobe_skip_sstep() cast from u8[]
to the similarly-named probe_opcode_t, which is an alias for u32.
Hence there is no endianness conversion warning.
Fix this by changing the arch_uprobe::{insn,ixol} fields to __le32 and
adding the appropriate __le32_to_cpu() conversions prior to consuming
the instruction encoding. The core uprobes copies these fields as opaque
ranges of bytes, and so is unaffected by this change.
At the same time, remove MAX_UINSN_BYTES and consistently use
AARCH64_INSN_SIZE for clarity.
Tested with the following:
| #include <stdio.h>
| #include <stdbool.h>
|
| #define noinline __attribute__((noinline))
|
| static noinline void *adrp_self(void)
| {
| void *addr;
|
| asm volatile(
| " adrp %x0, adrp_self\n"
| " add %x0, %x0, :lo12:adrp_self\n"
| : "=r" (addr));
| }
|
|
| int main(int argc, char *argv)
| {
| void *ptr = adrp_self();
| bool equal = (ptr == adrp_self);
|
| printf("adrp_self => %p\n"
| "adrp_self() => %p\n"
| "%s\n",
| adrp_self, ptr, equal ? "EQUAL" : "NOT EQUAL");
|
| return 0;
| }
.... where the adrp_self() function was compiled to:
| 00000000004007e0 <adrp_self>:
| 4007e0: 90000000 adrp x0, 400000 <__ehdr_start>
| 4007e4: 911f8000 add x0, x0, #0x7e0
| 4007e8: d65f03c0 ret
Before this patch, the ADRP is not recognized, and is assumed to be
steppable, resulting in corruption of the result:
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0x4007e0
| EQUAL
| # echo 'p /root/adrp-self:0x007e0' > /sys/kernel/tracing/uprobe_events
| # echo 1 > /sys/kernel/tracing/events/uprobes/enable
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0xffffffffff7e0
| NOT EQUAL
After this patch, the ADRP is correctly recognized and simulated:
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0x4007e0
| EQUAL
| #
| # echo 'p /root/adrp-self:0x007e0' > /sys/kernel/tracing/uprobe_events
| # echo 1 > /sys/kernel/tracing/events/uprobes/enable
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0x4007e0
| EQUAL
Fixes: 9842ceae9fa8 ("arm64: Add uprobe support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mark Rutland <mark.rutland(a)arm.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Will Deacon <will(a)kernel.org>
Link: https://lore.kernel.org/r/20241008155851.801546-4-mark.rutland@arm.com
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/arch/arm64/include/asm/uprobes.h b/arch/arm64/include/asm/uprobes.h
index 2b09495499c6..014b02897f8e 100644
--- a/arch/arm64/include/asm/uprobes.h
+++ b/arch/arm64/include/asm/uprobes.h
@@ -10,11 +10,9 @@
#include <asm/insn.h>
#include <asm/probes.h>
-#define MAX_UINSN_BYTES AARCH64_INSN_SIZE
-
#define UPROBE_SWBP_INSN cpu_to_le32(BRK64_OPCODE_UPROBES)
#define UPROBE_SWBP_INSN_SIZE AARCH64_INSN_SIZE
-#define UPROBE_XOL_SLOT_BYTES MAX_UINSN_BYTES
+#define UPROBE_XOL_SLOT_BYTES AARCH64_INSN_SIZE
typedef __le32 uprobe_opcode_t;
@@ -23,8 +21,8 @@ struct arch_uprobe_task {
struct arch_uprobe {
union {
- u8 insn[MAX_UINSN_BYTES];
- u8 ixol[MAX_UINSN_BYTES];
+ __le32 insn;
+ __le32 ixol;
};
struct arch_probe_insn api;
bool simulate;
diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c
index d49aef2657cd..a2f137a595fc 100644
--- a/arch/arm64/kernel/probes/uprobes.c
+++ b/arch/arm64/kernel/probes/uprobes.c
@@ -42,7 +42,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE))
return -EINVAL;
- insn = *(probe_opcode_t *)(&auprobe->insn[0]);
+ insn = le32_to_cpu(auprobe->insn);
switch (arm_probe_decode_insn(insn, &auprobe->api)) {
case INSN_REJECTED:
@@ -108,7 +108,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
if (!auprobe->simulate)
return false;
- insn = *(probe_opcode_t *)(&auprobe->insn[0]);
+ insn = le32_to_cpu(auprobe->insn);
addr = instruction_pointer(regs);
if (auprobe->api.handler)
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 13f8f1e05f1dc36dbba6cba0ae03354c0dafcde7
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101818-tying-implement-f714@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 13f8f1e05f1dc36dbba6cba0ae03354c0dafcde7 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland(a)arm.com>
Date: Tue, 8 Oct 2024 16:58:48 +0100
Subject: [PATCH] arm64: probes: Fix uprobes for big-endian kernels
The arm64 uprobes code is broken for big-endian kernels as it doesn't
convert the in-memory instruction encoding (which is always
little-endian) into the kernel's native endianness before analyzing and
simulating instructions. This may result in a few distinct problems:
* The kernel may may erroneously reject probing an instruction which can
safely be probed.
* The kernel may erroneously erroneously permit stepping an
instruction out-of-line when that instruction cannot be stepped
out-of-line safely.
* The kernel may erroneously simulate instruction incorrectly dur to
interpretting the byte-swapped encoding.
The endianness mismatch isn't caught by the compiler or sparse because:
* The arch_uprobe::{insn,ixol} fields are encoded as arrays of u8, so
the compiler and sparse have no idea these contain a little-endian
32-bit value. The core uprobes code populates these with a memcpy()
which similarly does not handle endianness.
* While the uprobe_opcode_t type is an alias for __le32, both
arch_uprobe_analyze_insn() and arch_uprobe_skip_sstep() cast from u8[]
to the similarly-named probe_opcode_t, which is an alias for u32.
Hence there is no endianness conversion warning.
Fix this by changing the arch_uprobe::{insn,ixol} fields to __le32 and
adding the appropriate __le32_to_cpu() conversions prior to consuming
the instruction encoding. The core uprobes copies these fields as opaque
ranges of bytes, and so is unaffected by this change.
At the same time, remove MAX_UINSN_BYTES and consistently use
AARCH64_INSN_SIZE for clarity.
Tested with the following:
| #include <stdio.h>
| #include <stdbool.h>
|
| #define noinline __attribute__((noinline))
|
| static noinline void *adrp_self(void)
| {
| void *addr;
|
| asm volatile(
| " adrp %x0, adrp_self\n"
| " add %x0, %x0, :lo12:adrp_self\n"
| : "=r" (addr));
| }
|
|
| int main(int argc, char *argv)
| {
| void *ptr = adrp_self();
| bool equal = (ptr == adrp_self);
|
| printf("adrp_self => %p\n"
| "adrp_self() => %p\n"
| "%s\n",
| adrp_self, ptr, equal ? "EQUAL" : "NOT EQUAL");
|
| return 0;
| }
.... where the adrp_self() function was compiled to:
| 00000000004007e0 <adrp_self>:
| 4007e0: 90000000 adrp x0, 400000 <__ehdr_start>
| 4007e4: 911f8000 add x0, x0, #0x7e0
| 4007e8: d65f03c0 ret
Before this patch, the ADRP is not recognized, and is assumed to be
steppable, resulting in corruption of the result:
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0x4007e0
| EQUAL
| # echo 'p /root/adrp-self:0x007e0' > /sys/kernel/tracing/uprobe_events
| # echo 1 > /sys/kernel/tracing/events/uprobes/enable
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0xffffffffff7e0
| NOT EQUAL
After this patch, the ADRP is correctly recognized and simulated:
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0x4007e0
| EQUAL
| #
| # echo 'p /root/adrp-self:0x007e0' > /sys/kernel/tracing/uprobe_events
| # echo 1 > /sys/kernel/tracing/events/uprobes/enable
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0x4007e0
| EQUAL
Fixes: 9842ceae9fa8 ("arm64: Add uprobe support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mark Rutland <mark.rutland(a)arm.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Will Deacon <will(a)kernel.org>
Link: https://lore.kernel.org/r/20241008155851.801546-4-mark.rutland@arm.com
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/arch/arm64/include/asm/uprobes.h b/arch/arm64/include/asm/uprobes.h
index 2b09495499c6..014b02897f8e 100644
--- a/arch/arm64/include/asm/uprobes.h
+++ b/arch/arm64/include/asm/uprobes.h
@@ -10,11 +10,9 @@
#include <asm/insn.h>
#include <asm/probes.h>
-#define MAX_UINSN_BYTES AARCH64_INSN_SIZE
-
#define UPROBE_SWBP_INSN cpu_to_le32(BRK64_OPCODE_UPROBES)
#define UPROBE_SWBP_INSN_SIZE AARCH64_INSN_SIZE
-#define UPROBE_XOL_SLOT_BYTES MAX_UINSN_BYTES
+#define UPROBE_XOL_SLOT_BYTES AARCH64_INSN_SIZE
typedef __le32 uprobe_opcode_t;
@@ -23,8 +21,8 @@ struct arch_uprobe_task {
struct arch_uprobe {
union {
- u8 insn[MAX_UINSN_BYTES];
- u8 ixol[MAX_UINSN_BYTES];
+ __le32 insn;
+ __le32 ixol;
};
struct arch_probe_insn api;
bool simulate;
diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c
index d49aef2657cd..a2f137a595fc 100644
--- a/arch/arm64/kernel/probes/uprobes.c
+++ b/arch/arm64/kernel/probes/uprobes.c
@@ -42,7 +42,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE))
return -EINVAL;
- insn = *(probe_opcode_t *)(&auprobe->insn[0]);
+ insn = le32_to_cpu(auprobe->insn);
switch (arm_probe_decode_insn(insn, &auprobe->api)) {
case INSN_REJECTED:
@@ -108,7 +108,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
if (!auprobe->simulate)
return false;
- insn = *(probe_opcode_t *)(&auprobe->insn[0]);
+ insn = le32_to_cpu(auprobe->insn);
addr = instruction_pointer(regs);
if (auprobe->api.handler)
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 13f8f1e05f1dc36dbba6cba0ae03354c0dafcde7
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024101815-chapped-decibel-91ed@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 13f8f1e05f1dc36dbba6cba0ae03354c0dafcde7 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland(a)arm.com>
Date: Tue, 8 Oct 2024 16:58:48 +0100
Subject: [PATCH] arm64: probes: Fix uprobes for big-endian kernels
The arm64 uprobes code is broken for big-endian kernels as it doesn't
convert the in-memory instruction encoding (which is always
little-endian) into the kernel's native endianness before analyzing and
simulating instructions. This may result in a few distinct problems:
* The kernel may may erroneously reject probing an instruction which can
safely be probed.
* The kernel may erroneously erroneously permit stepping an
instruction out-of-line when that instruction cannot be stepped
out-of-line safely.
* The kernel may erroneously simulate instruction incorrectly dur to
interpretting the byte-swapped encoding.
The endianness mismatch isn't caught by the compiler or sparse because:
* The arch_uprobe::{insn,ixol} fields are encoded as arrays of u8, so
the compiler and sparse have no idea these contain a little-endian
32-bit value. The core uprobes code populates these with a memcpy()
which similarly does not handle endianness.
* While the uprobe_opcode_t type is an alias for __le32, both
arch_uprobe_analyze_insn() and arch_uprobe_skip_sstep() cast from u8[]
to the similarly-named probe_opcode_t, which is an alias for u32.
Hence there is no endianness conversion warning.
Fix this by changing the arch_uprobe::{insn,ixol} fields to __le32 and
adding the appropriate __le32_to_cpu() conversions prior to consuming
the instruction encoding. The core uprobes copies these fields as opaque
ranges of bytes, and so is unaffected by this change.
At the same time, remove MAX_UINSN_BYTES and consistently use
AARCH64_INSN_SIZE for clarity.
Tested with the following:
| #include <stdio.h>
| #include <stdbool.h>
|
| #define noinline __attribute__((noinline))
|
| static noinline void *adrp_self(void)
| {
| void *addr;
|
| asm volatile(
| " adrp %x0, adrp_self\n"
| " add %x0, %x0, :lo12:adrp_self\n"
| : "=r" (addr));
| }
|
|
| int main(int argc, char *argv)
| {
| void *ptr = adrp_self();
| bool equal = (ptr == adrp_self);
|
| printf("adrp_self => %p\n"
| "adrp_self() => %p\n"
| "%s\n",
| adrp_self, ptr, equal ? "EQUAL" : "NOT EQUAL");
|
| return 0;
| }
.... where the adrp_self() function was compiled to:
| 00000000004007e0 <adrp_self>:
| 4007e0: 90000000 adrp x0, 400000 <__ehdr_start>
| 4007e4: 911f8000 add x0, x0, #0x7e0
| 4007e8: d65f03c0 ret
Before this patch, the ADRP is not recognized, and is assumed to be
steppable, resulting in corruption of the result:
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0x4007e0
| EQUAL
| # echo 'p /root/adrp-self:0x007e0' > /sys/kernel/tracing/uprobe_events
| # echo 1 > /sys/kernel/tracing/events/uprobes/enable
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0xffffffffff7e0
| NOT EQUAL
After this patch, the ADRP is correctly recognized and simulated:
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0x4007e0
| EQUAL
| #
| # echo 'p /root/adrp-self:0x007e0' > /sys/kernel/tracing/uprobe_events
| # echo 1 > /sys/kernel/tracing/events/uprobes/enable
| # ./adrp-self
| adrp_self => 0x4007e0
| adrp_self() => 0x4007e0
| EQUAL
Fixes: 9842ceae9fa8 ("arm64: Add uprobe support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mark Rutland <mark.rutland(a)arm.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Will Deacon <will(a)kernel.org>
Link: https://lore.kernel.org/r/20241008155851.801546-4-mark.rutland@arm.com
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/arch/arm64/include/asm/uprobes.h b/arch/arm64/include/asm/uprobes.h
index 2b09495499c6..014b02897f8e 100644
--- a/arch/arm64/include/asm/uprobes.h
+++ b/arch/arm64/include/asm/uprobes.h
@@ -10,11 +10,9 @@
#include <asm/insn.h>
#include <asm/probes.h>
-#define MAX_UINSN_BYTES AARCH64_INSN_SIZE
-
#define UPROBE_SWBP_INSN cpu_to_le32(BRK64_OPCODE_UPROBES)
#define UPROBE_SWBP_INSN_SIZE AARCH64_INSN_SIZE
-#define UPROBE_XOL_SLOT_BYTES MAX_UINSN_BYTES
+#define UPROBE_XOL_SLOT_BYTES AARCH64_INSN_SIZE
typedef __le32 uprobe_opcode_t;
@@ -23,8 +21,8 @@ struct arch_uprobe_task {
struct arch_uprobe {
union {
- u8 insn[MAX_UINSN_BYTES];
- u8 ixol[MAX_UINSN_BYTES];
+ __le32 insn;
+ __le32 ixol;
};
struct arch_probe_insn api;
bool simulate;
diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c
index d49aef2657cd..a2f137a595fc 100644
--- a/arch/arm64/kernel/probes/uprobes.c
+++ b/arch/arm64/kernel/probes/uprobes.c
@@ -42,7 +42,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE))
return -EINVAL;
- insn = *(probe_opcode_t *)(&auprobe->insn[0]);
+ insn = le32_to_cpu(auprobe->insn);
switch (arm_probe_decode_insn(insn, &auprobe->api)) {
case INSN_REJECTED:
@@ -108,7 +108,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
if (!auprobe->simulate)
return false;
- insn = *(probe_opcode_t *)(&auprobe->insn[0]);
+ insn = le32_to_cpu(auprobe->insn);
addr = instruction_pointer(regs);
if (auprobe->api.handler)
As detected by Coverity, the error check logic at get_ctrl() is
broken: if ptr_to_user() fails to fill a control due to an error,
no errors are returned and v4l2_g_ctrl() returns success on a
failed operation, which may cause applications to fail.
Add an error check at get_ctrl() and ensure that it will
be returned to userspace without filling the control value if
get_ctrl() fails.
Fixes: 71c689dc2e73 ("media: v4l2-ctrls: split up into four source files")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei(a)kernel.org>
---
drivers/media/v4l2-core/v4l2-ctrls-api.c | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/drivers/media/v4l2-core/v4l2-ctrls-api.c b/drivers/media/v4l2-core/v4l2-ctrls-api.c
index e5a364efd5e6..a0de7eeaf085 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls-api.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls-api.c
@@ -753,9 +753,10 @@ static int get_ctrl(struct v4l2_ctrl *ctrl, struct v4l2_ext_control *c)
for (i = 0; i < master->ncontrols; i++)
cur_to_new(master->cluster[i]);
ret = call_op(master, g_volatile_ctrl);
- new_to_user(c, ctrl);
+ if (!ret)
+ ret = new_to_user(c, ctrl);
} else {
- cur_to_user(c, ctrl);
+ ret = cur_to_user(c, ctrl);
}
v4l2_ctrl_unlock(master);
return ret;
@@ -770,7 +771,10 @@ int v4l2_g_ctrl(struct v4l2_ctrl_handler *hdl, struct v4l2_control *control)
if (!ctrl || !ctrl->is_int)
return -EINVAL;
ret = get_ctrl(ctrl, &c);
- control->value = c.value;
+
+ if (!ret)
+ control->value = c.value;
+
return ret;
}
EXPORT_SYMBOL(v4l2_g_ctrl);
@@ -811,10 +815,12 @@ static int set_ctrl_lock(struct v4l2_fh *fh, struct v4l2_ctrl *ctrl,
int ret;
v4l2_ctrl_lock(ctrl);
- user_to_new(c, ctrl);
+ ret = user_to_new(c, ctrl);
+ if (ret)
+ return ret;
ret = set_ctrl(fh, ctrl, 0);
if (!ret)
- cur_to_user(c, ctrl);
+ ret = cur_to_user(c, ctrl);
v4l2_ctrl_unlock(ctrl);
return ret;
}
--
2.47.0
As reported by Coverity, the logic at tpg_precalculate_line()
blindly rescales the buffer even when scaled_witdh is equal to
zero. If this ever happens, this will cause a division by zero.
Instead, add a WARN_ON_ONCE() to trigger such cases and return
without doing any precalculation.
Fixes: 63881df94d3e ("[media] vivid: add the Test Pattern Generator")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei(a)kernel.org>
---
drivers/media/common/v4l2-tpg/v4l2-tpg-core.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c b/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c
index c86343a4d0bf..940bfbf275ce 100644
--- a/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c
+++ b/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c
@@ -1795,6 +1795,9 @@ static void tpg_precalculate_line(struct tpg_data *tpg)
unsigned p;
unsigned x;
+ if (WARN_ON_ONCE(!tpg->src_width || !tpg->scaled_width))
+ return;
+
switch (tpg->pattern) {
case TPG_PAT_GREEN:
contrast = TPG_COLOR_100_RED;
--
2.47.0
In loongson_sysconf, The "core" of cores_per_node and cores_per_package
stands for a logical core, which means in a SMT system it stands for a
thread indeed. This information is gotten from SMBIOS Type4 Structure,
so in order to get a correct cores_per_package for both SMT and non-SMT
systems in parse_cpu_table() we should use SMBIOS_THREAD_PACKAGE_OFFSET
instead of SMBIOS_CORE_PACKAGE_OFFSET.
Cc: stable(a)vger.kernel.org
Reported-by: Chao Li <lichao(a)loongson.cn>
Tested-by: Chao Li <lichao(a)loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai(a)loongson.cn>
---
arch/loongarch/kernel/setup.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index 00e307203ddb..cbd3c09a93c1 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -55,6 +55,7 @@
#define SMBIOS_FREQHIGH_OFFSET 0x17
#define SMBIOS_FREQLOW_MASK 0xFF
#define SMBIOS_CORE_PACKAGE_OFFSET 0x23
+#define SMBIOS_THREAD_PACKAGE_OFFSET 0x25
#define LOONGSON_EFI_ENABLE (1 << 3)
unsigned long fw_arg0, fw_arg1, fw_arg2;
@@ -125,7 +126,7 @@ static void __init parse_cpu_table(const struct dmi_header *dm)
cpu_clock_freq = freq_temp * 1000000;
loongson_sysconf.cpuname = (void *)dmi_string_parse(dm, dmi_data[16]);
- loongson_sysconf.cores_per_package = *(dmi_data + SMBIOS_CORE_PACKAGE_OFFSET);
+ loongson_sysconf.cores_per_package = *(dmi_data + SMBIOS_THREAD_PACKAGE_OFFSET);
pr_info("CpuClock = %llu\n", cpu_clock_freq);
}
--
2.43.5
diff --git a/arch/loongarch/include/asm/bootinfo.h b/arch/loongarch/include/asm/bootinfo.h
index 6d5846dd075c..7657e016233f 100644
--- a/arch/loongarch/include/asm/bootinfo.h
+++ b/arch/loongarch/include/asm/bootinfo.h
@@ -26,6 +26,10 @@ struct loongson_board_info {
#define NR_WORDS DIV_ROUND_UP(NR_CPUS, BITS_PER_LONG)
+/*
+ * The "core" of cores_per_node and cores_per_package stands for a
+ * logical core, which means in a SMT system it stands for a thread.
+ */
struct loongson_system_configuration {
int nr_cpus;
int nr_nodes;
Hi, Conor
> On Thu, Oct 17, 2024 at 05:49:56AM +0000, Changhuang Liang wrote:
> > Hi, Conor,
> >
> > > Hi, Conor
> > >
> > > Thanks for your patch.
> > >
> > > > From: Conor Dooley <conor.dooley(a)microchip.com>
> > > >
> > > > Aurelien reported probe failures due to the csi node being enabled
> > > > without having a camera attached to it. A camera was in the
> > > > initial submissions, but was removed from the dts, as it had not
> > > > actually been present on the board, but was from an addon board
> > > > used by the developer
> > > of the relevant drivers.
> > > > The non-camera pipeline nodes were not disabled when this happened
> > > > and the probe failures are problematic for Debian. Disable them.
> > > >
> > > > CC: stable(a)vger.kernel.org
> > > > Fixes: 28ecaaa5af192 ("riscv: dts: starfive: jh7110: Add camera
> > > > subsystem
> > > > nodes")
> > >
> > > Here you write it in 13 characters, should be "Fixes: 28ecaaa5af19 ..."
> > >
> >
> > After fixing this:
> > Reviewed-by: Changhuang Liang <changhuang.liang(a)starfivetech.com>
>
> Ye, I know it was 13 not 12. I don't think that's a problem though.
Okay, that's fine.
Best Regards,
Changhuang
The patch titled
Subject: resource,kexec: walk_system_ram_res_rev must retain resource flags
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
resourcekexec-walk_system_ram_res_rev-must-retain-resource-flags.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Gregory Price <gourry(a)gourry.net>
Subject: resource,kexec: walk_system_ram_res_rev must retain resource flags
Date: Thu, 17 Oct 2024 15:03:47 -0400
walk_system_ram_res_rev() erroneously discards resource flags when passing
the information to the callback.
This causes systems with IORESOURCE_SYSRAM_DRIVER_MANAGED memory to have
these resources selected during kexec to store kexec buffers if that
memory happens to be at placed above normal system ram.
This leads to undefined behavior after reboot. If the kexec buffer is
never touched, nothing happens. If the kexec buffer is touched, it could
lead to a crash (like below) or undefined behavior.
Tested on a system with CXL memory expanders with driver managed memory,
TPM enabled, and CONFIG_IMA_KEXEC=y. Adding printk's showed the flags
were being discarded and as a result the check for
IORESOURCE_SYSRAM_DRIVER_MANAGED passes.
find_next_iomem_res: name(System RAM (kmem))
start(10000000000)
end(1034fffffff)
flags(83000200)
locate_mem_hole_top_down: start(10000000000) end(1034fffffff) flags(0)
[.] BUG: unable to handle page fault for address: ffff89834ffff000
[.] #PF: supervisor read access in kernel mode
[.] #PF: error_code(0x0000) - not-present page
[.] PGD c04c8bf067 P4D c04c8bf067 PUD c04c8be067 PMD 0
[.] Oops: 0000 [#1] SMP
[.] RIP: 0010:ima_restore_measurement_list+0x95/0x4b0
[.] RSP: 0018:ffffc900000d3a80 EFLAGS: 00010286
[.] RAX: 0000000000001000 RBX: 0000000000000000 RCX: ffff89834ffff000
[.] RDX: 0000000000000018 RSI: ffff89834ffff000 RDI: ffff89834ffff018
[.] RBP: ffffc900000d3ba0 R08: 0000000000000020 R09: ffff888132b8a900
[.] R10: 4000000000000000 R11: 000000003a616d69 R12: 0000000000000000
[.] R13: ffffffff8404ac28 R14: 0000000000000000 R15: ffff89834ffff000
[.] FS: 0000000000000000(0000) GS:ffff893d44640000(0000) knlGS:0000000000000000
[.] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[.] ata5: SATA link down (SStatus 0 SControl 300)
[.] CR2: ffff89834ffff000 CR3: 000001034d00f001 CR4: 0000000000770ef0
[.] PKRU: 55555554
[.] Call Trace:
[.] <TASK>
[.] ? __die+0x78/0xc0
[.] ? page_fault_oops+0x2a8/0x3a0
[.] ? exc_page_fault+0x84/0x130
[.] ? asm_exc_page_fault+0x22/0x30
[.] ? ima_restore_measurement_list+0x95/0x4b0
[.] ? template_desc_init_fields+0x317/0x410
[.] ? crypto_alloc_tfm_node+0x9c/0xc0
[.] ? init_ima_lsm+0x30/0x30
[.] ima_load_kexec_buffer+0x72/0xa0
[.] ima_init+0x44/0xa0
[.] __initstub__kmod_ima__373_1201_init_ima7+0x1e/0xb0
[.] ? init_ima_lsm+0x30/0x30
[.] do_one_initcall+0xad/0x200
[.] ? idr_alloc_cyclic+0xaa/0x110
[.] ? new_slab+0x12c/0x420
[.] ? new_slab+0x12c/0x420
[.] ? number+0x12a/0x430
[.] ? sysvec_apic_timer_interrupt+0xa/0x80
[.] ? asm_sysvec_apic_timer_interrupt+0x16/0x20
[.] ? parse_args+0xd4/0x380
[.] ? parse_args+0x14b/0x380
[.] kernel_init_freeable+0x1c1/0x2b0
[.] ? rest_init+0xb0/0xb0
[.] kernel_init+0x16/0x1a0
[.] ret_from_fork+0x2f/0x40
[.] ? rest_init+0xb0/0xb0
[.] ret_from_fork_asm+0x11/0x20
[.] </TASK>
Link: https://lore.kernel.org/all/20231114091658.228030-1-bhe@redhat.com/
Link: https://lkml.kernel.org/r/20241017190347.5578-1-gourry@gourry.net
Fixes: 7acf164b259d ("resource: add walk_system_ram_res_rev()")
Signed-off-by: Gregory Price <gourry(a)gourry.net>
Cc: AKASHI Takahiro <takahiro.akashi(a)linaro.org>
Cc: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Cc: Baoquan he <bhe(a)redhat.com>
Cc: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Ilpo J��rvinen <ilpo.jarvinen(a)linux.intel.com>
Cc: Mika Westerberg <mika.westerberg(a)linux.intel.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/resource.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
--- a/kernel/resource.c~resourcekexec-walk_system_ram_res_rev-must-retain-resource-flags
+++ a/kernel/resource.c
@@ -459,9 +459,7 @@ int walk_system_ram_res_rev(u64 start, u
rams_size += 16;
}
- rams[i].start = res.start;
- rams[i++].end = res.end;
-
+ rams[i++] = res;
start = res.end + 1;
}
_
Patches currently in -mm which might be from gourry(a)gourry.net are
resourcekexec-walk_system_ram_res_rev-must-retain-resource-flags.patch
The patch titled
Subject: nilfs2: fix kernel bug due to missing clearing of checked flag
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
nilfs2-fix-kernel-bug-due-to-missing-clearing-of-checked-flag.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Subject: nilfs2: fix kernel bug due to missing clearing of checked flag
Date: Fri, 18 Oct 2024 04:33:10 +0900
Syzbot reported that in directory operations after nilfs2 detects
filesystem corruption and degrades to read-only,
__block_write_begin_int(), which is called to prepare block writes, may
fail the BUG_ON check for accesses exceeding the folio/page size,
triggering a kernel bug.
This was found to be because the "checked" flag of a page/folio was not
cleared when it was discarded by nilfs2's own routine, which causes the
sanity check of directory entries to be skipped when the directory
page/folio is reloaded. So, fix that.
This was necessary when the use of nilfs2's own page discard routine was
applied to more than just metadata files.
Link: https://lkml.kernel.org/r/20241017193359.5051-1-konishi.ryusuke@gmail.com
Fixes: 8c26c4e2694a ("nilfs2: fix issue with flush kernel thread after remount in RO mode because of driver's internal error or metadata corruption")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+d6ca2daf692c7a82f959(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=d6ca2daf692c7a82f959
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/nilfs2/page.c | 1 +
1 file changed, 1 insertion(+)
--- a/fs/nilfs2/page.c~nilfs2-fix-kernel-bug-due-to-missing-clearing-of-checked-flag
+++ a/fs/nilfs2/page.c
@@ -401,6 +401,7 @@ void nilfs_clear_folio_dirty(struct foli
folio_clear_uptodate(folio);
folio_clear_mappedtodisk(folio);
+ folio_clear_checked(folio);
head = folio_buffers(folio);
if (head) {
_
Patches currently in -mm which might be from konishi.ryusuke(a)gmail.com are
nilfs2-fix-kernel-bug-due-to-missing-clearing-of-buffer-delay-flag.patch
nilfs2-fix-kernel-bug-due-to-missing-clearing-of-checked-flag.patch
If some remap_pfn_range() calls succeeded before one failed, we still have
buffer pages mapped into the userspace page tables when we drop the buffer
reference with comedi_buf_map_put(bm). The userspace mappings are only
cleaned up later in the mmap error path.
Fix it by explicitly flushing all mappings in our VMA on the error path.
See commit 79a61cc3fc04 ("mm: avoid leaving partial pfn mappings around in
error case").
Cc: stable(a)vger.kernel.org
Fixes: ed9eccbe8970 ("Staging: add comedi core")
Signed-off-by: Jann Horn <jannh(a)google.com>
---
Note: compile-tested only; I don't actually have comedi hardware, and I
don't know anything about comedi.
---
Changes in v2:
- only do the zapping in the pfnmap path (Ian Abbott)
- use zap_vma_ptes() instead of zap_page_range_single() (Ian Abbott)
- Link to v1: https://lore.kernel.org/r/20241014-comedi-tlb-v1-1-4b699144b438@google.com
---
drivers/comedi/comedi_fops.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/drivers/comedi/comedi_fops.c b/drivers/comedi/comedi_fops.c
index 1b481731df96..68e5301e6281 100644
--- a/drivers/comedi/comedi_fops.c
+++ b/drivers/comedi/comedi_fops.c
@@ -2407,6 +2407,16 @@ static int comedi_mmap(struct file *file, struct vm_area_struct *vma)
start += PAGE_SIZE;
}
+
+ /*
+ * Leaving behind a partial mapping of a buffer we're about to
+ * drop is unsafe, see remap_pfn_range_notrack().
+ * We need to zap the range here ourselves instead of relying
+ * on the automatic zapping in remap_pfn_range() because we call
+ * remap_pfn_range() in a loop.
+ */
+ if (retval)
+ zap_vma_ptes(vma, vma->vm_start, size);
}
if (retval == 0) {
---
base-commit: 6485cf5ea253d40d507cd71253c9568c5470cd27
change-id: 20241014-comedi-tlb-400246505961
--
Jann Horn <jannh(a)google.com>
From: Mikulas Patocka <mpatocka(a)redhat.com>
commit 0a9bab391e336489169b95cb0d4553d921302189 upstream.
Tasklets have an inherent problem with memory corruption. The function
tasklet_action_common calls tasklet_trylock, then it calls the tasklet
callback and then it calls tasklet_unlock. If the tasklet callback frees
the structure that contains the tasklet or if it calls some code that may
free it, tasklet_unlock will write into free memory.
The commits 8e14f610159d and d9a02e016aaf try to fix it for dm-crypt, but
it is not a sufficient fix and the data corruption can still happen [1].
There is no fix for dm-verity and dm-verity will write into free memory
with every tasklet-processed bio.
There will be atomic workqueues implemented in the kernel 6.9 [2]. They
will have better interface and they will not suffer from the memory
corruption problem.
But we need something that stops the memory corruption now and that can be
backported to the stable kernels. So, I'm proposing this commit that
disables tasklets in both dm-crypt and dm-verity. This commit doesn't
remove the tasklet support, because the tasklet code will be reused when
atomic workqueues will be implemented.
[1] https://lore.kernel.org/all/d390d7ee-f142-44d3-822a-87949e14608b@suse.de/T/
[2] https://lore.kernel.org/lkml/20240130091300.2968534-1-tj@kernel.org/
Signed-off-by: Mikulas Patocka <mpatocka(a)redhat.com>
Cc: stable(a)vger.kernel.org
Fixes: 39d42fa96ba1b ("dm crypt: add flags to optionally bypass kcryptd workqueues")
Fixes: 5721d4e5a9cdb ("dm verity: Add optional "try_verify_in_tasklet" feature")
Signed-off-by: Mike Snitzer <snitzer(a)kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
(cherry picked from commit 30884a44e0cedc3dfda8c22432f3ba4078ec2d94)
Signed-off-by: Saeed Mirzamohammadi <saeed.mirzamohammadi(a)oracle.com>
---
drivers/md/dm-crypt.c | 37 ++-----------------------------------
1 file changed, 2 insertions(+), 35 deletions(-)
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9889035c343e3..95b3b69a5e3c4 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -69,10 +69,8 @@ struct dm_crypt_io {
struct bio *base_bio;
u8 *integrity_metadata;
bool integrity_metadata_from_pool:1;
- bool in_tasklet:1;
struct work_struct work;
- struct tasklet_struct tasklet;
struct convert_context ctx;
@@ -1769,7 +1767,6 @@ static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc,
io->ctx.r.req = NULL;
io->integrity_metadata = NULL;
io->integrity_metadata_from_pool = false;
- io->in_tasklet = false;
atomic_set(&io->io_pending, 0);
}
@@ -1778,12 +1775,6 @@ static void crypt_inc_pending(struct dm_crypt_io *io)
atomic_inc(&io->io_pending);
}
-static void kcryptd_io_bio_endio(struct work_struct *work)
-{
- struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
- bio_endio(io->base_bio);
-}
-
/*
* One of the bios was finished. Check for completion of
* the whole request and correctly clean up the buffer.
@@ -1807,20 +1798,6 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
base_bio->bi_status = error;
- /*
- * If we are running this function from our tasklet,
- * we can't call bio_endio() here, because it will call
- * clone_endio() from dm.c, which in turn will
- * free the current struct dm_crypt_io structure with
- * our tasklet. In this case we need to delay bio_endio()
- * execution to after the tasklet is done and dequeued.
- */
- if (io->in_tasklet) {
- INIT_WORK(&io->work, kcryptd_io_bio_endio);
- queue_work(cc->io_queue, &io->work);
- return;
- }
-
bio_endio(base_bio);
}
@@ -2264,11 +2241,6 @@ static void kcryptd_crypt(struct work_struct *work)
kcryptd_crypt_write_convert(io);
}
-static void kcryptd_crypt_tasklet(unsigned long work)
-{
- kcryptd_crypt((struct work_struct *)work);
-}
-
static void kcryptd_queue_crypt(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->cc;
@@ -2280,15 +2252,10 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io)
* irqs_disabled(): the kernel may run some IO completion from the idle thread, but
* it is being executed with irqs disabled.
*/
- if (in_hardirq() || irqs_disabled()) {
- io->in_tasklet = true;
- tasklet_init(&io->tasklet, kcryptd_crypt_tasklet, (unsigned long)&io->work);
- tasklet_schedule(&io->tasklet);
+ if (!(in_hardirq() || irqs_disabled())) {
+ kcryptd_crypt(&io->work);
return;
}
-
- kcryptd_crypt(&io->work);
- return;
}
INIT_WORK(&io->work, kcryptd_crypt);
--
2.46.0
If there is an error during some initialization related to firmware,
the buffers dp->tx_ring[i].tx_status are released.
However this is released again when the device is unbinded (ath12k_pci),
and we get:
WARNING: CPU: 0 PID: 2098 at mm/slub.c:4689 free_large_kmalloc+0x4d/0x80
Call Trace:
free_large_kmalloc
ath12k_dp_free
ath12k_core_deinit
ath12k_pci_remove
...
The issue is always reproducible from a VM because the MSI addressing
initialization is failing.
In order to fix the issue, just set the buffers to NULL after releasing in
order to avoid the double free.
cc: stable(a)vger.kernel.org
Fixes: d889913205cf ("wifi: ath12k: driver for Qualcomm Wi-Fi 7 devices")
Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm(a)redhat.com>
---
v4:
- Send with cover letter to get reference with 1/2
v3: https://lore.kernel.org/linux-wireless/20241017074854.176765-1-jtornosm@red…
v2: https://lore.kernel.org/linux-wireless/20241016123722.206899-1-jtornosm@red…
v1: https://lore.kernel.org/linux-wireless/20241010175102.207324-3-jtornosm@red…
drivers/net/wireless/ath/ath12k/dp.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/net/wireless/ath/ath12k/dp.c b/drivers/net/wireless/ath/ath12k/dp.c
index 789d430e4455..15061782a2df 100644
--- a/drivers/net/wireless/ath/ath12k/dp.c
+++ b/drivers/net/wireless/ath/ath12k/dp.c
@@ -1277,8 +1277,10 @@ void ath12k_dp_free(struct ath12k_base *ab)
ath12k_dp_rx_reo_cmd_list_cleanup(ab);
- for (i = 0; i < ab->hw_params->max_tx_ring; i++)
+ for (i = 0; i < ab->hw_params->max_tx_ring; i++) {
kfree(dp->tx_ring[i].tx_status);
+ dp->tx_ring[i].tx_status = NULL;
+ }
ath12k_dp_rx_free(ab);
/* Deinit any SOC level resource */
--
2.47.0
If there is an error during some initialization related to firmware,
the function ath12k_dp_cc_cleanup is called to release resources.
However this is released again when the device is unbinded (ath12k_pci),
and we get:
BUG: kernel NULL pointer dereference, address: 0000000000000020
at RIP: 0010:ath12k_dp_cc_cleanup.part.0+0xb6/0x500 [ath12k]
Call Trace:
ath12k_dp_cc_cleanup
ath12k_dp_free
ath12k_core_deinit
ath12k_pci_remove
...
The issue is always reproducible from a VM because the MSI addressing
initialization is failing.
In order to fix the issue, just set to NULL the released structure in
ath12k_dp_cc_cleanup at the end.
cc: stable(a)vger.kernel.org
Fixes: d889913205cf ("wifi: ath12k: driver for Qualcomm Wi-Fi 7 devices")
Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm(a)redhat.com>
---
v4:
- Send with cover letter to get reference with 2/2
v3: https://lore.kernel.org/linux-wireless/20241017074654.176678-1-jtornosm@red…
v2: https://lore.kernel.org/linux-wireless/20241016123452.206671-1-jtornosm@red…
v1: https://lore.kernel.org/linux-wireless/20241010175102.207324-2-jtornosm@red…
drivers/net/wireless/ath/ath12k/dp.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/net/wireless/ath/ath12k/dp.c b/drivers/net/wireless/ath/ath12k/dp.c
index 61aa78d8bd8c..789d430e4455 100644
--- a/drivers/net/wireless/ath/ath12k/dp.c
+++ b/drivers/net/wireless/ath/ath12k/dp.c
@@ -1241,6 +1241,7 @@ static void ath12k_dp_cc_cleanup(struct ath12k_base *ab)
}
kfree(dp->spt_info);
+ dp->spt_info = NULL;
}
static void ath12k_dp_reoq_lut_cleanup(struct ath12k_base *ab)
--
2.46.2
Hi, Conor,
> Hi, Conor
>
> Thanks for your patch.
>
> > From: Conor Dooley <conor.dooley(a)microchip.com>
> >
> > Aurelien reported probe failures due to the csi node being enabled
> > without having a camera attached to it. A camera was in the initial
> > submissions, but was removed from the dts, as it had not actually been
> > present on the board, but was from an addon board used by the developer
> of the relevant drivers.
> > The non-camera pipeline nodes were not disabled when this happened and
> > the probe failures are problematic for Debian. Disable them.
> >
> > CC: stable(a)vger.kernel.org
> > Fixes: 28ecaaa5af192 ("riscv: dts: starfive: jh7110: Add camera
> > subsystem
> > nodes")
>
> Here you write it in 13 characters, should be "Fixes: 28ecaaa5af19 ..."
>
After fixing this:
Reviewed-by: Changhuang Liang <changhuang.liang(a)starfivetech.com>
It seems it is necessary to set WILC MAC address after operation mode,
otherwise the MAC address of the WILC MAC is reset back to what is in
nvmem. This causes a failure to associate with AP after the WILC MAC
address was overridden by userspace.
Test case:
"
ap$ cat << EOF > hostap.conf
interface=wlan0
ssid=ssid
hw_mode=g
channel=6
wpa=2
wpa_passphrase=pass
wpa_key_mgmt=WPA-PSK
EOF
ap$ hostapd -d hostap.conf
ap$ ifconfig wlan0 10.0.0.1
"
"
sta$ ifconfig wlan0 hw ether 00:11:22:33:44:55
sta$ wpa_supplicant -i wlan0 -c <(wpa_passphrase ssid pass)
sta$ ifconfig wlan0 10.0.0.2
sta$ ping 10.0.0.1 # fails without this patch
"
AP still indicates SA with original MAC address from nvmem without this patch:
"
nl80211: RX frame da=ff:ff:ff:ff:ff:ff sa=60:01:23:45:67:89 bssid=ff:ff:ff:ff:ff:ff ...
^^^^^^^^^^^^^^^^^
"
Fixes: 83d9b54ee5d4 ("wifi: wilc1000: read MAC address from fuse at probe")
Tested-by: Alexis Lothoré <alexis.lothore(a)bootlin.com>
Signed-off-by: Marek Vasut <marex(a)denx.de>
---
Cc: Ajay Singh <ajay.kathat(a)microchip.com>
Cc: Alexis Lothoré <alexis.lothore(a)bootlin.com>
Cc: Claudiu Beznea <claudiu.beznea(a)tuxon.dev>
Cc: Kalle Valo <kvalo(a)kernel.org>
Cc: Marek Vasut <marex(a)denx.de>
Cc: linux-wireless(a)vger.kernel.org
Cc: stable(a)vger.kernel.org
---
V2: Add TB, Fixes, CC Stable
---
drivers/net/wireless/microchip/wilc1000/netdev.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/net/wireless/microchip/wilc1000/netdev.c b/drivers/net/wireless/microchip/wilc1000/netdev.c
index 9ecf3fb29b558..8bc127c5a538c 100644
--- a/drivers/net/wireless/microchip/wilc1000/netdev.c
+++ b/drivers/net/wireless/microchip/wilc1000/netdev.c
@@ -608,6 +608,9 @@ static int wilc_mac_open(struct net_device *ndev)
return ret;
}
+ wilc_set_operation_mode(vif, wilc_get_vif_idx(vif), vif->iftype,
+ vif->idx);
+
netdev_dbg(ndev, "Mac address: %pM\n", ndev->dev_addr);
ret = wilc_set_mac_address(vif, ndev->dev_addr);
if (ret) {
@@ -618,9 +621,6 @@ static int wilc_mac_open(struct net_device *ndev)
return ret;
}
- wilc_set_operation_mode(vif, wilc_get_vif_idx(vif), vif->iftype,
- vif->idx);
-
mgmt_regs.interface_stypes = vif->mgmt_reg_stypes;
/* so we detect a change */
vif->mgmt_reg_stypes = 0;
--
2.45.2
From: Conor Dooley <conor.dooley(a)microchip.com>
Aurelien reported probe failures due to the csi node being enabled
without having a camera attached to it. A camera was in the initial
submissions, but was removed from the dts, as it had not actually been
present on the board, but was from an addon board used by the
developer of the relevant drivers. The non-camera pipeline nodes were
not disabled when this happened and the probe failures are problematic
for Debian. Disable them.
CC: stable(a)vger.kernel.org
Fixes: 28ecaaa5af192 ("riscv: dts: starfive: jh7110: Add camera subsystem nodes")
Closes: https://lore.kernel.org/all/Zw1-vcN4CoVkfLjU@aurel32.net/
Reported-by: Aurelien Jarno <aurelien(a)aurel32.net>
Signed-off-by: Conor Dooley <conor.dooley(a)microchip.com>
---
CC: Emil Renner Berthing <kernel(a)esmil.dk>
CC: Rob Herring <robh(a)kernel.org>
CC: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
CC: Conor Dooley <conor+dt(a)kernel.org>
CC: Changhuang Liang <changhuang.liang(a)starfivetech.com>
CC: devicetree(a)vger.kernel.org
CC: linux-riscv(a)lists.infradead.org
CC: linux-kernel(a)vger.kernel.org
---
arch/riscv/boot/dts/starfive/jh7110-common.dtsi | 2 --
1 file changed, 2 deletions(-)
diff --git a/arch/riscv/boot/dts/starfive/jh7110-common.dtsi b/arch/riscv/boot/dts/starfive/jh7110-common.dtsi
index c7771b3b64758..d6c55f1cc96a9 100644
--- a/arch/riscv/boot/dts/starfive/jh7110-common.dtsi
+++ b/arch/riscv/boot/dts/starfive/jh7110-common.dtsi
@@ -128,7 +128,6 @@ &camss {
assigned-clocks = <&ispcrg JH7110_ISPCLK_DOM4_APB_FUNC>,
<&ispcrg JH7110_ISPCLK_MIPI_RX0_PXL>;
assigned-clock-rates = <49500000>, <198000000>;
- status = "okay";
ports {
#address-cells = <1>;
@@ -151,7 +150,6 @@ camss_from_csi2rx: endpoint {
&csi2rx {
assigned-clocks = <&ispcrg JH7110_ISPCLK_VIN_SYS>;
assigned-clock-rates = <297000000>;
- status = "okay";
ports {
#address-cells = <1>;
--
2.45.2
The IMAGE_DLLCHARACTERISTICS_NX_COMPAT informs the firmware that the
EFI binary does not rely on pages that are both executable and
writable.
The flag is used by some distro versions of GRUB to decide if the EFI
binary may be executed.
As the Linux kernel neither has RWX sections nor needs RWX pages for
relocation we should set the flag.
Cc: Ard Biesheuvel <ardb(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Heinrich Schuchardt <heinrich.schuchardt(a)canonical.com>
---
arch/riscv/kernel/efi-header.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/riscv/kernel/efi-header.S b/arch/riscv/kernel/efi-header.S
index 515b2dfbca75..c5f17c2710b5 100644
--- a/arch/riscv/kernel/efi-header.S
+++ b/arch/riscv/kernel/efi-header.S
@@ -64,7 +64,7 @@ extra_header_fields:
.long efi_header_end - _start // SizeOfHeaders
.long 0 // CheckSum
.short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem
- .short 0 // DllCharacteristics
+ .short IMAGE_DLL_CHARACTERISTICS_NX_COMPAT // DllCharacteristics
.quad 0 // SizeOfStackReserve
.quad 0 // SizeOfStackCommit
.quad 0 // SizeOfHeapReserve
--
2.45.2
The existing code moves VF to the same namespace as the synthetic NIC
during netvsc_register_vf(). But, if the synthetic device is moved to a
new namespace after the VF registration, the VF won't be moved together.
To make the behavior more consistent, add a namespace check for synthetic
NIC's NETDEV_REGISTER event (generated during its move), and move the VF
if it is not in the same namespace.
Cc: stable(a)vger.kernel.org
Fixes: c0a41b887ce6 ("hv_netvsc: move VF to same namespace as netvsc device")
Suggested-by: Stephen Hemminger <stephen(a)networkplumber.org>
Signed-off-by: Haiyang Zhang <haiyangz(a)microsoft.com>
---
v2: Move my fix to synthetic NIC's NETDEV_REGISTER event as suggested by Stephen.
---
drivers/net/hyperv/netvsc_drv.c | 29 +++++++++++++++++++++++++++++
1 file changed, 29 insertions(+)
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 153b97f8ec0d..54e98356ee93 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2798,6 +2798,30 @@ static struct hv_driver netvsc_drv = {
},
};
+/* Set VF's namespace same as the synthetic NIC */
+static void netvsc_event_set_vf_ns(struct net_device *ndev)
+{
+ struct net_device_context *ndev_ctx = netdev_priv(ndev);
+ struct net_device *vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
+ int ret;
+
+ if (!vf_netdev)
+ return;
+
+ if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) {
+ ret = dev_change_net_namespace(vf_netdev, dev_net(ndev),
+ "eth%d");
+ if (ret)
+ netdev_err(vf_netdev,
+ "Cannot move to same namespace as %s: %d\n",
+ ndev->name, ret);
+ else
+ netdev_info(vf_netdev,
+ "Moved VF to namespace with: %s\n",
+ ndev->name);
+ }
+}
+
/*
* On Hyper-V, every VF interface is matched with a corresponding
* synthetic interface. The synthetic interface is presented first
@@ -2810,6 +2834,11 @@ static int netvsc_netdev_event(struct notifier_block *this,
struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
int ret = 0;
+ if (event_dev->netdev_ops == &device_ops && event == NETDEV_REGISTER) {
+ netvsc_event_set_vf_ns(event_dev);
+ return NOTIFY_DONE;
+ }
+
ret = check_dev_is_matching_vf(event_dev);
if (ret != 0)
return NOTIFY_DONE;
--
2.34.1
The quilt patch titled
Subject: maple_tree: correct tree corruption on spanning store
has been removed from the -mm tree. Its filename was
maple_tree-correct-tree-corruption-on-spanning-store.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Subject: maple_tree: correct tree corruption on spanning store
Date: Mon, 7 Oct 2024 16:28:32 +0100
Patch series "maple_tree: correct tree corruption on spanning store", v3.
There has been a nasty yet subtle maple tree corruption bug that appears
to have been in existence since the inception of the algorithm.
This bug seems far more likely to happen since commit f8d112a4e657
("mm/mmap: avoid zeroing vma tree in mmap_region()"), which is the point
at which reports started to be submitted concerning this bug.
We were made definitely aware of the bug thanks to the kind efforts of
Bert Karwatzki who helped enormously in my being able to track this down
and identify the cause of it.
The bug arises when an attempt is made to perform a spanning store across
two leaf nodes, where the right leaf node is the rightmost child of the
shared parent, AND the store completely consumes the right-mode node.
This results in mas_wr_spanning_store() mitakenly duplicating the new and
existing entries at the maximum pivot within the range, and thus maple
tree corruption.
The fix patch corrects this by detecting this scenario and disallowing the
mistaken duplicate copy.
The fix patch commit message goes into great detail as to how this occurs.
This series also includes a test which reliably reproduces the issue, and
asserts that the fix works correctly.
Bert has kindly tested the fix and confirmed it resolved his issues. Also
Mikhail Gavrilov kindly reported what appears to be precisely the same
bug, which this fix should also resolve.
This patch (of 2):
There has been a subtle bug present in the maple tree implementation from
its inception.
This arises from how stores are performed - when a store occurs, it will
overwrite overlapping ranges and adjust the tree as necessary to
accommodate this.
A range may always ultimately span two leaf nodes. In this instance we
walk the two leaf nodes, determine which elements are not overwritten to
the left and to the right of the start and end of the ranges respectively
and then rebalance the tree to contain these entries and the newly
inserted one.
This kind of store is dubbed a 'spanning store' and is implemented by
mas_wr_spanning_store().
In order to reach this stage, mas_store_gfp() invokes
mas_wr_preallocate(), mas_wr_store_type() and mas_wr_walk() in turn to
walk the tree and update the object (mas) to traverse to the location
where the write should be performed, determining its store type.
When a spanning store is required, this function returns false stopping at
the parent node which contains the target range, and mas_wr_store_type()
marks the mas->store_type as wr_spanning_store to denote this fact.
When we go to perform the store in mas_wr_spanning_store(), we first
determine the elements AFTER the END of the range we wish to store (that
is, to the right of the entry to be inserted) - we do this by walking to
the NEXT pivot in the tree (i.e. r_mas.last + 1), starting at the node we
have just determined contains the range over which we intend to write.
We then turn our attention to the entries to the left of the entry we are
inserting, whose state is represented by l_mas, and copy these into a 'big
node', which is a special node which contains enough slots to contain two
leaf node's worth of data.
We then copy the entry we wish to store immediately after this - the copy
and the insertion of the new entry is performed by mas_store_b_node().
After this we copy the elements to the right of the end of the range which
we are inserting, if we have not exceeded the length of the node (i.e.
r_mas.offset <= r_mas.end).
Herein lies the bug - under very specific circumstances, this logic can
break and corrupt the maple tree.
Consider the following tree:
Height
0 Root Node
/ \
pivot = 0xffff / \ pivot = ULONG_MAX
/ \
1 A [-----] ...
/ \
pivot = 0x4fff / \ pivot = 0xffff
/ \
2 (LEAVES) B [-----] [-----] C
^--- Last pivot 0xffff.
Now imagine we wish to store an entry in the range [0x4000, 0xffff] (note
that all ranges expressed in maple tree code are inclusive):
1. mas_store_gfp() descends the tree, finds node A at <=0xffff, then
determines that this is a spanning store across nodes B and C. The mas
state is set such that the current node from which we traverse further
is node A.
2. In mas_wr_spanning_store() we try to find elements to the right of pivot
0xffff by searching for an index of 0x10000:
- mas_wr_walk_index() invokes mas_wr_walk_descend() and
mas_wr_node_walk() in turn.
- mas_wr_node_walk() loops over entries in node A until EITHER it
finds an entry whose pivot equals or exceeds 0x10000 OR it
reaches the final entry.
- Since no entry has a pivot equal to or exceeding 0x10000, pivot
0xffff is selected, leading to node C.
- mas_wr_walk_traverse() resets the mas state to traverse node C. We
loop around and invoke mas_wr_walk_descend() and mas_wr_node_walk()
in turn once again.
- Again, we reach the last entry in node C, which has a pivot of
0xffff.
3. We then copy the elements to the left of 0x4000 in node B to the big
node via mas_store_b_node(), and insert the new [0x4000, 0xffff] entry
too.
4. We determine whether we have any entries to copy from the right of the
end of the range via - and with r_mas set up at the entry at pivot
0xffff, r_mas.offset <= r_mas.end, and then we DUPLICATE the entry at
pivot 0xffff.
5. BUG! The maple tree is corrupted with a duplicate entry.
This requires a very specific set of circumstances - we must be spanning
the last element in a leaf node, which is the last element in the parent
node.
spanning store across two leaf nodes with a range that ends at that shared
pivot.
A potential solution to this problem would simply be to reset the walk
each time we traverse r_mas, however given the rarity of this situation it
seems that would be rather inefficient.
Instead, this patch detects if the right hand node is populated, i.e. has
anything we need to copy.
We do so by only copying elements from the right of the entry being
inserted when the maximum value present exceeds the last, rather than
basing this on offset position.
The patch also updates some comments and eliminates the unused bool return
value in mas_wr_walk_index().
The work performed in commit f8d112a4e657 ("mm/mmap: avoid zeroing vma
tree in mmap_region()") seems to have made the probability of this event
much more likely, which is the point at which reports started to be
submitted concerning this bug.
The motivation for this change arose from Bert Karwatzki's report of
encountering mm instability after the release of kernel v6.12-rc1 which,
after the use of CONFIG_DEBUG_VM_MAPLE_TREE and similar configuration
options, was identified as maple tree corruption.
After Bert very generously provided his time and ability to reproduce this
event consistently, I was able to finally identify that the issue
discussed in this commit message was occurring for him.
Link: https://lkml.kernel.org/r/cover.1728314402.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/48b349a2a0f7c76e18772712d0997a5e12ab0a3b.17283144…
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Reported-by: Bert Karwatzki <spasswolf(a)web.de>
Closes: https://lore.kernel.org/all/20241001023402.3374-1-spasswolf@web.de/
Tested-by: Bert Karwatzki <spasswolf(a)web.de>
Reported-by: Mikhail Gavrilov <mikhail.v.gavrilov(a)gmail.com>
Closes: https://lore.kernel.org/all/CABXGCsOPwuoNOqSMmAvWO2Fz4TEmPnjFj-b7iF+XFRu1h7…
Acked-by: Vlastimil Babka <vbabka(a)suse.cz>
Reviewed-by: Liam R. Howlett <Liam.Howlett(a)Oracle.com>
Tested-by: Mikhail Gavrilov <mikhail.v.gavrilov(a)gmail.com>
Reviewed-by: Wei Yang <richard.weiyang(a)gmail.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Sidhartha Kumar <sidhartha.kumar(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
lib/maple_tree.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
--- a/lib/maple_tree.c~maple_tree-correct-tree-corruption-on-spanning-store
+++ a/lib/maple_tree.c
@@ -2196,6 +2196,8 @@ static inline void mas_node_or_none(stru
/*
* mas_wr_node_walk() - Find the correct offset for the index in the @mas.
+ * If @mas->index cannot be found within the containing
+ * node, we traverse to the last entry in the node.
* @wr_mas: The maple write state
*
* Uses mas_slot_locked() and does not need to worry about dead nodes.
@@ -3532,7 +3534,7 @@ static bool mas_wr_walk(struct ma_wr_sta
return true;
}
-static bool mas_wr_walk_index(struct ma_wr_state *wr_mas)
+static void mas_wr_walk_index(struct ma_wr_state *wr_mas)
{
struct ma_state *mas = wr_mas->mas;
@@ -3541,11 +3543,9 @@ static bool mas_wr_walk_index(struct ma_
wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
mas->offset);
if (ma_is_leaf(wr_mas->type))
- return true;
+ return;
mas_wr_walk_traverse(wr_mas);
-
}
- return true;
}
/*
* mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs.
@@ -3765,8 +3765,8 @@ static noinline void mas_wr_spanning_sto
memset(&b_node, 0, sizeof(struct maple_big_node));
/* Copy l_mas and store the value in b_node. */
mas_store_b_node(&l_wr_mas, &b_node, l_mas.end);
- /* Copy r_mas into b_node. */
- if (r_mas.offset <= r_mas.end)
+ /* Copy r_mas into b_node if there is anything to copy. */
+ if (r_mas.max > r_mas.last)
mas_mab_cp(&r_mas, r_mas.offset, r_mas.end,
&b_node, b_node.b_end + 1);
else
_
Patches currently in -mm which might be from lorenzo.stoakes(a)oracle.com are
fork-do-not-invoke-uffd-on-fork-if-error-occurs.patch
fork-only-invoke-khugepaged-ksm-hooks-if-no-error.patch
selftests-mm-add-pkey_sighandler_xx-hugetlb_dio-to-gitignore.patch
mm-refactor-mm_access-to-not-return-null.patch
mm-refactor-mm_access-to-not-return-null-fix.patch
mm-madvise-unrestrict-process_madvise-for-current-process.patch
maple_tree-do-not-hash-pointers-on-dump-in-debug-mode.patch
misc_minor_alloc was allocating id using ida for minor only in case of
MISC_DYNAMIC_MINOR but misc_minor_free was always freeing ids
using ida_free causing a mismatch and following warn:
> > WARNING: CPU: 0 PID: 159 at lib/idr.c:525 ida_free+0x3e0/0x41f
> > ida_free called for id=127 which is not allocated.
> > <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
...
> > [<60941eb4>] ida_free+0x3e0/0x41f
> > [<605ac993>] misc_minor_free+0x3e/0xbc
> > [<605acb82>] misc_deregister+0x171/0x1b3
misc_minor_alloc is changed to allocate id from ida for all minors
falling in the range of dynamic/ misc dynamic minors
Fixes: ab760791c0cf ("char: misc: Increase the maximum number of dynamic misc devices to 1048448")
Signed-off-by: Vimal Agrawal <avimalin(a)gmail.com>
Cc: stable(a)vger.kernel.org
---
v2: Added Fixes:
added missed case for static minor in misc_minor_alloc
v3: Removed kunit changes as that will be added as second patch in this two patch series
v4: Updated Signed-off-by: to match from:
drivers/char/misc.c | 39 ++++++++++++++++++++++++++++++---------
1 file changed, 30 insertions(+), 9 deletions(-)
diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 541edc26ec89..2cf595d2e10b 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -63,16 +63,30 @@ static DEFINE_MUTEX(misc_mtx);
#define DYNAMIC_MINORS 128 /* like dynamic majors */
static DEFINE_IDA(misc_minors_ida);
-static int misc_minor_alloc(void)
+static int misc_minor_alloc(int minor)
{
- int ret;
-
- ret = ida_alloc_max(&misc_minors_ida, DYNAMIC_MINORS - 1, GFP_KERNEL);
- if (ret >= 0) {
- ret = DYNAMIC_MINORS - ret - 1;
+ int ret = 0;
+
+ if (minor == MISC_DYNAMIC_MINOR) {
+ /* allocate free id */
+ ret = ida_alloc_max(&misc_minors_ida, DYNAMIC_MINORS - 1, GFP_KERNEL);
+ if (ret >= 0) {
+ ret = DYNAMIC_MINORS - ret - 1;
+ } else {
+ ret = ida_alloc_range(&misc_minors_ida, MISC_DYNAMIC_MINOR + 1,
+ MINORMASK, GFP_KERNEL);
+ }
} else {
- ret = ida_alloc_range(&misc_minors_ida, MISC_DYNAMIC_MINOR + 1,
- MINORMASK, GFP_KERNEL);
+ /* specific minor, check if it is in dynamic or misc dynamic range */
+ if (minor < DYNAMIC_MINORS) {
+ minor = DYNAMIC_MINORS - minor - 1;
+ ret = ida_alloc_range(&misc_minors_ida, minor, minor, GFP_KERNEL);
+ } else if (minor > MISC_DYNAMIC_MINOR) {
+ ret = ida_alloc_range(&misc_minors_ida, minor, minor, GFP_KERNEL);
+ } else {
+ /* case of non-dynamic minors, no need to allocate id */
+ ret = 0;
+ }
}
return ret;
}
@@ -219,7 +233,7 @@ int misc_register(struct miscdevice *misc)
mutex_lock(&misc_mtx);
if (is_dynamic) {
- int i = misc_minor_alloc();
+ int i = misc_minor_alloc(misc->minor);
if (i < 0) {
err = -EBUSY;
@@ -228,6 +242,7 @@ int misc_register(struct miscdevice *misc)
misc->minor = i;
} else {
struct miscdevice *c;
+ int i;
list_for_each_entry(c, &misc_list, list) {
if (c->minor == misc->minor) {
@@ -235,6 +250,12 @@ int misc_register(struct miscdevice *misc)
goto out;
}
}
+
+ i = misc_minor_alloc(misc->minor);
+ if (i < 0) {
+ err = -EBUSY;
+ goto out;
+ }
}
dev = MKDEV(MISC_MAJOR, misc->minor);
--
2.17.1
misc_minor_alloc was allocating id using ida for minor only in case of
MISC_DYNAMIC_MINOR but misc_minor_free was always freeing ids
using ida_free causing a mismatch and following warn:
> > WARNING: CPU: 0 PID: 159 at lib/idr.c:525 ida_free+0x3e0/0x41f
> > ida_free called for id=127 which is not allocated.
> > <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
...
> > [<60941eb4>] ida_free+0x3e0/0x41f
> > [<605ac993>] misc_minor_free+0x3e/0xbc
> > [<605acb82>] misc_deregister+0x171/0x1b3
misc_minor_alloc is changed to allocate id from ida for all minors
falling in the range of dynamic/ misc dynamic minors
Fixes: ab760791c0cf ("char: misc: Increase the maximum number of dynamic misc devices to 1048448")
Signed-off-by: Vimal Agrawal <vimal.agrawal(a)sophos.com>
Cc: stable(a)vger.kernel.org
---
v2: Added Fixes:
added missed case for static minor in misc_minor_alloc
v3: removed kunit changes as that will be added as second patch in this two patch series
drivers/char/misc.c | 35 ++++++++++++++++++++++++++++-------
1 file changed, 28 insertions(+), 7 deletions(-)
diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 541edc26ec89..9d0cd3459b4f 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -63,16 +63,30 @@ static DEFINE_MUTEX(misc_mtx);
#define DYNAMIC_MINORS 128 /* like dynamic majors */
static DEFINE_IDA(misc_minors_ida);
-static int misc_minor_alloc(void)
+static int misc_minor_alloc(int minor)
{
int ret = 0;
- ret = ida_alloc_max(&misc_minors_ida, DYNAMIC_MINORS - 1, GFP_KERNEL);
- if (ret >= 0) {
- ret = DYNAMIC_MINORS - ret - 1;
- } else {
- ret = ida_alloc_range(&misc_minors_ida, MISC_DYNAMIC_MINOR + 1,
+ if (minor == MISC_DYNAMIC_MINOR) {
+ /* allocate free id */
+ ret = ida_alloc_max(&misc_minors_ida, DYNAMIC_MINORS - 1, GFP_KERNEL);
+ if (ret >= 0) {
+ ret = DYNAMIC_MINORS - ret - 1;
+ } else {
+ ret = ida_alloc_range(&misc_minors_ida, MISC_DYNAMIC_MINOR + 1,
MINORMASK, GFP_KERNEL);
+ }
+ } else {
+ /* specific minor, check if it is in dynamic or misc dynamic range */
+ if (minor < DYNAMIC_MINORS) {
+ minor = DYNAMIC_MINORS - minor - 1;
+ ret = ida_alloc_range(&misc_minors_ida, minor, minor, GFP_KERNEL);
+ } else if (minor > MISC_DYNAMIC_MINOR) {
+ ret = ida_alloc_range(&misc_minors_ida, minor, minor, GFP_KERNEL);
+ } else {
+ /* case of non-dynamic minors, no need to allocate id */
+ ret = 0;
+ }
}
return ret;
}
@@ -219,7 +233,7 @@ int misc_register(struct miscdevice *misc)
mutex_lock(&misc_mtx);
if (is_dynamic) {
- int i = misc_minor_alloc();
+ int i = misc_minor_alloc(misc->minor);
if (i < 0) {
err = -EBUSY;
@@ -228,6 +242,7 @@ int misc_register(struct miscdevice *misc)
misc->minor = i;
} else {
struct miscdevice *c;
+ int i;
list_for_each_entry(c, &misc_list, list) {
if (c->minor == misc->minor) {
@@ -235,6 +250,12 @@ int misc_register(struct miscdevice *misc)
goto out;
}
}
+
+ i = misc_minor_alloc(misc->minor);
+ if (i < 0) {
+ err = -EBUSY;
+ goto out;
+ }
}
dev = MKDEV(MISC_MAJOR, misc->minor);
--
2.17.1
misc_minor_alloc was allocating id using ida for minor only in case of
MISC_DYNAMIC_MINOR but misc_minor_free was always freeing ids
using ida_free causing a mismatch and following warn:
> > WARNING: CPU: 0 PID: 159 at lib/idr.c:525 ida_free+0x3e0/0x41f
> > ida_free called for id=127 which is not allocated.
> > <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
...
> > [<60941eb4>] ida_free+0x3e0/0x41f
> > [<605ac993>] misc_minor_free+0x3e/0xbc
> > [<605acb82>] misc_deregister+0x171/0x1b3
misc_minor_alloc is changed to allocate id from ida for all minors
falling in the range of dynamic/ misc dynamic minors
Fixes: ab760791c0cf ("char: misc: Increase the maximum number of dynamic misc devices to 1048448")
Signed-off-by: Vimal Agrawal <vimal.agrawal(a)sophos.com>
Cc: stable(a)vger.kernel.org
---
v2: Added Fixes:
added missed case for static minor in misc_minor_alloc
v3: removed kunit changes as that will be added as second patch in this two patch series
drivers/char/misc.c | 35 ++++++++++++++++++++++++++++-------
1 file changed, 28 insertions(+), 7 deletions(-)
diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 541edc26ec89..9d0cd3459b4f 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -63,16 +63,30 @@ static DEFINE_MUTEX(misc_mtx);
#define DYNAMIC_MINORS 128 /* like dynamic majors */
static DEFINE_IDA(misc_minors_ida);
-static int misc_minor_alloc(void)
+static int misc_minor_alloc(int minor)
{
int ret;
- ret = ida_alloc_max(&misc_minors_ida, DYNAMIC_MINORS - 1, GFP_KERNEL);
- if (ret >= 0) {
- ret = DYNAMIC_MINORS - ret - 1;
- } else {
- ret = ida_alloc_range(&misc_minors_ida, MISC_DYNAMIC_MINOR + 1,
+ if (minor == MISC_DYNAMIC_MINOR) {
+ /* allocate free id */
+ ret = ida_alloc_max(&misc_minors_ida, DYNAMIC_MINORS - 1, GFP_KERNEL);
+ if (ret >= 0) {
+ ret = DYNAMIC_MINORS - ret - 1;
+ } else {
+ ret = ida_alloc_range(&misc_minors_ida, MISC_DYNAMIC_MINOR + 1,
MINORMASK, GFP_KERNEL);
+ }
+ } else {
+ /* specific minor, check if it is in dynamic or misc dynamic range */
+ if (minor < DYNAMIC_MINORS) {
+ minor = DYNAMIC_MINORS - minor - 1;
+ ret = ida_alloc_range(&misc_minors_ida, minor, minor, GFP_KERNEL);
+ } else if (minor > MISC_DYNAMIC_MINOR) {
+ ret = ida_alloc_range(&misc_minors_ida, minor, minor, GFP_KERNEL);
+ } else {
+ /* case of non-dynamic minors, no need to allocate id */
+ ret = 0;
+ }
}
return ret;
}
@@ -219,7 +233,7 @@ int misc_register(struct miscdevice *misc)
mutex_lock(&misc_mtx);
if (is_dynamic) {
- int i = misc_minor_alloc();
+ int i = misc_minor_alloc(misc->minor);
if (i < 0) {
err = -EBUSY;
@@ -228,6 +242,7 @@ int misc_register(struct miscdevice *misc)
misc->minor = i;
} else {
struct miscdevice *c;
+ int i;
list_for_each_entry(c, &misc_list, list) {
if (c->minor == misc->minor) {
@@ -235,6 +250,12 @@ int misc_register(struct miscdevice *misc)
goto out;
}
}
+
+ i = misc_minor_alloc(misc->minor);
+ if (i < 0) {
+ err = -EBUSY;
+ goto out;
+ }
}
dev = MKDEV(MISC_MAJOR, misc->minor);
--
2.17.1
If there is an error during some initialization related to firmware,
the buffers dp->tx_ring[i].tx_status are released.
However this is released again when the device is unbinded (ath12k_pci),
and we get:
WARNING: CPU: 0 PID: 2098 at mm/slub.c:4689 free_large_kmalloc+0x4d/0x80
Call Trace:
free_large_kmalloc
ath12k_dp_free
ath12k_core_deinit
ath12k_pci_remove
...
The issue is always reproducible from a VM because the MSI addressing
initialization is failing.
In order to fix the issue, just set the buffers to NULL after releasing in
order to avoid the double free.
cc: stable(a)vger.kernel.org
Fixes: d889913205cf ("wifi: ath12k: driver for Qualcomm Wi-Fi 7 devices")
Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm(a)redhat.com>
---
v3:
- Remove unnecessary check to not free if buffer is NULL.
- Trim backtrace.
- Fix typos.
v2: https://lore.kernel.org/linux-wireless/20241016123722.206899-1-jtornosm@red…
v1: https://lore.kernel.org/linux-wireless/20241010175102.207324-3-jtornosm@red…
drivers/net/wireless/ath/ath12k/dp.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/net/wireless/ath/ath12k/dp.c b/drivers/net/wireless/ath/ath12k/dp.c
index 789d430e4455..15061782a2df 100644
--- a/drivers/net/wireless/ath/ath12k/dp.c
+++ b/drivers/net/wireless/ath/ath12k/dp.c
@@ -1277,8 +1277,10 @@ void ath12k_dp_free(struct ath12k_base *ab)
ath12k_dp_rx_reo_cmd_list_cleanup(ab);
- for (i = 0; i < ab->hw_params->max_tx_ring; i++)
+ for (i = 0; i < ab->hw_params->max_tx_ring; i++) {
kfree(dp->tx_ring[i].tx_status);
+ dp->tx_ring[i].tx_status = NULL;
+ }
ath12k_dp_rx_free(ab);
/* Deinit any SOC level resource */
--
2.47.0
If there is an error during some initialization related to firmware,
the function ath12k_dp_cc_cleanup is called to release resources.
However this is released again when the device is unbinded (ath12k_pci),
and we get:
BUG: kernel NULL pointer dereference, address: 0000000000000020
at RIP: 0010:ath12k_dp_cc_cleanup.part.0+0xb6/0x500 [ath12k]
Call Trace:
ath12k_dp_cc_cleanup
ath12k_dp_free
ath12k_core_deinit
ath12k_pci_remove
...
The issue is always reproducible from a VM because the MSI addressing
initialization is failing.
In order to fix the issue, just set to NULL the released structure in
ath12k_dp_cc_cleanup at the end.
cc: stable(a)vger.kernel.org
Fixes: d889913205cf ("wifi: ath12k: driver for Qualcomm Wi-Fi 7 devices")
Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm(a)redhat.com>
---
v3:
- Trim backtrace.
- Fix typos.
v2: https://lore.kernel.org/linux-wireless/20241016123452.206671-1-jtornosm@red…
v1: https://lore.kernel.org/linux-wireless/20241010175102.207324-2-jtornosm@red…
drivers/net/wireless/ath/ath12k/dp.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/net/wireless/ath/ath12k/dp.c b/drivers/net/wireless/ath/ath12k/dp.c
index 61aa78d8bd8c..789d430e4455 100644
--- a/drivers/net/wireless/ath/ath12k/dp.c
+++ b/drivers/net/wireless/ath/ath12k/dp.c
@@ -1241,6 +1241,7 @@ static void ath12k_dp_cc_cleanup(struct ath12k_base *ab)
}
kfree(dp->spt_info);
+ dp->spt_info = NULL;
}
static void ath12k_dp_reoq_lut_cleanup(struct ath12k_base *ab)
--
2.46.2
The quilt patch titled
Subject: mm/mglru: only clear kswapd_failures if reclaimable
has been removed from the -mm tree. Its filename was
mm-mglru-only-clear-kswapd_failures-if-reclaimable.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Wei Xu <weixugc(a)google.com>
Subject: mm/mglru: only clear kswapd_failures if reclaimable
Date: Mon, 14 Oct 2024 22:12:11 +0000
lru_gen_shrink_node() unconditionally clears kswapd_failures, which can
prevent kswapd from sleeping and cause 100% kswapd cpu usage even when
kswapd repeatedly fails to make progress in reclaim.
Only clear kswap_failures in lru_gen_shrink_node() if reclaim makes some
progress, similar to shrink_node().
I happened to run into this problem in one of my tests recently. It
requires a combination of several conditions: The allocator needs to
allocate a right amount of pages such that it can wake up kswapd
without itself being OOM killed; there is no memory for kswapd to
reclaim (My test disables swap and cleans page cache first); no other
process frees enough memory at the same time.
Link: https://lkml.kernel.org/r/20241014221211.832591-1-weixugc@google.com
Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Signed-off-by: Wei Xu <weixugc(a)google.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: Brian Geffon <bgeffon(a)google.com>
Cc: Jan Alexander Steffens <heftig(a)archlinux.org>
Cc: Suleiman Souhlal <suleiman(a)google.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/vmscan.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/mm/vmscan.c~mm-mglru-only-clear-kswapd_failures-if-reclaimable
+++ a/mm/vmscan.c
@@ -4963,8 +4963,8 @@ static void lru_gen_shrink_node(struct p
blk_finish_plug(&plug);
done:
- /* kswapd should never fail */
- pgdat->kswapd_failures = 0;
+ if (sc->nr_reclaimed > reclaimed)
+ pgdat->kswapd_failures = 0;
}
/******************************************************************************
_
Patches currently in -mm which might be from weixugc(a)google.com are
The quilt patch titled
Subject: mm/swapfile: skip HugeTLB pages for unuse_vma
has been removed from the -mm tree. Its filename was
mm-swapfile-skip-hugetlb-pages-for-unuse_vma.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Liu Shixin <liushixin2(a)huawei.com>
Subject: mm/swapfile: skip HugeTLB pages for unuse_vma
Date: Tue, 15 Oct 2024 09:45:21 +0800
I got a bad pud error and lost a 1GB HugeTLB when calling swapoff. The
problem can be reproduced by the following steps:
1. Allocate an anonymous 1GB HugeTLB and some other anonymous memory.
2. Swapout the above anonymous memory.
3. run swapoff and we will get a bad pud error in kernel message:
mm/pgtable-generic.c:42: bad pud 00000000743d215d(84000001400000e7)
We can tell that pud_clear_bad is called by pud_none_or_clear_bad in
unuse_pud_range() by ftrace. And therefore the HugeTLB pages will never
be freed because we lost it from page table. We can skip HugeTLB pages
for unuse_vma to fix it.
Link: https://lkml.kernel.org/r/20241015014521.570237-1-liushixin2@huawei.com
Fixes: 0fe6e20b9c4c ("hugetlb, rmap: add reverse mapping for hugepage")
Signed-off-by: Liu Shixin <liushixin2(a)huawei.com>
Acked-by: Muchun Song <muchun.song(a)linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/swapfile.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/swapfile.c~mm-swapfile-skip-hugetlb-pages-for-unuse_vma
+++ a/mm/swapfile.c
@@ -2313,7 +2313,7 @@ static int unuse_mm(struct mm_struct *mm
mmap_read_lock(mm);
for_each_vma(vmi, vma) {
- if (vma->anon_vma) {
+ if (vma->anon_vma && !is_vm_hugetlb_page(vma)) {
ret = unuse_vma(vma, type);
if (ret)
break;
_
Patches currently in -mm which might be from liushixin2(a)huawei.com are
The quilt patch titled
Subject: mm: don't install PMD mappings when THPs are disabled by the hw/process/vma
has been removed from the -mm tree. Its filename was
mm-dont-install-pmd-mappings-when-thps-are-disabled-by-the-hw-process-vma.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: David Hildenbrand <david(a)redhat.com>
Subject: mm: don't install PMD mappings when THPs are disabled by the hw/process/vma
Date: Fri, 11 Oct 2024 12:24:45 +0200
We (or rather, readahead logic :) ) might be allocating a THP in the
pagecache and then try mapping it into a process that explicitly disabled
THP: we might end up installing PMD mappings.
This is a problem for s390x KVM, which explicitly remaps all PMD-mapped
THPs to be PTE-mapped in s390_enable_sie()->thp_split_mm(), before
starting the VM.
For example, starting a VM backed on a file system with large folios
supported makes the VM crash when the VM tries accessing such a mapping
using KVM.
Is it also a problem when the HW disabled THP using
TRANSPARENT_HUGEPAGE_UNSUPPORTED? At least on x86 this would be the case
without X86_FEATURE_PSE.
In the future, we might be able to do better on s390x and only disallow
PMD mappings -- what s390x and likely TRANSPARENT_HUGEPAGE_UNSUPPORTED
really wants. For now, fix it by essentially performing the same check as
would be done in __thp_vma_allowable_orders() or in shmem code, where this
works as expected, and disallow PMD mappings, making us fallback to PTE
mappings.
Link: https://lkml.kernel.org/r/20241011102445.934409-3-david@redhat.com
Fixes: 793917d997df ("mm/readahead: Add large folio readahead")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reported-by: Leo Fu <bfu(a)redhat.com>
Tested-by: Thomas Huth <thuth(a)redhat.com>
Cc: Thomas Huth <thuth(a)redhat.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Cc: Janosch Frank <frankja(a)linux.ibm.com>
Cc: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory.c | 9 +++++++++
1 file changed, 9 insertions(+)
--- a/mm/memory.c~mm-dont-install-pmd-mappings-when-thps-are-disabled-by-the-hw-process-vma
+++ a/mm/memory.c
@@ -4920,6 +4920,15 @@ vm_fault_t do_set_pmd(struct vm_fault *v
pmd_t entry;
vm_fault_t ret = VM_FAULT_FALLBACK;
+ /*
+ * It is too late to allocate a small folio, we already have a large
+ * folio in the pagecache: especially s390 KVM cannot tolerate any
+ * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
+ * PMD mappings if THPs are disabled.
+ */
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
+ return ret;
+
if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
return ret;
_
Patches currently in -mm which might be from david(a)redhat.com are
mm-pagewalk-fix-usage-of-pmd_leaf-pud_leaf-without-present-check.patch
selftests-mm-hugetlb_fault_after_madv-use-default-hguetlb-page-size.patch
selftests-mm-hugetlb_fault_after_madv-improve-test-output.patch
The quilt patch titled
Subject: mm: huge_memory: add vma_thp_disabled() and thp_disabled_by_hw()
has been removed from the -mm tree. Its filename was
mm-huge_memory-add-vma_thp_disabled-and-thp_disabled_by_hw.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Subject: mm: huge_memory: add vma_thp_disabled() and thp_disabled_by_hw()
Date: Fri, 11 Oct 2024 12:24:44 +0200
Patch series "mm: don't install PMD mappings when THPs are disabled by the
hw/process/vma".
During testing, it was found that we can get PMD mappings in processes
where THP (and more precisely, PMD mappings) are supposed to be disabled.
While it works as expected for anon+shmem, the pagecache is the
problematic bit.
For s390 KVM this currently means that a VM backed by a file located on
filesystem with large folio support can crash when KVM tries accessing the
problematic page, because the readahead logic might decide to use a
PMD-sized THP and faulting it into the page tables will install a PMD
mapping, something that s390 KVM cannot tolerate.
This might also be a problem with HW that does not support PMD mappings,
but I did not try reproducing it.
Fix it by respecting the ways to disable THPs when deciding whether we can
install a PMD mapping. khugepaged should already be taking care of not
collapsing if THPs are effectively disabled for the hw/process/vma.
This patch (of 2):
Add vma_thp_disabled() and thp_disabled_by_hw() helpers to be shared by
shmem_allowable_huge_orders() and __thp_vma_allowable_orders().
[david(a)redhat.com: rename to vma_thp_disabled(), split out thp_disabled_by_hw() ]
Link: https://lkml.kernel.org/r/20241011102445.934409-2-david@redhat.com
Fixes: 793917d997df ("mm/readahead: Add large folio readahead")
Signed-off-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reported-by: Leo Fu <bfu(a)redhat.com>
Tested-by: Thomas Huth <thuth(a)redhat.com>
Reviewed-by: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Boqiao Fu <bfu(a)redhat.com>
Cc: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Cc: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Janosch Frank <frankja(a)linux.ibm.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/huge_mm.h | 18 ++++++++++++++++++
mm/huge_memory.c | 13 +------------
mm/shmem.c | 7 +------
3 files changed, 20 insertions(+), 18 deletions(-)
--- a/include/linux/huge_mm.h~mm-huge_memory-add-vma_thp_disabled-and-thp_disabled_by_hw
+++ a/include/linux/huge_mm.h
@@ -322,6 +322,24 @@ struct thpsize {
(transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
+static inline bool vma_thp_disabled(struct vm_area_struct *vma,
+ unsigned long vm_flags)
+{
+ /*
+ * Explicitly disabled through madvise or prctl, or some
+ * architectures may disable THP for some mappings, for
+ * example, s390 kvm.
+ */
+ return (vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags);
+}
+
+static inline bool thp_disabled_by_hw(void)
+{
+ /* If the hardware/firmware marked hugepage support disabled. */
+ return transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED);
+}
+
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags);
unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
--- a/mm/huge_memory.c~mm-huge_memory-add-vma_thp_disabled-and-thp_disabled_by_hw
+++ a/mm/huge_memory.c
@@ -109,18 +109,7 @@ unsigned long __thp_vma_allowable_orders
if (!vma->vm_mm) /* vdso */
return 0;
- /*
- * Explicitly disabled through madvise or prctl, or some
- * architectures may disable THP for some mappings, for
- * example, s390 kvm.
- * */
- if ((vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
- return 0;
- /*
- * If the hardware/firmware marked hugepage support disabled.
- */
- if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
return 0;
/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
--- a/mm/shmem.c~mm-huge_memory-add-vma_thp_disabled-and-thp_disabled_by_hw
+++ a/mm/shmem.c
@@ -1664,12 +1664,7 @@ unsigned long shmem_allowable_huge_order
loff_t i_size;
int order;
- if (vma && ((vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
- return 0;
-
- /* If the hardware/firmware marked hugepage support disabled. */
- if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
+ if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
return 0;
global_huge = shmem_huge_global_enabled(inode, index, write_end,
_
Patches currently in -mm which might be from wangkefeng.wang(a)huawei.com are
mm-remove-unused-hugepage-for-vma_alloc_folio.patch
The quilt patch titled
Subject: mm: khugepaged: fix the arguments order in khugepaged_collapse_file trace point
has been removed from the -mm tree. Its filename was
mm-khugepaged-fix-the-arguments-order-in-khugepaged_collapse_file-trace-point.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Yang Shi <yang(a)os.amperecomputing.com>
Subject: mm: khugepaged: fix the arguments order in khugepaged_collapse_file trace point
Date: Fri, 11 Oct 2024 18:17:02 -0700
The "addr" and "is_shmem" arguments have different order in TP_PROTO and
TP_ARGS. This resulted in the incorrect trace result:
text-hugepage-644429 [276] 392092.878683: mm_khugepaged_collapse_file:
mm=0xffff20025d52c440, hpage_pfn=0x200678c00, index=512, addr=1, is_shmem=0,
filename=text-hugepage, nr=512, result=failed
The value of "addr" is wrong because it was treated as bool value, the
type of is_shmem.
Fix the order in TP_PROTO to keep "addr" is before "is_shmem" since the
original patch review suggested this order to achieve best packing.
And use "lx" for "addr" instead of "ld" in TP_printk because address is
typically shown in hex.
After the fix, the trace result looks correct:
text-hugepage-7291 [004] 128.627251: mm_khugepaged_collapse_file:
mm=0xffff0001328f9500, hpage_pfn=0x20016ea00, index=512, addr=0x400000,
is_shmem=0, filename=text-hugepage, nr=512, result=failed
Link: https://lkml.kernel.org/r/20241012011702.1084846-1-yang@os.amperecomputing.…
Fixes: 4c9473e87e75 ("mm/khugepaged: add tracepoint to collapse_file()")
Signed-off-by: Yang Shi <yang(a)os.amperecomputing.com>
Cc: Gautam Menghani <gautammenghani201(a)gmail.com>
Cc: Steven Rostedt (Google) <rostedt(a)goodmis.org>
Cc: <stable(a)vger.kernel.org> [6.2+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/trace/events/huge_memory.h | 4 ++--
mm/khugepaged.c | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
--- a/include/trace/events/huge_memory.h~mm-khugepaged-fix-the-arguments-order-in-khugepaged_collapse_file-trace-point
+++ a/include/trace/events/huge_memory.h
@@ -208,7 +208,7 @@ TRACE_EVENT(mm_khugepaged_scan_file,
TRACE_EVENT(mm_khugepaged_collapse_file,
TP_PROTO(struct mm_struct *mm, struct folio *new_folio, pgoff_t index,
- bool is_shmem, unsigned long addr, struct file *file,
+ unsigned long addr, bool is_shmem, struct file *file,
int nr, int result),
TP_ARGS(mm, new_folio, index, addr, is_shmem, file, nr, result),
TP_STRUCT__entry(
@@ -233,7 +233,7 @@ TRACE_EVENT(mm_khugepaged_collapse_file,
__entry->result = result;
),
- TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%ld, is_shmem=%d, filename=%s, nr=%d, result=%s",
+ TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%lx, is_shmem=%d, filename=%s, nr=%d, result=%s",
__entry->mm,
__entry->hpfn,
__entry->index,
--- a/mm/khugepaged.c~mm-khugepaged-fix-the-arguments-order-in-khugepaged_collapse_file-trace-point
+++ a/mm/khugepaged.c
@@ -2227,7 +2227,7 @@ rollback:
folio_put(new_folio);
out:
VM_BUG_ON(!list_empty(&pagelist));
- trace_mm_khugepaged_collapse_file(mm, new_folio, index, is_shmem, addr, file, HPAGE_PMD_NR, result);
+ trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result);
return result;
}
_
Patches currently in -mm which might be from yang(a)os.amperecomputing.com are
The quilt patch titled
Subject: mm/damon/tests/sysfs-kunit.h: fix memory leak in damon_sysfs_test_add_targets()
has been removed from the -mm tree. Its filename was
mm-damon-fix-memory-leak-in-damon_sysfs_test_add_targets.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Jinjie Ruan <ruanjinjie(a)huawei.com>
Subject: mm/damon/tests/sysfs-kunit.h: fix memory leak in damon_sysfs_test_add_targets()
Date: Thu, 10 Oct 2024 20:53:23 +0800
The sysfs_target->regions allocated in damon_sysfs_regions_alloc() is not
freed in damon_sysfs_test_add_targets(), which cause the following memory
leak, free it to fix it.
unreferenced object 0xffffff80c2a8db80 (size 96):
comm "kunit_try_catch", pid 187, jiffies 4294894363
hex dump (first 32 bytes):
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
backtrace (crc 0):
[<0000000001e3714d>] kmemleak_alloc+0x34/0x40
[<000000008e6835c1>] __kmalloc_cache_noprof+0x26c/0x2f4
[<000000001286d9f8>] damon_sysfs_test_add_targets+0x1cc/0x738
[<0000000032ef8f77>] kunit_try_run_case+0x13c/0x3ac
[<00000000f3edea23>] kunit_generic_run_threadfn_adapter+0x80/0xec
[<00000000adf936cf>] kthread+0x2e8/0x374
[<0000000041bb1628>] ret_from_fork+0x10/0x20
Link: https://lkml.kernel.org/r/20241010125323.3127187-1-ruanjinjie@huawei.com
Fixes: b8ee5575f763 ("mm/damon/sysfs-test: add a unit test for damon_sysfs_set_targets()")
Signed-off-by: Jinjie Ruan <ruanjinjie(a)huawei.com>
Reviewed-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/tests/sysfs-kunit.h | 1 +
1 file changed, 1 insertion(+)
--- a/mm/damon/tests/sysfs-kunit.h~mm-damon-fix-memory-leak-in-damon_sysfs_test_add_targets
+++ a/mm/damon/tests/sysfs-kunit.h
@@ -67,6 +67,7 @@ static void damon_sysfs_test_add_targets
damon_destroy_ctx(ctx);
kfree(sysfs_targets->targets_arr);
kfree(sysfs_targets);
+ kfree(sysfs_target->regions);
kfree(sysfs_target);
}
_
Patches currently in -mm which might be from ruanjinjie(a)huawei.com are
The quilt patch titled
Subject: lib: alloc_tag_module_unload must wait for pending kfree_rcu calls
has been removed from the -mm tree. Its filename was
lib-alloc_tag_module_unload-must-wait-for-pending-kfree_rcu-calls.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Florian Westphal <fw(a)strlen.de>
Subject: lib: alloc_tag_module_unload must wait for pending kfree_rcu calls
Date: Mon, 7 Oct 2024 22:52:24 +0200
Ben Greear reports following splat:
------------[ cut here ]------------
net/netfilter/nf_nat_core.c:1114 module nf_nat func:nf_nat_register_fn has 256 allocated at module unload
WARNING: CPU: 1 PID: 10421 at lib/alloc_tag.c:168 alloc_tag_module_unload+0x22b/0x3f0
Modules linked in: nf_nat(-) btrfs ufs qnx4 hfsplus hfs minix vfat msdos fat
...
Hardware name: Default string Default string/SKYBAY, BIOS 5.12 08/04/2020
RIP: 0010:alloc_tag_module_unload+0x22b/0x3f0
codetag_unload_module+0x19b/0x2a0
? codetag_load_module+0x80/0x80
nf_nat module exit calls kfree_rcu on those addresses, but the free
operation is likely still pending by the time alloc_tag checks for leaks.
Wait for outstanding kfree_rcu operations to complete before checking
resolves this warning.
Reproducer:
unshare -n iptables-nft -t nat -A PREROUTING -p tcp
grep nf_nat /proc/allocinfo # will list 4 allocations
rmmod nft_chain_nat
rmmod nf_nat # will WARN.
[akpm(a)linux-foundation.org: add comment]
Link: https://lkml.kernel.org/r/20241007205236.11847-1-fw@strlen.de
Fixes: a473573964e5 ("lib: code tagging module support")
Signed-off-by: Florian Westphal <fw(a)strlen.de>
Reported-by: Ben Greear <greearb(a)candelatech.com>
Closes: https://lore.kernel.org/netdev/bdaaef9d-4364-4171-b82b-bcfc12e207eb@candela…
Cc: Uladzislau Rezki <urezki(a)gmail.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
lib/codetag.c | 3 +++
1 file changed, 3 insertions(+)
--- a/lib/codetag.c~lib-alloc_tag_module_unload-must-wait-for-pending-kfree_rcu-calls
+++ a/lib/codetag.c
@@ -228,6 +228,9 @@ bool codetag_unload_module(struct module
if (!mod)
return true;
+ /* await any module's kfree_rcu() operations to complete */
+ kvfree_rcu_barrier();
+
mutex_lock(&codetag_lock);
list_for_each_entry(cttype, &codetag_types, link) {
struct codetag_module *found = NULL;
_
Patches currently in -mm which might be from fw(a)strlen.de are
The quilt patch titled
Subject: mm/mremap: fix move_normal_pmd/retract_page_tables race
has been removed from the -mm tree. Its filename was
mm-mremap-fix-move_normal_pmd-retract_page_tables-race.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Jann Horn <jannh(a)google.com>
Subject: mm/mremap: fix move_normal_pmd/retract_page_tables race
Date: Mon, 07 Oct 2024 23:42:04 +0200
In mremap(), move_page_tables() looks at the type of the PMD entry and the
specified address range to figure out by which method the next chunk of
page table entries should be moved.
At that point, the mmap_lock is held in write mode, but no rmap locks are
held yet. For PMD entries that point to page tables and are fully covered
by the source address range, move_pgt_entry(NORMAL_PMD, ...) is called,
which first takes rmap locks, then does move_normal_pmd().
move_normal_pmd() takes the necessary page table locks at source and
destination, then moves an entire page table from the source to the
destination.
The problem is: The rmap locks, which protect against concurrent page
table removal by retract_page_tables() in the THP code, are only taken
after the PMD entry has been read and it has been decided how to move it.
So we can race as follows (with two processes that have mappings of the
same tmpfs file that is stored on a tmpfs mount with huge=advise); note
that process A accesses page tables through the MM while process B does it
through the file rmap:
process A process B
========= =========
mremap
mremap_to
move_vma
move_page_tables
get_old_pmd
alloc_new_pmd
*** PREEMPT ***
madvise(MADV_COLLAPSE)
do_madvise
madvise_walk_vmas
madvise_vma_behavior
madvise_collapse
hpage_collapse_scan_file
collapse_file
retract_page_tables
i_mmap_lock_read(mapping)
pmdp_collapse_flush
i_mmap_unlock_read(mapping)
move_pgt_entry(NORMAL_PMD, ...)
take_rmap_locks
move_normal_pmd
drop_rmap_locks
When this happens, move_normal_pmd() can end up creating bogus PMD entries
in the line `pmd_populate(mm, new_pmd, pmd_pgtable(pmd))`. The effect
depends on arch-specific and machine-specific details; on x86, you can end
up with physical page 0 mapped as a page table, which is likely
exploitable for user->kernel privilege escalation.
Fix the race by letting process B recheck that the PMD still points to a
page table after the rmap locks have been taken. Otherwise, we bail and
let the caller fall back to the PTE-level copying path, which will then
bail immediately at the pmd_none() check.
Bug reachability: Reaching this bug requires that you can create
shmem/file THP mappings - anonymous THP uses different code that doesn't
zap stuff under rmap locks. File THP is gated on an experimental config
flag (CONFIG_READ_ONLY_THP_FOR_FS), so on normal distro kernels you need
shmem THP to hit this bug. As far as I know, getting shmem THP normally
requires that you can mount your own tmpfs with the right mount flags,
which would require creating your own user+mount namespace; though I don't
know if some distros maybe enable shmem THP by default or something like
that.
Bug impact: This issue can likely be used for user->kernel privilege
escalation when it is reachable.
Link: https://lkml.kernel.org/r/20241007-move_normal_pmd-vs-collapse-fix-2-v1-1-5…
Fixes: 1d65b771bc08 ("mm/khugepaged: retract_page_tables() without mmap or vma lock")
Signed-off-by: Jann Horn <jannh(a)google.com>
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Co-developed-by: David Hildenbrand <david(a)redhat.com>
Closes: https://project-zero.issues.chromium.org/371047675
Acked-by: Qi Zheng <zhengqi.arch(a)bytedance.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Joel Fernandes <joel(a)joelfernandes.org>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mremap.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
--- a/mm/mremap.c~mm-mremap-fix-move_normal_pmd-retract_page_tables-race
+++ a/mm/mremap.c
@@ -238,6 +238,7 @@ static bool move_normal_pmd(struct vm_ar
{
spinlock_t *old_ptl, *new_ptl;
struct mm_struct *mm = vma->vm_mm;
+ bool res = false;
pmd_t pmd;
if (!arch_supports_page_table_move())
@@ -277,19 +278,25 @@ static bool move_normal_pmd(struct vm_ar
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
- /* Clear the pmd */
pmd = *old_pmd;
+
+ /* Racing with collapse? */
+ if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
+ goto out_unlock;
+ /* Clear the pmd */
pmd_clear(old_pmd);
+ res = true;
VM_BUG_ON(!pmd_none(*new_pmd));
pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+out_unlock:
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
- return true;
+ return res;
}
#else
static inline bool move_normal_pmd(struct vm_area_struct *vma,
_
Patches currently in -mm which might be from jannh(a)google.com are
mm-mark-mas-allocation-in-vms_abort_munmap_vmas-as-__gfp_nofail.patch
The quilt patch titled
Subject: selftests/mm: fix deadlock for fork after pthread_create on ARM
has been removed from the -mm tree. Its filename was
selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Edward Liaw <edliaw(a)google.com>
Subject: selftests/mm: fix deadlock for fork after pthread_create on ARM
Date: Thu, 3 Oct 2024 21:17:11 +0000
On Android with arm, there is some synchronization needed to avoid a
deadlock when forking after pthread_create.
Link: https://lkml.kernel.org/r/20241003211716.371786-3-edliaw@google.com
Fixes: cff294582798 ("selftests/mm: extend and rename uffd pagemap test")
Signed-off-by: Edward Liaw <edliaw(a)google.com>
Cc: Lokesh Gidra <lokeshgidra(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/mm/uffd-unit-tests.c | 7 +++++++
1 file changed, 7 insertions(+)
--- a/tools/testing/selftests/mm/uffd-unit-tests.c~selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm
+++ a/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -241,6 +241,9 @@ static void *fork_event_consumer(void *d
fork_event_args *args = data;
struct uffd_msg msg = { 0 };
+ /* Ready for parent thread to fork */
+ pthread_barrier_wait(&ready_for_fork);
+
/* Read until a full msg received */
while (uffd_read_msg(args->parent_uffd, &msg));
@@ -308,8 +311,12 @@ static int pagemap_test_fork(int uffd, b
/* Prepare a thread to resolve EVENT_FORK */
if (with_event) {
+ pthread_barrier_init(&ready_for_fork, NULL, 2);
if (pthread_create(&thread, NULL, fork_event_consumer, &args))
err("pthread_create()");
+ /* Wait for child thread to start before forking */
+ pthread_barrier_wait(&ready_for_fork);
+ pthread_barrier_destroy(&ready_for_fork);
}
child = fork();
_
Patches currently in -mm which might be from edliaw(a)google.com are
The quilt patch titled
Subject: selftests/mm: replace atomic_bool with pthread_barrier_t
has been removed from the -mm tree. Its filename was
selftests-mm-replace-atomic_bool-with-pthread_barrier_t.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Edward Liaw <edliaw(a)google.com>
Subject: selftests/mm: replace atomic_bool with pthread_barrier_t
Date: Thu, 3 Oct 2024 21:17:10 +0000
Patch series "selftests/mm: fix deadlock after pthread_create".
On Android arm, pthread_create followed by a fork caused a deadlock in the
case where the fork required work to be completed by the created thread.
Update the synchronization primitive to use pthread_barrier instead of
atomic_bool.
Apply the same fix to the wp-fork-with-event test.
This patch (of 2):
Swap synchronization primitive with pthread_barrier, so that stdatomic.h
does not need to be included.
The synchronization is needed on Android ARM64; we see a deadlock with
pthread_create when the parent thread races forward before the child has a
chance to start doing work.
Link: https://lkml.kernel.org/r/20241003211716.371786-1-edliaw@google.com
Link: https://lkml.kernel.org/r/20241003211716.371786-2-edliaw@google.com
Fixes: cff294582798 ("selftests/mm: extend and rename uffd pagemap test")
Signed-off-by: Edward Liaw <edliaw(a)google.com>
Cc: Lokesh Gidra <lokeshgidra(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/mm/uffd-common.c | 5 +++--
tools/testing/selftests/mm/uffd-common.h | 3 +--
tools/testing/selftests/mm/uffd-unit-tests.c | 14 ++++++++------
3 files changed, 12 insertions(+), 10 deletions(-)
--- a/tools/testing/selftests/mm/uffd-common.c~selftests-mm-replace-atomic_bool-with-pthread_barrier_t
+++ a/tools/testing/selftests/mm/uffd-common.c
@@ -18,7 +18,7 @@ bool test_uffdio_wp = true;
unsigned long long *count_verify;
uffd_test_ops_t *uffd_test_ops;
uffd_test_case_ops_t *uffd_test_case_ops;
-atomic_bool ready_for_fork;
+pthread_barrier_t ready_for_fork;
static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
{
@@ -519,7 +519,8 @@ void *uffd_poll_thread(void *arg)
pollfd[1].fd = pipefd[cpu*2];
pollfd[1].events = POLLIN;
- ready_for_fork = true;
+ /* Ready for parent thread to fork */
+ pthread_barrier_wait(&ready_for_fork);
for (;;) {
ret = poll(pollfd, 2, -1);
--- a/tools/testing/selftests/mm/uffd-common.h~selftests-mm-replace-atomic_bool-with-pthread_barrier_t
+++ a/tools/testing/selftests/mm/uffd-common.h
@@ -33,7 +33,6 @@
#include <inttypes.h>
#include <stdint.h>
#include <sys/random.h>
-#include <stdatomic.h>
#include "../kselftest.h"
#include "vm_util.h"
@@ -105,7 +104,7 @@ extern bool map_shared;
extern bool test_uffdio_wp;
extern unsigned long long *count_verify;
extern volatile bool test_uffdio_copy_eexist;
-extern atomic_bool ready_for_fork;
+extern pthread_barrier_t ready_for_fork;
extern uffd_test_ops_t anon_uffd_test_ops;
extern uffd_test_ops_t shmem_uffd_test_ops;
--- a/tools/testing/selftests/mm/uffd-unit-tests.c~selftests-mm-replace-atomic_bool-with-pthread_barrier_t
+++ a/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -774,7 +774,7 @@ static void uffd_sigbus_test_common(bool
char c;
struct uffd_args args = { 0 };
- ready_for_fork = false;
+ pthread_barrier_init(&ready_for_fork, NULL, 2);
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
@@ -791,8 +791,9 @@ static void uffd_sigbus_test_common(bool
if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
err("uffd_poll_thread create");
- while (!ready_for_fork)
- ; /* Wait for the poll_thread to start executing before forking */
+ /* Wait for child thread to start before forking */
+ pthread_barrier_wait(&ready_for_fork);
+ pthread_barrier_destroy(&ready_for_fork);
pid = fork();
if (pid < 0)
@@ -833,7 +834,7 @@ static void uffd_events_test_common(bool
char c;
struct uffd_args args = { 0 };
- ready_for_fork = false;
+ pthread_barrier_init(&ready_for_fork, NULL, 2);
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
if (uffd_register(uffd, area_dst, nr_pages * page_size,
@@ -844,8 +845,9 @@ static void uffd_events_test_common(bool
if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args))
err("uffd_poll_thread create");
- while (!ready_for_fork)
- ; /* Wait for the poll_thread to start executing before forking */
+ /* Wait for child thread to start before forking */
+ pthread_barrier_wait(&ready_for_fork);
+ pthread_barrier_destroy(&ready_for_fork);
pid = fork();
if (pid < 0)
_
Patches currently in -mm which might be from edliaw(a)google.com are
The quilt patch titled
Subject: fat: fix uninitialized variable
has been removed from the -mm tree. Its filename was
fat-fix-uninitialized-variable.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: OGAWA Hirofumi <hirofumi(a)mail.parknet.co.jp>
Subject: fat: fix uninitialized variable
Date: Fri, 04 Oct 2024 15:03:49 +0900
syszbot produced this with a corrupted fs image. In theory, however an IO
error would trigger this also.
This affects just an error report, so should not be a serious error.
Link: https://lkml.kernel.org/r/87r08wjsnh.fsf@mail.parknet.co.jp
Link: https://lkml.kernel.org/r/66ff2c95.050a0220.49194.03e9.GAE@google.com
Signed-off-by: OGAWA Hirofumi <hirofumi(a)mail.parknet.co.jp>
Reported-by: syzbot+ef0d7bc412553291aa86(a)syzkaller.appspotmail.com
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/fat/namei_vfat.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/fs/fat/namei_vfat.c~fat-fix-uninitialized-variable
+++ a/fs/fat/namei_vfat.c
@@ -1037,7 +1037,7 @@ error_inode:
if (corrupt < 0) {
fat_fs_error(new_dir->i_sb,
"%s: Filesystem corrupted (i_pos %lld)",
- __func__, sinfo.i_pos);
+ __func__, new_i_pos);
}
goto out;
}
_
Patches currently in -mm which might be from hirofumi(a)mail.parknet.co.jp are
The LG Gram Pro 16 2-in-1 (2024) the 16T90SP has its keybopard IRQ (1)
described as ActiveLow in the DSDT, which the kernel overrides to EdgeHigh
which breaks the keyboard.
Add the 16T90SP to the irq1_level_low_skip_override[] quirk table to fix
this.
Reported-by: Dirk Holten <dirk.holten(a)gmx.de>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219382
Cc: stable(a)vger.kernel.org
Suggested-by: Dirk Holten <dirk.holten(a)gmx.de>
Signed-off-by: Christian Heusel <christian(a)heusel.eu>
---
Note that I do not have the relevant hardware since I'm sending in this
quirk at the request of someone else.
Also does this change need a "Fixes: ..." tag?
---
drivers/acpi/resource.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c
index 129bceb1f4a27df93439bcefdb27fd9c91258028..dd6249fb76c24f08db4149883be4548130d0ef1e 100644
--- a/drivers/acpi/resource.c
+++ b/drivers/acpi/resource.c
@@ -502,6 +502,11 @@ static const struct dmi_system_id irq1_level_low_skip_override[] = {
DMI_MATCH(DMI_SYS_VENDOR, "LG Electronics"),
DMI_MATCH(DMI_BOARD_NAME, "17U70P"),
},
+ /* LG Electronics 16T90SP */
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "LG Electronics"),
+ DMI_MATCH(DMI_BOARD_NAME, "16T90SP"),
+ },
},
{ }
};
---
base-commit: 8e929cb546ee42c9a61d24fae60605e9e3192354
change-id: 20241016-lg-gram-pro-keyboard-9a9d8b9aa647
Best regards,
--
Christian Heusel <christian(a)heusel.eu>
There is a probability that the host machine will also restart
when the virtual machine is restarting.
Commit ad362fe07fec ("KVM: arm64: vgic-its: Avoid potential UAF
in LPI translation cache") released the reference count of an IRQ
when it shouldn't have. This led to a situation where, when the
system finally released the IRQ, it found that the structure had
already been freed, triggering a
'refcount_t: underflow; use-after-free' error.
In fact, the function "vgic_put_irq" should be called by
"vgic_its_inject_cached_translation" instead of
"vgic_its_trigger_msi".
Call trace:
its_free_ite+0x90/0xa0
vgic_its_free_device+0x3c/0xa0
vgic_its_destroy+0x4c/0xb8
kvm_put_kvm+0x214/0x358
kvm_vcpu_release+0x24/0x38
__fput+0x84/0x278
____fput+0x20/0x30
task_work_run+0xcc/0x190
do_exit+0x36c/0xa88
do_group_exit+0x4c/0xb8
__arm64_sys_exit_group+0x24/0x28
invoke_syscall+0x54/0x120
el0_svc_common.constprop.4+0x16c/0x1f0
do_el0_svc+0x34/0xb0
el0_svc+0x1c/0x28
el0_sync_handler+0x8c/0xb0
el0_sync+0x148/0x180
Fixes: ad362fe07fec ("KVM: arm64: vgic-its: Avoid potential UAF in LPI translation cache")
Cc: stable(a)vger.kernel.org
Signed-off-by: Wenyao Hai <haiwenyao(a)uniontech.com>
Signed-off-by: WangYuli <wangyuli(a)uniontech.com>
---
arch/arm64/kvm/vgic/vgic-its.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index ba945ba78cc7..fb5f57cbab42 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -679,6 +679,7 @@ static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->pending_latch = true;
vgic_queue_irq_unlock(kvm, irq, flags);
+ vgic_put_irq(kvm, irq);
return 0;
}
@@ -697,7 +698,6 @@ int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi)
raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->pending_latch = true;
vgic_queue_irq_unlock(kvm, irq, flags);
- vgic_put_irq(kvm, irq);
return 0;
}
--
2.45.2
With these features are enabled, the EEVDF scheduler introduces a large
performance degradation, observed in multiple database tests on kernel
versions using EEVDF, across multiple architectures (x86, aarch64, amd64)
and CPU generations.
Disable the features to minimize default performance impact.
Cc: <stable(a)vger.kernel.org> # 6.6.x
Fixes: 86bfbb7ce4f6 ("sched/fair: Add lag based placement")
Fixes: 63304558ba5d ("sched/eevdf: Curb wakeup-preemption")
Signed-off-by: Cristian Prundeanu <cpru(a)amazon.com>
---
kernel/sched/features.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index a3d331dd2d8f..8a5ca80665b3 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -4,7 +4,7 @@
* Using the avg_vruntime, do the right thing and preserve lag across
* sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
*/
-SCHED_FEAT(PLACE_LAG, true)
+SCHED_FEAT(PLACE_LAG, false)
/*
* Give new tasks half a slice to ease into the competition.
*/
@@ -17,7 +17,7 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true)
* Inhibit (wakeup) preemption until the current task has either matched the
* 0-lag point or until is has exhausted it's slice.
*/
-SCHED_FEAT(RUN_TO_PARITY, true)
+SCHED_FEAT(RUN_TO_PARITY, false)
/*
* Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for
* current.
--
2.40.1
Hi, Conor
Thanks for your patch.
> From: Conor Dooley <conor.dooley(a)microchip.com>
>
> Aurelien reported probe failures due to the csi node being enabled without
> having a camera attached to it. A camera was in the initial submissions, but
> was removed from the dts, as it had not actually been present on the board,
> but was from an addon board used by the developer of the relevant drivers.
> The non-camera pipeline nodes were not disabled when this happened and
> the probe failures are problematic for Debian. Disable them.
>
> CC: stable(a)vger.kernel.org
> Fixes: 28ecaaa5af192 ("riscv: dts: starfive: jh7110: Add camera subsystem
> nodes")
Here you write it in 13 characters, should be "Fixes: 28ecaaa5af19 ..."
Best Regards
Changhuang.
> Closes: https://lore.kernel.org/all/Zw1-vcN4CoVkfLjU@aurel32.net/
> Reported-by: Aurelien Jarno <aurelien(a)aurel32.net>
> Signed-off-by: Conor Dooley <conor.dooley(a)microchip.com>
> ---
> CC: Emil Renner Berthing <kernel(a)esmil.dk>
> CC: Rob Herring <robh(a)kernel.org>
> CC: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
> CC: Conor Dooley <conor+dt(a)kernel.org>
> CC: Changhuang Liang <changhuang.liang(a)starfivetech.com>
> CC: devicetree(a)vger.kernel.org
> CC: linux-riscv(a)lists.infradead.org
> CC: linux-kernel(a)vger.kernel.org
> ---
> arch/riscv/boot/dts/starfive/jh7110-common.dtsi | 2 --
> 1 file changed, 2 deletions(-)
>
> diff --git a/arch/riscv/boot/dts/starfive/jh7110-common.dtsi
> b/arch/riscv/boot/dts/starfive/jh7110-common.dtsi
> index c7771b3b64758..d6c55f1cc96a9 100644
> --- a/arch/riscv/boot/dts/starfive/jh7110-common.dtsi
> +++ b/arch/riscv/boot/dts/starfive/jh7110-common.dtsi
> @@ -128,7 +128,6 @@ &camss {
> assigned-clocks = <&ispcrg JH7110_ISPCLK_DOM4_APB_FUNC>,
> <&ispcrg JH7110_ISPCLK_MIPI_RX0_PXL>;
> assigned-clock-rates = <49500000>, <198000000>;
> - status = "okay";
>
> ports {
> #address-cells = <1>;
> @@ -151,7 +150,6 @@ camss_from_csi2rx: endpoint { &csi2rx {
> assigned-clocks = <&ispcrg JH7110_ISPCLK_VIN_SYS>;
> assigned-clock-rates = <297000000>;
> - status = "okay";
>
> ports {
> #address-cells = <1>;
> --
> 2.45.2
The patch titled
Subject: ocfs2: pass u64 to ocfs2_truncate_inline maybe overflow
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
ocfs2-pass-u64-to-ocfs2_truncate_inline-maybe-overflow.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Edward Adam Davis <eadavis(a)qq.com>
Subject: ocfs2: pass u64 to ocfs2_truncate_inline maybe overflow
Date: Wed, 16 Oct 2024 19:43:47 +0800
Syzbot reported a kernel BUG in ocfs2_truncate_inline. There are two
reasons for this: first, the parameter value passed is greater than
ocfs2_max_inline_data_with_xattr, second, the start and end parameters of
ocfs2_truncate_inline are "unsigned int".
So, we need to add a sanity check for byte_start and byte_len right before
ocfs2_truncate_inline() in ocfs2_remove_inode_range(), if they are greater
than ocfs2_max_inline_data_with_xattr return -EINVAL.
Link: https://lkml.kernel.org/r/tencent_D48DB5122ADDAEDDD11918CFB68D93258C07@qq.c…
Fixes: 1afc32b95233 ("ocfs2: Write support for inline data")
Signed-off-by: Edward Adam Davis <eadavis(a)qq.com>
Reported-by: syzbot+81092778aac03460d6b7(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=81092778aac03460d6b7
Reviewed-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Cc: Mark Fasheh <mark(a)fasheh.com>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Gang He <ghe(a)suse.com>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/file.c | 8 ++++++++
1 file changed, 8 insertions(+)
--- a/fs/ocfs2/file.c~ocfs2-pass-u64-to-ocfs2_truncate_inline-maybe-overflow
+++ a/fs/ocfs2/file.c
@@ -1784,6 +1784,14 @@ int ocfs2_remove_inode_range(struct inod
return 0;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di);
+
+ if (byte_start > id_count || byte_start + byte_len > id_count) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
byte_start + byte_len, 0);
if (ret) {
_
Patches currently in -mm which might be from eadavis(a)qq.com are
ocfs2-pass-u64-to-ocfs2_truncate_inline-maybe-overflow.patch
Hey,
Would you be interested in acquiring the attendees list of NEPCON NAGOYA 2024?
List contains: Names, Titles, Phone Numbers, Company Details, and more…
Interested? Let me know so that I’ll send you the pricing for the same.
Kind Regards,
Jane Wilkins
Marketing Executive
If you do not wish to receive our emails, please reply with "Not Interested."
Jeongjun Park <aha310510(a)gmail.com> wrote:
>
> I got the following KCSAN report during syzbot testing:
>
> ==================================================================
> BUG: KCSAN: data-race in generic_fillattr / inode_set_ctime_current
>
> write to 0xffff888102eb3260 of 4 bytes by task 6565 on cpu 1:
> inode_set_ctime_to_ts include/linux/fs.h:1638 [inline]
> inode_set_ctime_current+0x169/0x1d0 fs/inode.c:2626
> shmem_mknod+0x117/0x180 mm/shmem.c:3443
> shmem_create+0x34/0x40 mm/shmem.c:3497
> lookup_open fs/namei.c:3578 [inline]
> open_last_lookups fs/namei.c:3647 [inline]
> path_openat+0xdbc/0x1f00 fs/namei.c:3883
> do_filp_open+0xf7/0x200 fs/namei.c:3913
> do_sys_openat2+0xab/0x120 fs/open.c:1416
> do_sys_open fs/open.c:1431 [inline]
> __do_sys_openat fs/open.c:1447 [inline]
> __se_sys_openat fs/open.c:1442 [inline]
> __x64_sys_openat+0xf3/0x120 fs/open.c:1442
> x64_sys_call+0x1025/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:258
> do_syscall_x64 arch/x86/entry/common.c:52 [inline]
> do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83
> entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> read to 0xffff888102eb3260 of 4 bytes by task 3498 on cpu 0:
> inode_get_ctime_nsec include/linux/fs.h:1623 [inline]
> inode_get_ctime include/linux/fs.h:1629 [inline]
> generic_fillattr+0x1dd/0x2f0 fs/stat.c:62
> shmem_getattr+0x17b/0x200 mm/shmem.c:1157
> vfs_getattr_nosec fs/stat.c:166 [inline]
> vfs_getattr+0x19b/0x1e0 fs/stat.c:207
> vfs_statx_path fs/stat.c:251 [inline]
> vfs_statx+0x134/0x2f0 fs/stat.c:315
> vfs_fstatat+0xec/0x110 fs/stat.c:341
> __do_sys_newfstatat fs/stat.c:505 [inline]
> __se_sys_newfstatat+0x58/0x260 fs/stat.c:499
> __x64_sys_newfstatat+0x55/0x70 fs/stat.c:499
> x64_sys_call+0x141f/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:263
> do_syscall_x64 arch/x86/entry/common.c:52 [inline]
> do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83
> entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> value changed: 0x2755ae53 -> 0x27ee44d3
>
> Since there is no special protection when shmem_getattr() calls
> generic_fillattr(), data-race occurs by functions such as shmem_unlink()
> or shmem_mknod(). This can cause unexpected results, so commenting it out
> is not enough.
>
> Therefore, when calling generic_fillattr() from shmem_getattr(), it is
> appropriate to protect the inode using inode_lock_shared() and
> inode_unlock_shared() to prevent data-race.
>
Cc: stable(a)vger.kernel.org
I think this patch should be applied from next rc version and also stable
version. When calling generic_fillattr(), if you don't hold read lock,
data-race will occur in inode member variables, which can cause unexpected
behavior. This problem is also present in several stable versions, so I think
it should be fixed as soon as possible.
Regards,
Jeongjun Park
> Reported-by: syzbot <syzkaller(a)googlegroups.com>
> Fixes: 44a30220bc0a ("shmem: recalculate file inode when fstat")
> Signed-off-by: Jeongjun Park <aha310510(a)gmail.com>
> ---
> mm/shmem.c | 2 ++
> 1 file changed, 2 insertions(+)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 5a77acf6ac6a..9beeb47c3743 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1154,7 +1154,9 @@ static int shmem_getattr(struct mnt_idmap *idmap,
> stat->attributes_mask |= (STATX_ATTR_APPEND |
> STATX_ATTR_IMMUTABLE |
> STATX_ATTR_NODUMP);
> + inode_lock_shared(inode);
> generic_fillattr(idmap, request_mask, inode, stat);
> + inode_unlock_shared(inode);
>
> if (shmem_is_huge(inode, 0, false, NULL, 0))
> stat->blksize = HPAGE_PMD_SIZE;
> --
The patch titled
Subject: mm/gup: stop leaking pinned pages in low memory conditions
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-gup-stop-leaking-pinned-pages-in-low-memory-conditions.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: John Hubbard <jhubbard(a)nvidia.com>
Subject: mm/gup: stop leaking pinned pages in low memory conditions
Date: Wed, 16 Oct 2024 13:22:42 -0700
If a driver tries to call any of the pin_user_pages*(FOLL_LONGTERM) family
of functions, and requests "too many" pages, then the call will
erroneously leave pages pinned. This is visible in user space as an
actual memory leak.
Repro is trivial: just make enough pin_user_pages(FOLL_LONGTERM) calls to
exhaust memory.
The root cause of the problem is this sequence, within
__gup_longterm_locked():
__get_user_pages_locked()
rc = check_and_migrate_movable_pages()
...which gets retried in a loop. The loop error handling is incomplete,
clearly due to a somewhat unusual and complicated tri-state error API.
But anyway, if -ENOMEM, or in fact, any unexpected error is returned from
check_and_migrate_movable_pages(), then __gup_longterm_locked() happily
returns the error, while leaving the pages pinned.
In the failed case, which is an app that requests (via a device driver)
30720000000 bytes to be pinned, and then exits, I see this:
$ grep foll /proc/vmstat
nr_foll_pin_acquired 7502048
nr_foll_pin_released 2048
And after applying this patch, it returns to balanced pins:
$ grep foll /proc/vmstat
nr_foll_pin_acquired 7502048
nr_foll_pin_released 7502048
Fix this by unpinning the pages that __get_user_pages_locked() has
pinned, in such error cases.
Link: https://lkml.kernel.org/r/20241016202242.456953-1-jhubbard@nvidia.com
Fixes: 24a95998e9ba ("mm/gup.c: simplify and fix check_and_migrate_movable_pages() return codes")
Signed-off-by: John Hubbard <jhubbard(a)nvidia.com>
Cc: Alistair Popple <apopple(a)nvidia.com>
Cc: Shigeru Yoshida <syoshida(a)redhat.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Pasha Tatashin <pasha.tatashin(a)soleen.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/gup.c | 11 +++++++++++
1 file changed, 11 insertions(+)
--- a/mm/gup.c~mm-gup-stop-leaking-pinned-pages-in-low-memory-conditions
+++ a/mm/gup.c
@@ -2492,6 +2492,17 @@ static long __gup_longterm_locked(struct
/* FOLL_LONGTERM implies FOLL_PIN */
rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);
+
+ /*
+ * The __get_user_pages_locked() call happens before we know
+ * that whether it's possible to successfully complete the whole
+ * operation. To compensate for this, if we get an unexpected
+ * error (such as -ENOMEM) then we must unpin everything, before
+ * erroring out.
+ */
+ if (rc != -EAGAIN && rc != 0)
+ unpin_user_pages(pages, nr_pinned_pages);
+
} while (rc == -EAGAIN);
memalloc_pin_restore(flags);
return rc ? rc : nr_pinned_pages;
_
Patches currently in -mm which might be from jhubbard(a)nvidia.com are
mm-gup-stop-leaking-pinned-pages-in-low-memory-conditions.patch
kaslr-rename-physmem_end-and-physmem_end-to-direct_map_physmem_end.patch
The patch titled
Subject: x86/traps: move kmsan check after instrumentation_begin
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
x86-traps-move-kmsan-check-after-instrumentation_begin.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Sabyrzhan Tasbolatov <snovitoll(a)gmail.com>
Subject: x86/traps: move kmsan check after instrumentation_begin
Date: Wed, 16 Oct 2024 20:24:07 +0500
During x86_64 kernel build with CONFIG_KMSAN, the objtool warns following:
AR built-in.a
AR vmlinux.a
LD vmlinux.o
vmlinux.o: warning: objtool: handle_bug+0x4: call to
kmsan_unpoison_entry_regs() leaves .noinstr.text section
OBJCOPY modules.builtin.modinfo
GEN modules.builtin
MODPOST Module.symvers
CC .vmlinux.export.o
Moving kmsan_unpoison_entry_regs() _after_ instrumentation_begin() fixes
the warning.
There is decode_bug(regs->ip, &imm) is left before KMSAN unpoisoining, but
it has the return condition and if we include it after
instrumentation_begin() it results the warning "return with
instrumentation enabled", hence, I'm concerned that regs will not be KMSAN
unpoisoned if `ud_type == BUG_NONE` is true.
Link: https://lkml.kernel.org/r/20241016152407.3149001-1-snovitoll@gmail.com
Fixes: ba54d194f8da ("x86/traps: avoid KMSAN bugs originating from handle_bug()")
Signed-off-by: Sabyrzhan Tasbolatov <snovitoll(a)gmail.com>
Reviewed-by: Alexander Potapenko <glider(a)google.com>
Cc: Borislav Petkov (AMD) <bp(a)alien8.de>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/x86/kernel/traps.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
--- a/arch/x86/kernel/traps.c~x86-traps-move-kmsan-check-after-instrumentation_begin
+++ a/arch/x86/kernel/traps.c
@@ -261,12 +261,6 @@ static noinstr bool handle_bug(struct pt
int ud_type;
u32 imm;
- /*
- * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
- * is a rare case that uses @regs without passing them to
- * irqentry_enter().
- */
- kmsan_unpoison_entry_regs(regs);
ud_type = decode_bug(regs->ip, &imm);
if (ud_type == BUG_NONE)
return handled;
@@ -276,6 +270,12 @@ static noinstr bool handle_bug(struct pt
*/
instrumentation_begin();
/*
+ * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
+ * is a rare case that uses @regs without passing them to
+ * irqentry_enter().
+ */
+ kmsan_unpoison_entry_regs(regs);
+ /*
* Since we're emulating a CALL with exceptions, restore the interrupt
* state to what it was at the exception site.
*/
_
Patches currently in -mm which might be from snovitoll(a)gmail.com are
x86-traps-move-kmsan-check-after-instrumentation_begin.patch
mm-kasan-kmsan-copy_from-to_kernel_nofault.patch
-Wenum-enum-conversion and -Wenum-compare-conditional were strengthened
in clang-19 to warn in C mode, which caused the kernel to move them to
W=1 in commit 75b5ab134bb5 ("kbuild: Move
-Wenum-{compare-conditional,enum-conversion} into W=1") because there
were numerous instances of each that would break builds with -Werror.
Unfortunately, this is not a full solution, as more and more developers,
subsystems, and distributors are building with W=1 as well, so they
continue to see the numerous instances of these warnings.
Since the move to W=1, there have not been many new instances that have
appeared through various build reports and the ones that have appeared
seem to be following similar existing patterns, suggesting that most
instances of these warnings will not be real issues. The only
alternatives for silencing these warnings are adding casts (which is
generally seen as an ugly practice) or refactoring the enums to macro
defines or a unified enum (which may be undesirable because of type
safety in other parts of the code).
Disable the warnings altogether so that W=1 users do not see them.
Cc: stable(a)vger.kernel.org
Fixes: 75b5ab134bb5 ("kbuild: Move -Wenum-{compare-conditional,enum-conversion} into W=1")
Link: https://lore.kernel.org/ZwRA9SOcOjjLJcpi@google.com/
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
---
I am open to discussion around whether or not to disable this entirely
but I think it needs to be out of W=1.
---
scripts/Makefile.extrawarn | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn
index 1d13cecc7cc7808610e635ddc03476cf92b3a8c1..3f124c80bee5133899047c07bc05e3173ee4c7f0 100644
--- a/scripts/Makefile.extrawarn
+++ b/scripts/Makefile.extrawarn
@@ -31,6 +31,8 @@ KBUILD_CFLAGS-$(CONFIG_CC_NO_ARRAY_BOUNDS) += -Wno-array-bounds
ifdef CONFIG_CC_IS_CLANG
# The kernel builds with '-std=gnu11' so use of GNU extensions is acceptable.
KBUILD_CFLAGS += -Wno-gnu
+KBUILD_CFLAGS += -Wno-enum-compare-conditional
+KBUILD_CFLAGS += -Wno-enum-enum-conversion
else
# gcc inanely warns about local variables called 'main'
@@ -129,8 +131,6 @@ endif
KBUILD_CFLAGS += $(call cc-disable-warning, pointer-to-enum-cast)
KBUILD_CFLAGS += -Wno-tautological-constant-out-of-range-compare
KBUILD_CFLAGS += $(call cc-disable-warning, unaligned-access)
-KBUILD_CFLAGS += -Wno-enum-compare-conditional
-KBUILD_CFLAGS += -Wno-enum-enum-conversion
endif
endif
---
base-commit: 8e929cb546ee42c9a61d24fae60605e9e3192354
change-id: 20241016-disable-two-clang-enum-warnings-e7994d44f948
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
Add missing Broadcast_AND region to the LLCC block for x1e80100,
as the LLCC version on this platform is 4.1 and it provides the region.
This also fixes the following error caused by the missing region:
[ 3.797768] qcom-llcc 25000000.system-cache-controller: error -EINVAL: invalid resource (null)
This error started showing up only after the new regmap region called
Broadcast_AND that has been added to the llcc-qcom driver.
Cc: stable(a)vger.kernel.org # 6.11: 055afc34fd21: soc: qcom: llcc: Add regmap for Broadcast_AND region
Fixes: af16b00578a7 ("arm64: dts: qcom: Add base X1E80100 dtsi and the QCP dts")
Signed-off-by: Abel Vesa <abel.vesa(a)linaro.org>
---
Changes in v2:
- fixed subject line to say x1e80100 instead of sm8450
- mentioned the reason why the new error is showing up
and how it is related to the llcc-qcom driver
- cc'ed stable with patch dependency for cherry-picking
- Link to v1: https://lore.kernel.org/r/20240718-x1e80100-dts-llcc-add-broadcastand_regio…
---
arch/arm64/boot/dts/qcom/x1e80100.dtsi | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/boot/dts/qcom/x1e80100.dtsi b/arch/arm64/boot/dts/qcom/x1e80100.dtsi
index 0e6802c1d2d8375987c614ec69c440e2f38d25c6..fbf1acf8b0d84a2d2c723785242a65f47e63340b 100644
--- a/arch/arm64/boot/dts/qcom/x1e80100.dtsi
+++ b/arch/arm64/boot/dts/qcom/x1e80100.dtsi
@@ -6093,7 +6093,8 @@ system-cache-controller@25000000 {
<0 0x25a00000 0 0x200000>,
<0 0x25c00000 0 0x200000>,
<0 0x25e00000 0 0x200000>,
- <0 0x26000000 0 0x200000>;
+ <0 0x26000000 0 0x200000>,
+ <0 0x26200000 0 0x200000>;
reg-names = "llcc0_base",
"llcc1_base",
"llcc2_base",
@@ -6102,7 +6103,8 @@ system-cache-controller@25000000 {
"llcc5_base",
"llcc6_base",
"llcc7_base",
- "llcc_broadcast_base";
+ "llcc_broadcast_base",
+ "llcc_broadcast_and_base";
interrupts = <GIC_SPI 266 IRQ_TYPE_LEVEL_HIGH>;
};
---
base-commit: d61a00525464bfc5fe92c6ad713350988e492b88
change-id: 20240718-x1e80100-dts-llcc-add-broadcastand_region-797413ee2a8f
Best regards,
--
Abel Vesa <abel.vesa(a)linaro.org>
This problem reported by Clement LE GOFFIC manifest when
using CONFIG_KASAN_IN_VMALLOC and VMAP_STACK:
https://lore.kernel.org/linux-arm-kernel/a1a1d062-f3a2-4d05-9836-3b098de9db…
After some analysis it seems we are missing to sync the
VMALLOC shadow memory in top level PGD to all CPUs.
Add some code to perform this sync, and the bug appears
to go away.
As suggested by Ard, also perform a dummy read from the
shadow memory of the new VMAP_STACK in the low level
assembly.
Signed-off-by: Linus Walleij <linus.walleij(a)linaro.org>
---
Linus Walleij (2):
ARM: ioremap: Flush PGDs for VMALLOC shadow
ARM: entry: Do a dummy read from VMAP shadow
arch/arm/kernel/entry-armv.S | 8 ++++++++
arch/arm/mm/ioremap.c | 7 +++++++
2 files changed, 15 insertions(+)
---
base-commit: 9852d85ec9d492ebef56dc5f229416c925758edc
change-id: 20241015-arm-kasan-vmalloc-crash-fcbd51416457
Best regards,
--
Linus Walleij <linus.walleij(a)linaro.org>
When copying a namespace we won't have added the new copy into the
namespace rbtree until after the copy succeeded. Calling free_mnt_ns()
will try to remove the copy from the rbtree which is invalid. Simply
free the namespace skeleton directly.
Fixes: 1901c92497bd ("fs: keep an index of current mount namespaces")
Cc: stable(a)vger.kernel.org # v6.11+
Reported-by: Brad Spengler <spender(a)grsecurity.net>
Tested-by: Brad Spengler <spender(a)grsecurity.net>
Suggested-by: Brad Spengler <spender(a)grsecurity.net>
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
---
In vfs.fixes unless I hear objections.
---
fs/namespace.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 93c377816d75..d26f5e6d2ca3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3944,7 +3944,9 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
namespace_unlock();
- free_mnt_ns(new_ns);
+ ns_free_inum(&new_ns->ns);
+ dec_mnt_namespaces(new_ns->ucounts);
+ mnt_ns_release(new_ns);
return ERR_CAST(new);
}
if (user_ns != ns->user_ns) {
--
2.45.2
iowarrior_read() uses the iowarrior dev structure, but does not use any
lock on the structure. This can cause various bugs including data-races,
so it is more appropriate to use a mutex lock to safely protect the
iowarrior dev structure. When using a mutex lock, you should split the
branch to prevent blocking when the O_NONBLOCK flag is set.
In addition, it is unnecessary to check for NULL on the iowarrior dev
structure obtained by reading file->private_data. Therefore, it is
better to remove the check.
Cc: stable(a)vger.kernel.org
Fixes: 946b960d13c1 ("USB: add driver for iowarrior devices.")
Signed-off-by: Jeongjun Park <aha310510(a)gmail.com>
---
v1 -> v2: Added cc tag and change log
drivers/usb/misc/iowarrior.c | 46 ++++++++++++++++++++++++++++--------
1 file changed, 36 insertions(+), 10 deletions(-)
diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c
index 6d28467ce352..a513766b4985 100644
--- a/drivers/usb/misc/iowarrior.c
+++ b/drivers/usb/misc/iowarrior.c
@@ -277,28 +277,45 @@ static ssize_t iowarrior_read(struct file *file, char __user *buffer,
struct iowarrior *dev;
int read_idx;
int offset;
+ int retval;
dev = file->private_data;
+ if (file->f_flags & O_NONBLOCK) {
+ retval = mutex_trylock(&dev->mutex);
+ if (!retval)
+ return -EAGAIN;
+ } else {
+ retval = mutex_lock_interruptible(&dev->mutex);
+ if (retval)
+ return -ERESTARTSYS;
+ }
+
/* verify that the device wasn't unplugged */
- if (!dev || !dev->present)
- return -ENODEV;
+ if (!dev->present) {
+ retval = -ENODEV;
+ goto exit;
+ }
dev_dbg(&dev->interface->dev, "minor %d, count = %zd\n",
dev->minor, count);
/* read count must be packet size (+ time stamp) */
if ((count != dev->report_size)
- && (count != (dev->report_size + 1)))
- return -EINVAL;
+ && (count != (dev->report_size + 1))) {
+ retval = -EINVAL;
+ goto exit;
+ }
/* repeat until no buffer overrun in callback handler occur */
do {
atomic_set(&dev->overflow_flag, 0);
if ((read_idx = read_index(dev)) == -1) {
/* queue empty */
- if (file->f_flags & O_NONBLOCK)
- return -EAGAIN;
+ if (file->f_flags & O_NONBLOCK) {
+ retval = -EAGAIN;
+ goto exit;
+ }
else {
//next line will return when there is either new data, or the device is unplugged
int r = wait_event_interruptible(dev->read_wait,
@@ -309,28 +326,37 @@ static ssize_t iowarrior_read(struct file *file, char __user *buffer,
-1));
if (r) {
//we were interrupted by a signal
- return -ERESTART;
+ retval = -ERESTART;
+ goto exit;
}
if (!dev->present) {
//The device was unplugged
- return -ENODEV;
+ retval = -ENODEV;
+ goto exit;
}
if (read_idx == -1) {
// Can this happen ???
- return 0;
+ retval = 0;
+ goto exit;
}
}
}
offset = read_idx * (dev->report_size + 1);
if (copy_to_user(buffer, dev->read_queue + offset, count)) {
- return -EFAULT;
+ retval = -EFAULT;
+ goto exit;
}
} while (atomic_read(&dev->overflow_flag));
read_idx = ++read_idx == MAX_INTERRUPT_BUFFER ? 0 : read_idx;
atomic_set(&dev->read_idx, read_idx);
+ mutex_unlock(&dev->mutex);
return count;
+
+exit:
+ mutex_unlock(&dev->mutex);
+ return retval;
}
/*
--
On 11.10.24 10:48, Joel GUITTET wrote:
>
> I faced some issue related to scaling frequency on ZynqMP device
> using v5.15.167 kernel. As an exemple setting the scaling frequency
> below show it's not properly set:
>
> cat /sys/devices/system/cpu/cpufreq/policy0/
> scaling_available_frequencies 299999 399999 599999 1199999
>
> echo 399999 > /sys/devices/system/cpu/cpufreq/policy0/
> scaling_setspeed
>
> cat /sys/devices/system/cpu/cpufreq/policy0/scaling_cur_freq 399999
>
> cat /sys/devices/system/cpu/cpufreq/policy0/cpuinfo_cur_freq 299999
> ====> Should be 399999
>
> After analysis of this issue with the help of Xilinx, it appears
> that a commit was backported on the 5.15.y branch, but probably it
> should not, or not as is. The commit is
> 9117fc44fd3a9538261e530ba5a022dfc9519620 modifying drivers/clk/
> zynqmp/divider.c.
FWIW, that is 1fe15be1fb6135 ("drivers: clk: zynqmp: update divider
round rate logic") [v6.8-rc1].
> Is anybody reading this message able to answer why it was
> backported ?
Looks like because it fixes a bug. I CCed the original author and those
that handled the patch, maybe they can help us out and tell us what's
the best strategy forward here.
> The information I have until now is that it is intended
> recent kernel version. Dependencies for this modification is
> currently under clarification with Xilinx (maybe another commit to
> backport).
>
> By the way, reverting this commit fix the issue shown above.
Does 6.12-rc work fine for you? Because if not, we should fix the
problem there.
Ciao, Thorsten
The NVMe regulator has been left enabled by the boot firmware. Mark it
as such to avoid disabling the regulator temporarily during boot.
Fixes: eb57cbe730d1 ("arm64: dts: qcom: x1e80100: Describe the PCIe 6a resources")
Cc: stable(a)vger.kernel.org # 6.11
Cc: Abel Vesa <abel.vesa(a)linaro.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
arch/arm64/boot/dts/qcom/x1e80100-qcp.dts | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts
index 1c3a6a7b3ed6..5ef030c60abe 100644
--- a/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts
+++ b/arch/arm64/boot/dts/qcom/x1e80100-qcp.dts
@@ -253,6 +253,8 @@ vreg_nvme: regulator-nvme {
pinctrl-names = "default";
pinctrl-0 = <&nvme_reg_en>;
+
+ regulator-boot-on;
};
};
--
2.45.2
The NVMe regulator has been left enabled by the boot firmware. Mark it
as such to avoid disabling the regulator temporarily during boot.
Fixes: d0e2f8f62dff ("arm64: dts: qcom: Add device tree for ASUS Vivobook S 15")
Cc: stable(a)vger.kernel.org # 6.11
Cc: Xilin Wu <wuxilin123(a)gmail.com>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts
index 20616bd4aa6c..fb4a48a1e2a8 100644
--- a/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts
+++ b/arch/arm64/boot/dts/qcom/x1e80100-asus-vivobook-s15.dts
@@ -134,6 +134,8 @@ vreg_nvme: regulator-nvme {
pinctrl-0 = <&nvme_reg_en>;
pinctrl-names = "default";
+
+ regulator-boot-on;
};
};
--
2.45.2
The NVMe regulator has been left enabled by the boot firmware. Mark it
as such to avoid disabling the regulator temporarily during boot.
Fixes: eb57cbe730d1 ("arm64: dts: qcom: x1e80100: Describe the PCIe 6a resources")
Cc: stable(a)vger.kernel.org # 6.11
Cc: Abel Vesa <abel.vesa(a)linaro.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
arch/arm64/boot/dts/qcom/x1e80100-crd.dts | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts
index 10b28d870f08..5d2eec8590ce 100644
--- a/arch/arm64/boot/dts/qcom/x1e80100-crd.dts
+++ b/arch/arm64/boot/dts/qcom/x1e80100-crd.dts
@@ -300,6 +300,8 @@ vreg_nvme: regulator-nvme {
pinctrl-names = "default";
pinctrl-0 = <&nvme_reg_en>;
+
+ regulator-boot-on;
};
vreg_wwan: regulator-wwan {
--
2.45.2
The environ pointer itself is never NULL, this is guaranteed by crt.h.
However if the environment is empty, environ will point to a NULL
pointer.
While this case will be checked by the loop later, this only happens
after the first loop iteration.
To avoid reading invalid memory inside the loop, fix the test that
checks for an empty environment.
Fixes: 077d0a392446 ("tools/nolibc/stdlib: add a simple getenv() implementation")
Cc: stable(a)vger.kernel.org
Signed-off-by: Thomas Weißschuh <thomas.weissschuh(a)linutronix.de>
---
tools/include/nolibc/stdlib.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h
index 75aa273c23a6153db6a32facaea16457a522703b..c31967378cf1f699283d801487c1a91d17e4d1ce 100644
--- a/tools/include/nolibc/stdlib.h
+++ b/tools/include/nolibc/stdlib.h
@@ -90,7 +90,7 @@ char *getenv(const char *name)
{
int idx, i;
- if (environ) {
+ if (*environ) {
for (idx = 0; environ[idx]; idx++) {
for (i = 0; name[i] && name[i] == environ[idx][i];)
i++;
---
base-commit: 2f87d0916ce0d2925cedbc9e8f5d6291ba2ac7b2
change-id: 20241016-nolibc-getenv-f99f4d3df2cd
Best regards,
--
Thomas Weißschuh <thomas.weissschuh(a)linutronix.de>
Treat each completed full size write to /dev/ttyDBC0 as a separate usb
transfer. Make sure the size of the TRBs matches the size of the tty
write by first queuing as many max packet size TRBs as possible up to
the last TRB which will be cut short to match the size of the tty write.
This solves an issue where userspace writes several transfers back to
back via /dev/ttyDBC0 into a kfifo before dbgtty can find available
request to turn that kfifo data into TRBs on the transfer ring.
The boundary between transfer was lost as xhci-dbgtty then turned
everyting in the kfifo into as many 'max packet size' TRBs as possible.
DbC would then send more data to the host than intended for that
transfer, causing host to issue a babble error.
Refuse to write more data to kfifo until previous tty write data is
turned into properly sized TRBs with data size boundaries matching tty
write size
Tested-by: Uday M Bhat <uday.m.bhat(a)intel.com>
Tested-by: Łukasz Bartosik <ukaszb(a)chromium.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
---
drivers/usb/host/xhci-dbgcap.h | 1 +
drivers/usb/host/xhci-dbgtty.c | 55 ++++++++++++++++++++++++++++++----
2 files changed, 51 insertions(+), 5 deletions(-)
diff --git a/drivers/usb/host/xhci-dbgcap.h b/drivers/usb/host/xhci-dbgcap.h
index 8ec813b6e9fd..9dc8f4d8077c 100644
--- a/drivers/usb/host/xhci-dbgcap.h
+++ b/drivers/usb/host/xhci-dbgcap.h
@@ -110,6 +110,7 @@ struct dbc_port {
struct tasklet_struct push;
struct list_head write_pool;
+ unsigned int tx_boundary;
bool registered;
};
diff --git a/drivers/usb/host/xhci-dbgtty.c b/drivers/usb/host/xhci-dbgtty.c
index b8e78867e25a..d719c16ea30b 100644
--- a/drivers/usb/host/xhci-dbgtty.c
+++ b/drivers/usb/host/xhci-dbgtty.c
@@ -24,6 +24,29 @@ static inline struct dbc_port *dbc_to_port(struct xhci_dbc *dbc)
return dbc->priv;
}
+static unsigned int
+dbc_kfifo_to_req(struct dbc_port *port, char *packet)
+{
+ unsigned int len;
+
+ len = kfifo_len(&port->port.xmit_fifo);
+
+ if (len == 0)
+ return 0;
+
+ len = min(len, DBC_MAX_PACKET);
+
+ if (port->tx_boundary)
+ len = min(port->tx_boundary, len);
+
+ len = kfifo_out(&port->port.xmit_fifo, packet, len);
+
+ if (port->tx_boundary)
+ port->tx_boundary -= len;
+
+ return len;
+}
+
static int dbc_start_tx(struct dbc_port *port)
__releases(&port->port_lock)
__acquires(&port->port_lock)
@@ -36,7 +59,7 @@ static int dbc_start_tx(struct dbc_port *port)
while (!list_empty(pool)) {
req = list_entry(pool->next, struct dbc_request, list_pool);
- len = kfifo_out(&port->port.xmit_fifo, req->buf, DBC_MAX_PACKET);
+ len = dbc_kfifo_to_req(port, req->buf);
if (len == 0)
break;
do_tty_wake = true;
@@ -200,14 +223,32 @@ static ssize_t dbc_tty_write(struct tty_struct *tty, const u8 *buf,
{
struct dbc_port *port = tty->driver_data;
unsigned long flags;
+ unsigned int written = 0;
spin_lock_irqsave(&port->port_lock, flags);
- if (count)
- count = kfifo_in(&port->port.xmit_fifo, buf, count);
- dbc_start_tx(port);
+
+ /*
+ * Treat tty write as one usb transfer. Make sure the writes are turned
+ * into TRB request having the same size boundaries as the tty writes.
+ * Don't add data to kfifo before previous write is turned into TRBs
+ */
+ if (port->tx_boundary) {
+ spin_unlock_irqrestore(&port->port_lock, flags);
+ return 0;
+ }
+
+ if (count) {
+ written = kfifo_in(&port->port.xmit_fifo, buf, count);
+
+ if (written == count)
+ port->tx_boundary = kfifo_len(&port->port.xmit_fifo);
+
+ dbc_start_tx(port);
+ }
+
spin_unlock_irqrestore(&port->port_lock, flags);
- return count;
+ return written;
}
static int dbc_tty_put_char(struct tty_struct *tty, u8 ch)
@@ -241,6 +282,10 @@ static unsigned int dbc_tty_write_room(struct tty_struct *tty)
spin_lock_irqsave(&port->port_lock, flags);
room = kfifo_avail(&port->port.xmit_fifo);
+
+ if (port->tx_boundary)
+ room = 0;
+
spin_unlock_irqrestore(&port->port_lock, flags);
return room;
--
2.25.1
The stream contex type (SCT) bitfield is used both in the stream context
data structure, and in the 'Set TR Dequeue pointer' command TRB.
In both cases it uses bits 3:1
The SCT_FOR_TRB(p) macro used to set the stream context type (SCT) field
for the 'Set TR Dequeue pointer' command TRB incorrectly shifts the value
1 bit left before masking the three bits.
Fix this by first masking and rshifting, just like the similar
SCT_FOR_CTX(p) macro does
This issue has not been visibile as the lost bit 3 is only used with
secondary stream arrays (SSA). Xhci driver currently only supports using
a primary stream array with Linear stream addressing.
Fixes: 95241dbdf828 ("xhci: Set SCT field for Set TR dequeue on streams")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
---
drivers/usb/host/xhci.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index 620502de971a..f0fb696d5619 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -1001,7 +1001,7 @@ enum xhci_setup_dev {
/* Set TR Dequeue Pointer command TRB fields, 6.4.3.9 */
#define TRB_TO_STREAM_ID(p) ((((p) & (0xffff << 16)) >> 16))
#define STREAM_ID_FOR_TRB(p) ((((p)) & 0xffff) << 16)
-#define SCT_FOR_TRB(p) (((p) << 1) & 0x7)
+#define SCT_FOR_TRB(p) (((p) & 0x7) << 1)
/* Link TRB specific fields */
#define TRB_TC (1<<1)
--
2.25.1
The struct vchiq_arm_state 'platform_state' is currently allocated
dynamically using kzalloc(). Unfortunately, it is never freed and is
subjected to memory leaks in the error handling paths of the probe()
function.
To address the issue, use device resource management helper
devm_kzalloc(), to ensure cleanup after its allocation.
Fixes: 71bad7f08641 ("staging: add bcm2708 vchiq driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Umang Jain <umang.jain(a)ideasonboard.com>
Reviewed-by: Dan Carpenter <dan.carpenter(a)linaro.org>
---
drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
index af623ad87c15..7ece82c361ee 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
@@ -285,7 +285,7 @@ vchiq_platform_init_state(struct vchiq_state *state)
{
struct vchiq_arm_state *platform_state;
- platform_state = kzalloc(sizeof(*platform_state), GFP_KERNEL);
+ platform_state = devm_kzalloc(state->dev, sizeof(*platform_state), GFP_KERNEL);
if (!platform_state)
return -ENOMEM;
--
2.45.2
The PLL checks are comparing 64 bit integers with 32 bit
ones. Depending on the values of the variables, this may
underflow.
Fix it ensuring that both sides of the expression are u64.
Fixes: 852b50aeed15 ("media: On Semi AR0521 sensor driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei(a)kernel.org>
---
drivers/media/i2c/ar0521.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/media/i2c/ar0521.c b/drivers/media/i2c/ar0521.c
index fc27238dd4d3..24873149096c 100644
--- a/drivers/media/i2c/ar0521.c
+++ b/drivers/media/i2c/ar0521.c
@@ -255,10 +255,10 @@ static u32 calc_pll(struct ar0521_dev *sensor, u32 freq, u16 *pre_ptr, u16 *mult
continue; /* Minimum value */
if (new_mult > 254)
break; /* Maximum, larger pre won't work either */
- if (sensor->extclk_freq * (u64)new_mult < AR0521_PLL_MIN *
+ if (sensor->extclk_freq * (u64)new_mult < (u64)AR0521_PLL_MIN *
new_pre)
continue;
- if (sensor->extclk_freq * (u64)new_mult > AR0521_PLL_MAX *
+ if (sensor->extclk_freq * (u64)new_mult > (u64)AR0521_PLL_MAX *
new_pre)
break; /* Larger pre won't work either */
new_pll = div64_round_up(sensor->extclk_freq * (u64)new_mult,
--
2.47.0
The error check logic at get_ctrl() is broken: if ptr_to_user()
fails to fill a control due to an error, no errors are returned
and v4l2_g_ctrl() returns success on a failed operation, which
may cause applications to fail.
Add an error check at get_ctrl() and ensure that it will
be returned to userspace without filling the control value if
get_ctrl() fails.
Fixes: 71c689dc2e73 ("media: v4l2-ctrls: split up into four source files")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei(a)kernel.org>
---
drivers/media/v4l2-core/v4l2-ctrls-api.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/drivers/media/v4l2-core/v4l2-ctrls-api.c b/drivers/media/v4l2-core/v4l2-ctrls-api.c
index e5a364efd5e6..be32dccf9830 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls-api.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls-api.c
@@ -753,9 +753,10 @@ static int get_ctrl(struct v4l2_ctrl *ctrl, struct v4l2_ext_control *c)
for (i = 0; i < master->ncontrols; i++)
cur_to_new(master->cluster[i]);
ret = call_op(master, g_volatile_ctrl);
- new_to_user(c, ctrl);
+ if (!ret)
+ ret = new_to_user(c, ctrl);
} else {
- cur_to_user(c, ctrl);
+ ret = cur_to_user(c, ctrl);
}
v4l2_ctrl_unlock(master);
return ret;
@@ -770,7 +771,10 @@ int v4l2_g_ctrl(struct v4l2_ctrl_handler *hdl, struct v4l2_control *control)
if (!ctrl || !ctrl->is_int)
return -EINVAL;
ret = get_ctrl(ctrl, &c);
- control->value = c.value;
+
+ if (!ret)
+ control->value = c.value;
+
return ret;
}
EXPORT_SYMBOL(v4l2_g_ctrl);
--
2.47.0
The logic at tpg_precalculate_line() blindly rescales the
buffer even when scaled_witdh is equal to zero. If this ever
happens, this will cause a division by zero.
Instead, add a WARN_ON() to trigger such cases and return
without doing any precalculation.
Fixes: 63881df94d3e ("[media] vivid: add the Test Pattern Generator")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei(a)kernel.org>
---
drivers/media/common/v4l2-tpg/v4l2-tpg-core.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c b/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c
index c86343a4d0bf..a22f31515d7e 100644
--- a/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c
+++ b/drivers/media/common/v4l2-tpg/v4l2-tpg-core.c
@@ -1795,6 +1795,9 @@ static void tpg_precalculate_line(struct tpg_data *tpg)
unsigned p;
unsigned x;
+ if (WARN_ON(tpg->src_width == 0))
+ return;
+
switch (tpg->pattern) {
case TPG_PAT_GREEN:
contrast = TPG_COLOR_100_RED;
--
2.47.0
The logic at get_edid_tag_location() returns either an offset
or an error condition. However, the error condition uses a
non-standard "-1" value.
Use instead -ENOENT to indicate that the tag was not found.
Fixes: 056f2821b631 ("media: cec: extron-da-hd-4k-plus: add the Extron DA HD 4K Plus CEC driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei(a)kernel.org>
---
.../cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c b/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c
index a526464af88c..7d03a36df5cf 100644
--- a/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c
+++ b/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c
@@ -348,12 +348,12 @@ static int get_edid_tag_location(const u8 *edid, unsigned int size,
/* Return if not a CTA-861 extension block */
if (size < 256 || edid[0] != 0x02 || edid[1] != 0x03)
- return -1;
+ return -ENOENT;
/* search tag */
d = edid[0x02] & 0x7f;
if (d <= 4)
- return -1;
+ return -ENOENT;
i = 0x04;
end = 0x00 + d;
@@ -371,7 +371,7 @@ static int get_edid_tag_location(const u8 *edid, unsigned int size,
return offset + i;
i += len + 1;
} while (i < end);
- return -1;
+ return -ENOENT;
}
static void extron_edid_crc(u8 *edid)
--
2.47.0