The patch below does not apply to the 6.0-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
f60ffa662d14 ("cifs: don't leak -ENOMEM in smb2_open_file()")
a9e17d3d74d1 ("cifs: fix static checker warning")
76894f3e2f71 ("cifs: improve symlink handling for smb2+")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f60ffa662d1427cfd31fe9d895c3566ac50bfe52 Mon Sep 17 00:00:00 2001
From: Paulo Alcantara <pc(a)cjr.nz>
Date: Mon, 19 Dec 2022 10:21:50 -0300
Subject: [PATCH] cifs: don't leak -ENOMEM in smb2_open_file()
A NULL error response might be a valid case where smb2_reconnect()
failed to reconnect the session and tcon due to a disconnected server
prior to issuing the I/O operation, so don't leak -ENOMEM to userspace
on such occasions.
Fixes: 76894f3e2f71 ("cifs: improve symlink handling for smb2+")
Signed-off-by: Paulo Alcantara (SUSE) <pc(a)cjr.nz>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index ffbd9a99fc12..ba6cc50af390 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -122,8 +122,8 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32
struct smb2_hdr *hdr = err_iov.iov_base;
if (unlikely(!err_iov.iov_base || err_buftype == CIFS_NO_BUFFER))
- rc = -ENOMEM;
- else if (hdr->Status == STATUS_STOPPED_ON_SYMLINK) {
+ goto out;
+ if (hdr->Status == STATUS_STOPPED_ON_SYMLINK) {
rc = smb2_parse_symlink_response(oparms->cifs_sb, &err_iov,
&data->symlink_target);
if (!rc) {
Hi,
Please drop perf-stat-display-event-stats-using-aggr-counts.patch
from queue-6.1.
It now fails to compile on 6.1.y.
Best Regards
Wang Yugui (wangyugui(a)e16-tech.com)
2022/12/27
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
4464853277d0 ("io_uring: pass in EPOLL_URING_WAKE for eventfd signaling and wakeups")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4464853277d0ccdb9914608dd1332f0fa2f9846f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Sun, 20 Nov 2022 10:18:45 -0700
Subject: [PATCH] io_uring: pass in EPOLL_URING_WAKE for eventfd signaling and
wakeups
Pass in EPOLL_URING_WAKE when signaling eventfd or doing poll related
wakeups, so that we can check for a circular event dependency between
eventfd and epoll. If this flag is set when our wakeup handlers are
called, then we know we have a dependency that needs to terminate
multishot requests.
eventfd and epoll are the only such possible dependencies.
Cc: stable(a)vger.kernel.org # 6.0
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 1299f9c8567a..762ecab801f2 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -495,7 +495,7 @@ static void io_eventfd_ops(struct rcu_head *rcu)
int ops = atomic_xchg(&ev_fd->ops, 0);
if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
- eventfd_signal(ev_fd->cq_ev_fd, 1);
+ eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
/* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
* ordering in a race but if references are 0 we know we have to free
@@ -531,7 +531,7 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx)
goto out;
if (likely(eventfd_signal_allowed())) {
- eventfd_signal(ev_fd->cq_ev_fd, 1);
+ eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
} else {
atomic_inc(&ev_fd->refs);
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 69fbd27c7577..83013ee584d6 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -4,6 +4,7 @@
#include <linux/errno.h>
#include <linux/lockdep.h>
#include <linux/io_uring_types.h>
+#include <uapi/linux/eventpoll.h>
#include "io-wq.h"
#include "slist.h"
#include "filetable.h"
@@ -211,12 +212,18 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx)
static inline void __io_cqring_wake(struct io_ring_ctx *ctx)
{
/*
- * wake_up_all() may seem excessive, but io_wake_function() and
- * io_should_wake() handle the termination of the loop and only
- * wake as many waiters as we need to.
+ * Trigger waitqueue handler on all waiters on our waitqueue. This
+ * won't necessarily wake up all the tasks, io_should_wake() will make
+ * that decision.
+ *
+ * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter
+ * set in the mask so that if we recurse back into our own poll
+ * waitqueue handlers, we know we have a dependency between eventfd or
+ * epoll and should terminate multishot poll at that point.
*/
if (waitqueue_active(&ctx->cq_wait))
- wake_up_all(&ctx->cq_wait);
+ __wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
+ poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}
static inline void io_cqring_wake(struct io_ring_ctx *ctx)
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 8fb8e781c02d..22c9b2e0944a 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -389,6 +389,14 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
return 0;
if (io_poll_get_ownership(req)) {
+ /*
+ * If we trigger a multishot poll off our own wakeup path,
+ * disable multishot as there is a circular dependency between
+ * CQ posting and triggering the event.
+ */
+ if (mask & EPOLL_URING_WAKE)
+ poll->events |= EPOLLONESHOT;
+
/* optional, saves extra locking for removal in tw handler */
if (mask && poll->events & EPOLLONESHOT) {
list_del_init(&poll->wait.entry);
The patch below does not apply to the 6.0-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
4464853277d0 ("io_uring: pass in EPOLL_URING_WAKE for eventfd signaling and wakeups")
fc86f9d3bb49 ("io_uring: remove redundant memory barrier in io_req_local_work_add")
21a091b970cd ("io_uring: signal registered eventfd to process deferred task work")
d8e9214f119d ("io_uring: move io_eventfd_put")
c0e0d6ba25f1 ("io_uring: add IORING_SETUP_DEFER_TASKRUN")
b4c98d59a787 ("io_uring: introduce io_has_work")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4464853277d0ccdb9914608dd1332f0fa2f9846f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Sun, 20 Nov 2022 10:18:45 -0700
Subject: [PATCH] io_uring: pass in EPOLL_URING_WAKE for eventfd signaling and
wakeups
Pass in EPOLL_URING_WAKE when signaling eventfd or doing poll related
wakeups, so that we can check for a circular event dependency between
eventfd and epoll. If this flag is set when our wakeup handlers are
called, then we know we have a dependency that needs to terminate
multishot requests.
eventfd and epoll are the only such possible dependencies.
Cc: stable(a)vger.kernel.org # 6.0
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 1299f9c8567a..762ecab801f2 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -495,7 +495,7 @@ static void io_eventfd_ops(struct rcu_head *rcu)
int ops = atomic_xchg(&ev_fd->ops, 0);
if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
- eventfd_signal(ev_fd->cq_ev_fd, 1);
+ eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
/* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
* ordering in a race but if references are 0 we know we have to free
@@ -531,7 +531,7 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx)
goto out;
if (likely(eventfd_signal_allowed())) {
- eventfd_signal(ev_fd->cq_ev_fd, 1);
+ eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
} else {
atomic_inc(&ev_fd->refs);
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 69fbd27c7577..83013ee584d6 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -4,6 +4,7 @@
#include <linux/errno.h>
#include <linux/lockdep.h>
#include <linux/io_uring_types.h>
+#include <uapi/linux/eventpoll.h>
#include "io-wq.h"
#include "slist.h"
#include "filetable.h"
@@ -211,12 +212,18 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx)
static inline void __io_cqring_wake(struct io_ring_ctx *ctx)
{
/*
- * wake_up_all() may seem excessive, but io_wake_function() and
- * io_should_wake() handle the termination of the loop and only
- * wake as many waiters as we need to.
+ * Trigger waitqueue handler on all waiters on our waitqueue. This
+ * won't necessarily wake up all the tasks, io_should_wake() will make
+ * that decision.
+ *
+ * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter
+ * set in the mask so that if we recurse back into our own poll
+ * waitqueue handlers, we know we have a dependency between eventfd or
+ * epoll and should terminate multishot poll at that point.
*/
if (waitqueue_active(&ctx->cq_wait))
- wake_up_all(&ctx->cq_wait);
+ __wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
+ poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}
static inline void io_cqring_wake(struct io_ring_ctx *ctx)
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 8fb8e781c02d..22c9b2e0944a 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -389,6 +389,14 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
return 0;
if (io_poll_get_ownership(req)) {
+ /*
+ * If we trigger a multishot poll off our own wakeup path,
+ * disable multishot as there is a circular dependency between
+ * CQ posting and triggering the event.
+ */
+ if (mask & EPOLL_URING_WAKE)
+ poll->events |= EPOLLONESHOT;
+
/* optional, saves extra locking for removal in tw handler */
if (mask && poll->events & EPOLLONESHOT) {
list_del_init(&poll->wait.entry);
The patch below does not apply to the 6.0-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
813e693023ba ("blk-iolatency: Fix memory leak on add_disk() failures")
00ad6991bbae ("blk-cgroup: pass a gendisk to blkg_destroy_all")
e13793bae659 ("blk-throttle: pass a gendisk to blk_throtl_init and blk_throtl_exit")
9823538fb7ef ("blk-cgroup: pass a gendisk to blkcg_init_queue and blkcg_exit_queue")
4a69f325aa43 ("blk-cgroup: cleanup the blkg_lookup family of functions")
928f6f00a91e ("blk-cgroup: remove blk_queue_root_blkg")
33dc62796cb6 ("blk-cgroup: fix error unwinding in blkcg_init_queue")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 813e693023ba10da9e75067780f8378465bf27cc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj(a)kernel.org>
Date: Sat, 10 Dec 2022 08:33:10 -1000
Subject: [PATCH] blk-iolatency: Fix memory leak on add_disk() failures
When a gendisk is successfully initialized but add_disk() fails such as when
a loop device has an invalid number of minor device numbers specified,
blkcg_init_disk() is called during init and then blkcg_exit_disk() during
error handling. Unfortunately, iolatency gets initialized in the former but
doesn't get cleaned up in the latter.
This is because, in non-error cases, the cleanup is performed by
del_gendisk() calling rq_qos_exit(), the assumption being that rq_qos
policies, iolatency being one of them, can only be activated once the disk
is fully registered and visible. That assumption is true for wbt and iocost,
but not so for iolatency as it gets initialized before add_disk() is called.
It is desirable to lazy-init rq_qos policies because they are optional
features and add to hot path overhead once initialized - each IO has to walk
all the registered rq_qos policies. So, we want to switch iolatency to lazy
init too. However, that's a bigger change. As a fix for the immediate
problem, let's just add an extra call to rq_qos_exit() in blkcg_exit_disk().
This is safe because duplicate calls to rq_qos_exit() become no-ops.
Signed-off-by: Tejun Heo <tj(a)kernel.org>
Reported-by: darklight2357(a)icloud.com
Cc: Josef Bacik <josef(a)toxicpanda.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Fixes: d70675121546 ("block: introduce blk-iolatency io controller")
Cc: stable(a)vger.kernel.org # v4.19+
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Link: https://lore.kernel.org/r/Y5TQ5gm3O4HXrXR3@slm.duckdns.org
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 50ac0dce95b8..ce6a2b7d3dfb 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -33,6 +33,7 @@
#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-throttle.h"
+#include "blk-rq-qos.h"
/*
* blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
@@ -1322,6 +1323,7 @@ int blkcg_init_disk(struct gendisk *disk)
void blkcg_exit_disk(struct gendisk *disk)
{
blkg_destroy_all(disk);
+ rq_qos_exit(disk->queue);
blk_throtl_exit(disk);
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
162d053e15fe ("btrfs: do not BUG_ON() on ENOMEM when dropping extent items for a range")
2766ff61762c ("btrfs: update the number of bytes used by an inode atomically")
5893dfb98f25 ("btrfs: refactor btrfs_drop_extents() to make it easier to extend")
ac5887c8e013 ("btrfs: locking: remove all the blocking helpers")
a14b78ad06ab ("btrfs: introduce btrfs_inode_lock()/unlock()")
b8d8e1fd570a ("btrfs: introduce btrfs_write_check()")
c86537a42f86 ("btrfs: check FS error state bit early during write")
5e8b9ef30392 ("btrfs: move pos increment and pagecache extension to btrfs_buffered_write")
4e4cabece9f9 ("btrfs: split btrfs_direct_IO to read and write")
196d59ab9ccc ("btrfs: switch extent buffer tree lock to rw_semaphore")
0425e7badbdc ("btrfs: don't fallback to buffered read if we don't need to")
3c38c877fcb9 ("btrfs: sink inode argument in insert_ordered_extent_file_extent")
fc0d82e103c7 ("btrfs: sink total_data parameter in setup_items_for_insert")
3dc9dc8969dc ("btrfs: eliminate total_size parameter from setup_items_for_insert")
0cbb5bdfea26 ("btrfs: rename btrfs_insert_clone_extent() to a more generic name")
306bfec02b10 ("btrfs: rename btrfs_punch_hole_range() to a more generic name")
bf385648fa48 ("btrfs: rename struct btrfs_clone_extent_info to a more generic name")
fb870f6cdd72 ("btrfs: remove item_size member of struct btrfs_clone_extent_info")
8fccebfa534c ("btrfs: fix metadata reservation for fallocate that leads to transaction aborts")
53ac7ead2446 ("btrfs: make btrfs_invalidatepage work on btrfs_inode")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 162d053e15fe985f754ef495a96eb3db970c43ed Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Mon, 28 Nov 2022 15:07:30 +0000
Subject: [PATCH] btrfs: do not BUG_ON() on ENOMEM when dropping extent items
for a range
If we get -ENOMEM while dropping file extent items in a given range, at
btrfs_drop_extents(), due to failure to allocate memory when attempting to
increment the reference count for an extent or drop the reference count,
we handle it with a BUG_ON(). This is excessive, instead we can simply
abort the transaction and return the error to the caller. In fact most
callers of btrfs_drop_extents(), directly or indirectly, already abort
the transaction if btrfs_drop_extents() returns any error.
Also, we already have error paths at btrfs_drop_extents() that may return
-ENOMEM and in those cases we abort the transaction, like for example
anything that changes the b+tree may return -ENOMEM due to a failure to
allocate a new extent buffer when COWing an existing extent buffer, such
as a call to btrfs_duplicate_item() for example.
So replace the BUG_ON() calls with proper logic to abort the transaction
and return the error.
Reported-by: syzbot+0b1fb6b0108c27419f9f(a)syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/00000000000089773e05ee4b9cb4@google.com/
CC: stable(a)vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 448b143a5cb2..91b00eb2440e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -380,7 +380,10 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
args->start - extent_offset,
0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
}
key.offset = args->start;
}
@@ -467,7 +470,10 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
key.offset - extent_offset, 0,
false);
ret = btrfs_free_extent(trans, &ref);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
args->bytes_found += extent_end - key.offset;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
162d053e15fe ("btrfs: do not BUG_ON() on ENOMEM when dropping extent items for a range")
2766ff61762c ("btrfs: update the number of bytes used by an inode atomically")
5893dfb98f25 ("btrfs: refactor btrfs_drop_extents() to make it easier to extend")
ac5887c8e013 ("btrfs: locking: remove all the blocking helpers")
a14b78ad06ab ("btrfs: introduce btrfs_inode_lock()/unlock()")
b8d8e1fd570a ("btrfs: introduce btrfs_write_check()")
c86537a42f86 ("btrfs: check FS error state bit early during write")
5e8b9ef30392 ("btrfs: move pos increment and pagecache extension to btrfs_buffered_write")
4e4cabece9f9 ("btrfs: split btrfs_direct_IO to read and write")
196d59ab9ccc ("btrfs: switch extent buffer tree lock to rw_semaphore")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 162d053e15fe985f754ef495a96eb3db970c43ed Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Mon, 28 Nov 2022 15:07:30 +0000
Subject: [PATCH] btrfs: do not BUG_ON() on ENOMEM when dropping extent items
for a range
If we get -ENOMEM while dropping file extent items in a given range, at
btrfs_drop_extents(), due to failure to allocate memory when attempting to
increment the reference count for an extent or drop the reference count,
we handle it with a BUG_ON(). This is excessive, instead we can simply
abort the transaction and return the error to the caller. In fact most
callers of btrfs_drop_extents(), directly or indirectly, already abort
the transaction if btrfs_drop_extents() returns any error.
Also, we already have error paths at btrfs_drop_extents() that may return
-ENOMEM and in those cases we abort the transaction, like for example
anything that changes the b+tree may return -ENOMEM due to a failure to
allocate a new extent buffer when COWing an existing extent buffer, such
as a call to btrfs_duplicate_item() for example.
So replace the BUG_ON() calls with proper logic to abort the transaction
and return the error.
Reported-by: syzbot+0b1fb6b0108c27419f9f(a)syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/00000000000089773e05ee4b9cb4@google.com/
CC: stable(a)vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 448b143a5cb2..91b00eb2440e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -380,7 +380,10 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
args->start - extent_offset,
0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
}
key.offset = args->start;
}
@@ -467,7 +470,10 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
key.offset - extent_offset, 0,
false);
ret = btrfs_free_extent(trans, &ref);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
args->bytes_found += extent_end - key.offset;
}
The memory for "llcc_driv_data" is allocated by the LLCC driver. But when
it is passed as "pvt_info" to the EDAC core, it will get freed during the
qcom_edac driver release. So when the qcom_edac driver gets probed again,
it will try to use the freed data leading to the use-after-free bug.
Fix this by not passing "llcc_driv_data" as pvt_info but rather reference
it using the "platform_data" in the qcom_edac driver.
Cc: <stable(a)vger.kernel.org> # 4.20
Fixes: 27450653f1db ("drivers: edac: Add EDAC driver support for QCOM SoCs")
Tested-by: Steev Klimaszewski <steev(a)kali.org> # Thinkpad X13s
Tested-by: Andrew Halaney <ahalaney(a)redhat.com> # sa8540p-ride
Reported-by: Steev Klimaszewski <steev(a)kali.org>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam(a)linaro.org>
---
drivers/edac/qcom_edac.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/edac/qcom_edac.c b/drivers/edac/qcom_edac.c
index 9e77fa84e84f..3256254c3722 100644
--- a/drivers/edac/qcom_edac.c
+++ b/drivers/edac/qcom_edac.c
@@ -252,7 +252,7 @@ dump_syn_reg_values(struct llcc_drv_data *drv, u32 bank, int err_type)
static int
dump_syn_reg(struct edac_device_ctl_info *edev_ctl, int err_type, u32 bank)
{
- struct llcc_drv_data *drv = edev_ctl->pvt_info;
+ struct llcc_drv_data *drv = edev_ctl->dev->platform_data;
int ret;
ret = dump_syn_reg_values(drv, bank, err_type);
@@ -289,7 +289,7 @@ static irqreturn_t
llcc_ecc_irq_handler(int irq, void *edev_ctl)
{
struct edac_device_ctl_info *edac_dev_ctl = edev_ctl;
- struct llcc_drv_data *drv = edac_dev_ctl->pvt_info;
+ struct llcc_drv_data *drv = edac_dev_ctl->dev->platform_data;
irqreturn_t irq_rc = IRQ_NONE;
u32 drp_error, trp_error, i;
int ret;
@@ -358,7 +358,6 @@ static int qcom_llcc_edac_probe(struct platform_device *pdev)
edev_ctl->dev_name = dev_name(dev);
edev_ctl->ctl_name = "llcc";
edev_ctl->panic_on_ue = LLCC_ERP_PANIC_ON_UE;
- edev_ctl->pvt_info = llcc_driv_data;
rc = edac_device_add_device(edev_ctl);
if (rc)
--
2.25.1
The following s390 build warnings / errors were noticed on the stable-rc 4.19 queue.
Regressions found on s390:
- build/gcc-11-tinyconfig
- build/gcc-10-tinyconfig
Reported-by: Linux Kernel Functional Testing <lkft(a)linaro.org>
git_repo: https://gitlab.com/Linaro/lkft/mirrors/stable/linux-stable-rc-queues
git_describe: v4.19.269-363-g176f3d59718e
Build: v4.19.269-363-g176f3d59718e
Details: https://qa-reports.linaro.org/lkft/linux-stable-rc-queues-queue_4.19-sanity…
make --silent --keep-going --jobs=8
O=/home/tuxbuild/.cache/tuxmake/builds/1/build ARCH=s390
CROSS_COMPILE=s390x-linux-gnu- 'CC=sccache s390x-linux-gnu-gcc'
'HOSTCC=sccache gcc'
In function 'setup_lowcore_dat_off',
inlined from 'setup_arch' at /builds/linux/arch/s390/kernel/setup.c:958:2:
/builds/linux/arch/s390/kernel/setup.c:342:9: warning: 'memcpy'
reading 128 bytes from a region of size 0 [-Wstringop-overread]
342 | memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
343 | sizeof(lc->stfle_fac_list));
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~
/builds/linux/arch/s390/kernel/setup.c:344:9: warning: 'memcpy'
reading 128 bytes from a region of size 0 [-Wstringop-overread]
344 | memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list,
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
345 | sizeof(lc->alt_stfle_fac_list));
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /builds/linux/arch/s390/kernel/lgr.c:13:
In function 'stfle',
inlined from 'lgr_info_get' at /builds/linux/arch/s390/kernel/lgr.c:122:2:
/builds/linux/arch/s390/include/asm/facility.h:88:9: warning: 'memcpy'
reading 4 bytes from a region of size 0 [-Wstringop-overread]
88 | memcpy(stfle_fac_list, &S390_lowcore.stfl_fac_list, 4);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
s390x-linux-gnu-ld: drivers/base/platform.o: in function
`devm_platform_get_and_ioremap_resource':
platform.c:(.text+0x43a): undefined reference to `devm_ioremap_resource'
s390x-linux-gnu-ld: drivers/base/platform.o: in function
`devm_platform_ioremap_resource':
platform.c:(.text+0x478): undefined reference to `devm_ioremap_resource'
make[1]: *** [/builds/linux/Makefile:1055: vmlinux] Error 1
Build details,
https://qa-reports.linaro.org/lkft/linux-stable-rc-queues-queue_4.19-sanity…
https://qa-reports.linaro.org/lkft/linux-stable-rc-queues-queue_4.19-sanity…
Build logs,
https://storage.tuxsuite.com/public/linaro/lkft/builds/2JXXJr2fBrwWGJISB1Ic…
Steps to reproduce:
--------------------
# To install tuxmake on your system globally:
# sudo pip3 install -U tuxmake
#
# See https://docs.tuxmake.org/ for complete documentation.
# Original tuxmake command with fragments listed below.
# tuxmake --runtime podman --target-arch s390 --toolchain gcc-11
--kconfig tinyconfig
--
Linaro LKFT
https://lkft.linaro.org
Changes since v1:
- V1: https://lore.kernel.org/lkml/cover.1670005163.git.reinette.chatre@intel.com/
- Cover trimmed after obtaining needed information.
- Added Reviewed-by tags.
- cc stable team.
- Please see individual patches for patch specific changes.
Dear Maintainers,
I have been using the IDXD driver to experiment with the upcoming core
changes in support of IMS ([1], [2], [3]). As part of this work I
happened to exercise the error paths within IDXD and encountered
a few issues that are addressed in this series. These changes are
independent of IMS and just aim to make the IDXD driver more
robust against errors.
Your feedback is greatly appreciated.
Reinette
[1] https://lore.kernel.org/lkml/20221111132706.104870257@linutronix.de
[2] https://lore.kernel.org/lkml/20221111131813.914374272@linutronix.de
[3] https://lore.kernel.org/lkml/20221111133158.196269823@linutronix.de
Reinette Chatre (3):
dmaengine: idxd: Let probe fail when workqueue cannot be enabled
dmaengine: idxd: Prevent use after free on completion memory
dmaengine: idxd: Do not call DMX TX callbacks during workqueue disable
drivers/dma/idxd/device.c | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
base-commit: 76dcd734eca23168cb008912c0f69ff408905235
--
2.34.1
From: Nathan Lynch <nathanl(a)linux.ibm.com>
[ Upstream commit ed2213bfb192ab51f09f12e9b49b5d482c6493f3 ]
rtas_os_term() is called during panic. Its behavior depends on a couple
of conditions in the /rtas node of the device tree, the traversal of
which entails locking and local IRQ state changes. If the kernel panics
while devtree_lock is held, rtas_os_term() as currently written could
hang.
Instead of discovering the relevant characteristics at panic time,
cache them in file-static variables at boot. Note the lookup for
"ibm,extended-os-term" is converted to of_property_read_bool() since it
is a boolean property, not an RTAS function token.
Signed-off-by: Nathan Lynch <nathanl(a)linux.ibm.com>
Reviewed-by: Nicholas Piggin <npiggin(a)gmail.com>
Reviewed-by: Andrew Donnellan <ajd(a)linux.ibm.com>
[mpe: Incorporate suggested change from Nick]
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20221118150751.469393-4-nathanl@linux.ibm.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
arch/powerpc/kernel/rtas.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index dd20e87f18f2..914d71879536 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -716,6 +716,7 @@ void __noreturn rtas_halt(void)
/* Must be in the RMO region, so we place it here */
static char rtas_os_term_buf[2048];
+static s32 ibm_os_term_token = RTAS_UNKNOWN_SERVICE;
void rtas_os_term(char *str)
{
@@ -727,14 +728,13 @@ void rtas_os_term(char *str)
* this property may terminate the partition which we want to avoid
* since it interferes with panic_timeout.
*/
- if (RTAS_UNKNOWN_SERVICE == rtas_token("ibm,os-term") ||
- RTAS_UNKNOWN_SERVICE == rtas_token("ibm,extended-os-term"))
+ if (ibm_os_term_token == RTAS_UNKNOWN_SERVICE)
return;
snprintf(rtas_os_term_buf, 2048, "OS panic: %s", str);
do {
- status = rtas_call(rtas_token("ibm,os-term"), 1, 1, NULL,
+ status = rtas_call(ibm_os_term_token, 1, 1, NULL,
__pa(rtas_os_term_buf));
} while (rtas_busy_delay(status));
@@ -1331,6 +1331,13 @@ void __init rtas_initialize(void)
no_entry = of_property_read_u32(rtas.dev, "linux,rtas-entry", &entry);
rtas.entry = no_entry ? rtas.base : entry;
+ /*
+ * Discover these now to avoid device tree lookups in the
+ * panic path.
+ */
+ if (of_property_read_bool(rtas.dev, "ibm,extended-os-term"))
+ ibm_os_term_token = rtas_token("ibm,os-term");
+
/* If RTAS was found, allocate the RMO buffer for it and look for
* the stop-self token if any
*/
--
2.35.1
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
85c50197716c ("loop: Fix the max_loop commandline argument treatment when it is set to 0")
18d1f200b380 ("loop: move loop_ctl_mutex locking into loop_add")
f9d107644aa4 ("loop: split loop_control_ioctl")
4157fe0b3d16 ("loop: don't call loop_lookup before adding a loop device")
d6da83d072c1 ("loop: remove the l argument to loop_add")
990e78116d38 ("block: loop: fix deadlock between open and remove")
6cc8e7430801 ("loop: scale loop device by introducing per device lock")
aeb2b0b1a3da ("block: drop dead assignments in loop_init()")
8410d38c2552 ("loop: use __register_blkdev to allocate devices on demand")
200f93377220 ("loop: be paranoid on exit and prevent new additions / removals")
62ab466ca881 ("loop: Move loop_set_status_from_info() and friends up")
0c3796c24459 ("loop: Factor out configuring loop from status")
b0bd158dd630 ("loop: Refactor loop_set_status() size calculation")
5795b6f5607f ("loop: Factor out setting loop device size")
083a6a50783e ("loop: Remove sector_t truncation checks")
7c5014b0987a ("loop: Call loop_config_discard() only after new config is applied")
33ec3e53e7b1 ("loop: Don't change loop device under exclusive opener")
56a85fd8376e ("loop: properly observe rotational flag of underlying device")
758a58d0bc67 ("loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()")
5db470e229e2 ("loop: drop caches if offset or block_size are changed")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 85c50197716c60fe57f411339c579462e563ac57 Mon Sep 17 00:00:00 2001
From: "Isaac J. Manjarres" <isaacmanjarres(a)google.com>
Date: Thu, 8 Dec 2022 13:29:01 -0800
Subject: [PATCH] loop: Fix the max_loop commandline argument treatment when it
is set to 0
Currently, the max_loop commandline argument can be used to specify how
many loop block devices are created at init time. If it is not
specified on the commandline, CONFIG_BLK_DEV_LOOP_MIN_COUNT loop block
devices will be created.
The max_loop commandline argument can be used to override the value of
CONFIG_BLK_DEV_LOOP_MIN_COUNT. However, when max_loop is set to 0
through the commandline, the current logic treats it as if it had not
been set, and creates CONFIG_BLK_DEV_LOOP_MIN_COUNT devices anyway.
Fix this by starting max_loop off as set to CONFIG_BLK_DEV_LOOP_MIN_COUNT.
This preserves the intended behavior of creating
CONFIG_BLK_DEV_LOOP_MIN_COUNT loop block devices if the max_loop
commandline parameter is not specified, and allowing max_loop to
be respected for all values, including 0.
This allows environments that can create all of their required loop
block devices on demand to not have to unnecessarily preallocate loop
block devices.
Fixes: 732850827450 ("remove artificial software max_loop limit")
Cc: stable(a)vger.kernel.org
Cc: Ken Chen <kenchen(a)google.com>
Signed-off-by: Isaac J. Manjarres <isaacmanjarres(a)google.com>
Link: https://lore.kernel.org/r/20221208212902.765781-1-isaacmanjarres@google.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 1f8f3b87bdfa..df628e30bca4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1773,7 +1773,16 @@ static const struct block_device_operations lo_fops = {
/*
* And now the modules code and kernel interface.
*/
-static int max_loop;
+
+/*
+ * If max_loop is specified, create that many devices upfront.
+ * This also becomes a hard limit. If max_loop is not specified,
+ * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
+ * init time. Loop devices can be requested on-demand with the
+ * /dev/loop-control interface, or be instantiated by accessing
+ * a 'dead' device node.
+ */
+static int max_loop = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
module_param(max_loop, int, 0444);
MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
module_param(max_part, int, 0444);
@@ -2181,7 +2190,7 @@ MODULE_ALIAS("devname:loop-control");
static int __init loop_init(void)
{
- int i, nr;
+ int i;
int err;
part_shift = 0;
@@ -2209,19 +2218,6 @@ static int __init loop_init(void)
goto err_out;
}
- /*
- * If max_loop is specified, create that many devices upfront.
- * This also becomes a hard limit. If max_loop is not specified,
- * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
- * init time. Loop devices can be requested on-demand with the
- * /dev/loop-control interface, or be instantiated by accessing
- * a 'dead' device node.
- */
- if (max_loop)
- nr = max_loop;
- else
- nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
-
err = misc_register(&loop_misc);
if (err < 0)
goto err_out;
@@ -2233,7 +2229,7 @@ static int __init loop_init(void)
}
/* pre-create number of devices given by config or max_loop */
- for (i = 0; i < nr; i++)
+ for (i = 0; i < max_loop; i++)
loop_add(i);
printk(KERN_INFO "loop: module loaded\n");
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
85c50197716c ("loop: Fix the max_loop commandline argument treatment when it is set to 0")
18d1f200b380 ("loop: move loop_ctl_mutex locking into loop_add")
f9d107644aa4 ("loop: split loop_control_ioctl")
4157fe0b3d16 ("loop: don't call loop_lookup before adding a loop device")
d6da83d072c1 ("loop: remove the l argument to loop_add")
990e78116d38 ("block: loop: fix deadlock between open and remove")
6cc8e7430801 ("loop: scale loop device by introducing per device lock")
aeb2b0b1a3da ("block: drop dead assignments in loop_init()")
8410d38c2552 ("loop: use __register_blkdev to allocate devices on demand")
200f93377220 ("loop: be paranoid on exit and prevent new additions / removals")
62ab466ca881 ("loop: Move loop_set_status_from_info() and friends up")
0c3796c24459 ("loop: Factor out configuring loop from status")
b0bd158dd630 ("loop: Refactor loop_set_status() size calculation")
5795b6f5607f ("loop: Factor out setting loop device size")
083a6a50783e ("loop: Remove sector_t truncation checks")
7c5014b0987a ("loop: Call loop_config_discard() only after new config is applied")
33ec3e53e7b1 ("loop: Don't change loop device under exclusive opener")
56a85fd8376e ("loop: properly observe rotational flag of underlying device")
758a58d0bc67 ("loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()")
5db470e229e2 ("loop: drop caches if offset or block_size are changed")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 85c50197716c60fe57f411339c579462e563ac57 Mon Sep 17 00:00:00 2001
From: "Isaac J. Manjarres" <isaacmanjarres(a)google.com>
Date: Thu, 8 Dec 2022 13:29:01 -0800
Subject: [PATCH] loop: Fix the max_loop commandline argument treatment when it
is set to 0
Currently, the max_loop commandline argument can be used to specify how
many loop block devices are created at init time. If it is not
specified on the commandline, CONFIG_BLK_DEV_LOOP_MIN_COUNT loop block
devices will be created.
The max_loop commandline argument can be used to override the value of
CONFIG_BLK_DEV_LOOP_MIN_COUNT. However, when max_loop is set to 0
through the commandline, the current logic treats it as if it had not
been set, and creates CONFIG_BLK_DEV_LOOP_MIN_COUNT devices anyway.
Fix this by starting max_loop off as set to CONFIG_BLK_DEV_LOOP_MIN_COUNT.
This preserves the intended behavior of creating
CONFIG_BLK_DEV_LOOP_MIN_COUNT loop block devices if the max_loop
commandline parameter is not specified, and allowing max_loop to
be respected for all values, including 0.
This allows environments that can create all of their required loop
block devices on demand to not have to unnecessarily preallocate loop
block devices.
Fixes: 732850827450 ("remove artificial software max_loop limit")
Cc: stable(a)vger.kernel.org
Cc: Ken Chen <kenchen(a)google.com>
Signed-off-by: Isaac J. Manjarres <isaacmanjarres(a)google.com>
Link: https://lore.kernel.org/r/20221208212902.765781-1-isaacmanjarres@google.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 1f8f3b87bdfa..df628e30bca4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1773,7 +1773,16 @@ static const struct block_device_operations lo_fops = {
/*
* And now the modules code and kernel interface.
*/
-static int max_loop;
+
+/*
+ * If max_loop is specified, create that many devices upfront.
+ * This also becomes a hard limit. If max_loop is not specified,
+ * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
+ * init time. Loop devices can be requested on-demand with the
+ * /dev/loop-control interface, or be instantiated by accessing
+ * a 'dead' device node.
+ */
+static int max_loop = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
module_param(max_loop, int, 0444);
MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
module_param(max_part, int, 0444);
@@ -2181,7 +2190,7 @@ MODULE_ALIAS("devname:loop-control");
static int __init loop_init(void)
{
- int i, nr;
+ int i;
int err;
part_shift = 0;
@@ -2209,19 +2218,6 @@ static int __init loop_init(void)
goto err_out;
}
- /*
- * If max_loop is specified, create that many devices upfront.
- * This also becomes a hard limit. If max_loop is not specified,
- * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
- * init time. Loop devices can be requested on-demand with the
- * /dev/loop-control interface, or be instantiated by accessing
- * a 'dead' device node.
- */
- if (max_loop)
- nr = max_loop;
- else
- nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
-
err = misc_register(&loop_misc);
if (err < 0)
goto err_out;
@@ -2233,7 +2229,7 @@ static int __init loop_init(void)
}
/* pre-create number of devices given by config or max_loop */
- for (i = 0; i < nr; i++)
+ for (i = 0; i < max_loop; i++)
loop_add(i);
printk(KERN_INFO "loop: module loaded\n");
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
85c50197716c ("loop: Fix the max_loop commandline argument treatment when it is set to 0")
18d1f200b380 ("loop: move loop_ctl_mutex locking into loop_add")
f9d107644aa4 ("loop: split loop_control_ioctl")
4157fe0b3d16 ("loop: don't call loop_lookup before adding a loop device")
d6da83d072c1 ("loop: remove the l argument to loop_add")
990e78116d38 ("block: loop: fix deadlock between open and remove")
6cc8e7430801 ("loop: scale loop device by introducing per device lock")
aeb2b0b1a3da ("block: drop dead assignments in loop_init()")
8410d38c2552 ("loop: use __register_blkdev to allocate devices on demand")
200f93377220 ("loop: be paranoid on exit and prevent new additions / removals")
62ab466ca881 ("loop: Move loop_set_status_from_info() and friends up")
0c3796c24459 ("loop: Factor out configuring loop from status")
b0bd158dd630 ("loop: Refactor loop_set_status() size calculation")
5795b6f5607f ("loop: Factor out setting loop device size")
083a6a50783e ("loop: Remove sector_t truncation checks")
7c5014b0987a ("loop: Call loop_config_discard() only after new config is applied")
33ec3e53e7b1 ("loop: Don't change loop device under exclusive opener")
56a85fd8376e ("loop: properly observe rotational flag of underlying device")
758a58d0bc67 ("loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()")
5db470e229e2 ("loop: drop caches if offset or block_size are changed")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 85c50197716c60fe57f411339c579462e563ac57 Mon Sep 17 00:00:00 2001
From: "Isaac J. Manjarres" <isaacmanjarres(a)google.com>
Date: Thu, 8 Dec 2022 13:29:01 -0800
Subject: [PATCH] loop: Fix the max_loop commandline argument treatment when it
is set to 0
Currently, the max_loop commandline argument can be used to specify how
many loop block devices are created at init time. If it is not
specified on the commandline, CONFIG_BLK_DEV_LOOP_MIN_COUNT loop block
devices will be created.
The max_loop commandline argument can be used to override the value of
CONFIG_BLK_DEV_LOOP_MIN_COUNT. However, when max_loop is set to 0
through the commandline, the current logic treats it as if it had not
been set, and creates CONFIG_BLK_DEV_LOOP_MIN_COUNT devices anyway.
Fix this by starting max_loop off as set to CONFIG_BLK_DEV_LOOP_MIN_COUNT.
This preserves the intended behavior of creating
CONFIG_BLK_DEV_LOOP_MIN_COUNT loop block devices if the max_loop
commandline parameter is not specified, and allowing max_loop to
be respected for all values, including 0.
This allows environments that can create all of their required loop
block devices on demand to not have to unnecessarily preallocate loop
block devices.
Fixes: 732850827450 ("remove artificial software max_loop limit")
Cc: stable(a)vger.kernel.org
Cc: Ken Chen <kenchen(a)google.com>
Signed-off-by: Isaac J. Manjarres <isaacmanjarres(a)google.com>
Link: https://lore.kernel.org/r/20221208212902.765781-1-isaacmanjarres@google.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 1f8f3b87bdfa..df628e30bca4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1773,7 +1773,16 @@ static const struct block_device_operations lo_fops = {
/*
* And now the modules code and kernel interface.
*/
-static int max_loop;
+
+/*
+ * If max_loop is specified, create that many devices upfront.
+ * This also becomes a hard limit. If max_loop is not specified,
+ * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
+ * init time. Loop devices can be requested on-demand with the
+ * /dev/loop-control interface, or be instantiated by accessing
+ * a 'dead' device node.
+ */
+static int max_loop = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
module_param(max_loop, int, 0444);
MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
module_param(max_part, int, 0444);
@@ -2181,7 +2190,7 @@ MODULE_ALIAS("devname:loop-control");
static int __init loop_init(void)
{
- int i, nr;
+ int i;
int err;
part_shift = 0;
@@ -2209,19 +2218,6 @@ static int __init loop_init(void)
goto err_out;
}
- /*
- * If max_loop is specified, create that many devices upfront.
- * This also becomes a hard limit. If max_loop is not specified,
- * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
- * init time. Loop devices can be requested on-demand with the
- * /dev/loop-control interface, or be instantiated by accessing
- * a 'dead' device node.
- */
- if (max_loop)
- nr = max_loop;
- else
- nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
-
err = misc_register(&loop_misc);
if (err < 0)
goto err_out;
@@ -2233,7 +2229,7 @@ static int __init loop_init(void)
}
/* pre-create number of devices given by config or max_loop */
- for (i = 0; i < nr; i++)
+ for (i = 0; i < max_loop; i++)
loop_add(i);
printk(KERN_INFO "loop: module loaded\n");
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
85c50197716c ("loop: Fix the max_loop commandline argument treatment when it is set to 0")
18d1f200b380 ("loop: move loop_ctl_mutex locking into loop_add")
f9d107644aa4 ("loop: split loop_control_ioctl")
4157fe0b3d16 ("loop: don't call loop_lookup before adding a loop device")
d6da83d072c1 ("loop: remove the l argument to loop_add")
990e78116d38 ("block: loop: fix deadlock between open and remove")
6cc8e7430801 ("loop: scale loop device by introducing per device lock")
aeb2b0b1a3da ("block: drop dead assignments in loop_init()")
8410d38c2552 ("loop: use __register_blkdev to allocate devices on demand")
200f93377220 ("loop: be paranoid on exit and prevent new additions / removals")
62ab466ca881 ("loop: Move loop_set_status_from_info() and friends up")
0c3796c24459 ("loop: Factor out configuring loop from status")
b0bd158dd630 ("loop: Refactor loop_set_status() size calculation")
5795b6f5607f ("loop: Factor out setting loop device size")
083a6a50783e ("loop: Remove sector_t truncation checks")
7c5014b0987a ("loop: Call loop_config_discard() only after new config is applied")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 85c50197716c60fe57f411339c579462e563ac57 Mon Sep 17 00:00:00 2001
From: "Isaac J. Manjarres" <isaacmanjarres(a)google.com>
Date: Thu, 8 Dec 2022 13:29:01 -0800
Subject: [PATCH] loop: Fix the max_loop commandline argument treatment when it
is set to 0
Currently, the max_loop commandline argument can be used to specify how
many loop block devices are created at init time. If it is not
specified on the commandline, CONFIG_BLK_DEV_LOOP_MIN_COUNT loop block
devices will be created.
The max_loop commandline argument can be used to override the value of
CONFIG_BLK_DEV_LOOP_MIN_COUNT. However, when max_loop is set to 0
through the commandline, the current logic treats it as if it had not
been set, and creates CONFIG_BLK_DEV_LOOP_MIN_COUNT devices anyway.
Fix this by starting max_loop off as set to CONFIG_BLK_DEV_LOOP_MIN_COUNT.
This preserves the intended behavior of creating
CONFIG_BLK_DEV_LOOP_MIN_COUNT loop block devices if the max_loop
commandline parameter is not specified, and allowing max_loop to
be respected for all values, including 0.
This allows environments that can create all of their required loop
block devices on demand to not have to unnecessarily preallocate loop
block devices.
Fixes: 732850827450 ("remove artificial software max_loop limit")
Cc: stable(a)vger.kernel.org
Cc: Ken Chen <kenchen(a)google.com>
Signed-off-by: Isaac J. Manjarres <isaacmanjarres(a)google.com>
Link: https://lore.kernel.org/r/20221208212902.765781-1-isaacmanjarres@google.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 1f8f3b87bdfa..df628e30bca4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1773,7 +1773,16 @@ static const struct block_device_operations lo_fops = {
/*
* And now the modules code and kernel interface.
*/
-static int max_loop;
+
+/*
+ * If max_loop is specified, create that many devices upfront.
+ * This also becomes a hard limit. If max_loop is not specified,
+ * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
+ * init time. Loop devices can be requested on-demand with the
+ * /dev/loop-control interface, or be instantiated by accessing
+ * a 'dead' device node.
+ */
+static int max_loop = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
module_param(max_loop, int, 0444);
MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
module_param(max_part, int, 0444);
@@ -2181,7 +2190,7 @@ MODULE_ALIAS("devname:loop-control");
static int __init loop_init(void)
{
- int i, nr;
+ int i;
int err;
part_shift = 0;
@@ -2209,19 +2218,6 @@ static int __init loop_init(void)
goto err_out;
}
- /*
- * If max_loop is specified, create that many devices upfront.
- * This also becomes a hard limit. If max_loop is not specified,
- * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
- * init time. Loop devices can be requested on-demand with the
- * /dev/loop-control interface, or be instantiated by accessing
- * a 'dead' device node.
- */
- if (max_loop)
- nr = max_loop;
- else
- nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
-
err = misc_register(&loop_misc);
if (err < 0)
goto err_out;
@@ -2233,7 +2229,7 @@ static int __init loop_init(void)
}
/* pre-create number of devices given by config or max_loop */
- for (i = 0; i < nr; i++)
+ for (i = 0; i < max_loop; i++)
loop_add(i);
printk(KERN_INFO "loop: module loaded\n");
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
85c50197716c ("loop: Fix the max_loop commandline argument treatment when it is set to 0")
18d1f200b380 ("loop: move loop_ctl_mutex locking into loop_add")
f9d107644aa4 ("loop: split loop_control_ioctl")
4157fe0b3d16 ("loop: don't call loop_lookup before adding a loop device")
d6da83d072c1 ("loop: remove the l argument to loop_add")
990e78116d38 ("block: loop: fix deadlock between open and remove")
6cc8e7430801 ("loop: scale loop device by introducing per device lock")
aeb2b0b1a3da ("block: drop dead assignments in loop_init()")
8410d38c2552 ("loop: use __register_blkdev to allocate devices on demand")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 85c50197716c60fe57f411339c579462e563ac57 Mon Sep 17 00:00:00 2001
From: "Isaac J. Manjarres" <isaacmanjarres(a)google.com>
Date: Thu, 8 Dec 2022 13:29:01 -0800
Subject: [PATCH] loop: Fix the max_loop commandline argument treatment when it
is set to 0
Currently, the max_loop commandline argument can be used to specify how
many loop block devices are created at init time. If it is not
specified on the commandline, CONFIG_BLK_DEV_LOOP_MIN_COUNT loop block
devices will be created.
The max_loop commandline argument can be used to override the value of
CONFIG_BLK_DEV_LOOP_MIN_COUNT. However, when max_loop is set to 0
through the commandline, the current logic treats it as if it had not
been set, and creates CONFIG_BLK_DEV_LOOP_MIN_COUNT devices anyway.
Fix this by starting max_loop off as set to CONFIG_BLK_DEV_LOOP_MIN_COUNT.
This preserves the intended behavior of creating
CONFIG_BLK_DEV_LOOP_MIN_COUNT loop block devices if the max_loop
commandline parameter is not specified, and allowing max_loop to
be respected for all values, including 0.
This allows environments that can create all of their required loop
block devices on demand to not have to unnecessarily preallocate loop
block devices.
Fixes: 732850827450 ("remove artificial software max_loop limit")
Cc: stable(a)vger.kernel.org
Cc: Ken Chen <kenchen(a)google.com>
Signed-off-by: Isaac J. Manjarres <isaacmanjarres(a)google.com>
Link: https://lore.kernel.org/r/20221208212902.765781-1-isaacmanjarres@google.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 1f8f3b87bdfa..df628e30bca4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1773,7 +1773,16 @@ static const struct block_device_operations lo_fops = {
/*
* And now the modules code and kernel interface.
*/
-static int max_loop;
+
+/*
+ * If max_loop is specified, create that many devices upfront.
+ * This also becomes a hard limit. If max_loop is not specified,
+ * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
+ * init time. Loop devices can be requested on-demand with the
+ * /dev/loop-control interface, or be instantiated by accessing
+ * a 'dead' device node.
+ */
+static int max_loop = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
module_param(max_loop, int, 0444);
MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
module_param(max_part, int, 0444);
@@ -2181,7 +2190,7 @@ MODULE_ALIAS("devname:loop-control");
static int __init loop_init(void)
{
- int i, nr;
+ int i;
int err;
part_shift = 0;
@@ -2209,19 +2218,6 @@ static int __init loop_init(void)
goto err_out;
}
- /*
- * If max_loop is specified, create that many devices upfront.
- * This also becomes a hard limit. If max_loop is not specified,
- * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
- * init time. Loop devices can be requested on-demand with the
- * /dev/loop-control interface, or be instantiated by accessing
- * a 'dead' device node.
- */
- if (max_loop)
- nr = max_loop;
- else
- nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
-
err = misc_register(&loop_misc);
if (err < 0)
goto err_out;
@@ -2233,7 +2229,7 @@ static int __init loop_init(void)
}
/* pre-create number of devices given by config or max_loop */
- for (i = 0; i < nr; i++)
+ for (i = 0; i < max_loop; i++)
loop_add(i);
printk(KERN_INFO "loop: module loaded\n");
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
97a48da1619b ("usb: dwc3: qcom: Fix memory leak in dwc3_qcom_interconnect_init")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 97a48da1619ba6bd42a0e5da0a03aa490a9496b1 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006(a)gmail.com>
Date: Tue, 6 Dec 2022 12:17:31 +0400
Subject: [PATCH] usb: dwc3: qcom: Fix memory leak in
dwc3_qcom_interconnect_init
of_icc_get() allocates resources for the path handle, so we should release
them when they are no longer needed, as dwc3_qcom_interconnect_exit() does.
Add icc_put() calls to the error handling paths to fix this.
Fixes: bea46b981515 ("usb: dwc3: qcom: Add interconnect support in dwc3 driver")
Cc: stable <stable(a)kernel.org>
Acked-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Signed-off-by: Miaoqian Lin <linmq006(a)gmail.com>
Link: https://lore.kernel.org/r/20221206081731.818107-1-linmq006@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/dwc3/dwc3-qcom.c b/drivers/usb/dwc3/dwc3-qcom.c
index 7c40f3ffc054..b0a0351d2d8b 100644
--- a/drivers/usb/dwc3/dwc3-qcom.c
+++ b/drivers/usb/dwc3/dwc3-qcom.c
@@ -261,7 +261,8 @@ static int dwc3_qcom_interconnect_init(struct dwc3_qcom *qcom)
if (IS_ERR(qcom->icc_path_apps)) {
dev_err(dev, "failed to get apps-usb path: %ld\n",
PTR_ERR(qcom->icc_path_apps));
- return PTR_ERR(qcom->icc_path_apps);
+ ret = PTR_ERR(qcom->icc_path_apps);
+ goto put_path_ddr;
}
max_speed = usb_get_maximum_speed(&qcom->dwc3->dev);
@@ -274,16 +275,22 @@ static int dwc3_qcom_interconnect_init(struct dwc3_qcom *qcom)
}
if (ret) {
dev_err(dev, "failed to set bandwidth for usb-ddr path: %d\n", ret);
- return ret;
+ goto put_path_apps;
}
ret = icc_set_bw(qcom->icc_path_apps, APPS_USB_AVG_BW, APPS_USB_PEAK_BW);
if (ret) {
dev_err(dev, "failed to set bandwidth for apps-usb path: %d\n", ret);
- return ret;
+ goto put_path_apps;
}
return 0;
+
+put_path_apps:
+ icc_put(qcom->icc_path_apps);
+put_path_ddr:
+ icc_put(qcom->icc_path_ddr);
+ return ret;
}
/**
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
97a48da1619b ("usb: dwc3: qcom: Fix memory leak in dwc3_qcom_interconnect_init")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 97a48da1619ba6bd42a0e5da0a03aa490a9496b1 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006(a)gmail.com>
Date: Tue, 6 Dec 2022 12:17:31 +0400
Subject: [PATCH] usb: dwc3: qcom: Fix memory leak in
dwc3_qcom_interconnect_init
of_icc_get() allocates resources for the path handle, so we should release
them when they are no longer needed, as dwc3_qcom_interconnect_exit() does.
Add icc_put() calls to the error handling paths to fix this.
Fixes: bea46b981515 ("usb: dwc3: qcom: Add interconnect support in dwc3 driver")
Cc: stable <stable(a)kernel.org>
Acked-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Signed-off-by: Miaoqian Lin <linmq006(a)gmail.com>
Link: https://lore.kernel.org/r/20221206081731.818107-1-linmq006@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/dwc3/dwc3-qcom.c b/drivers/usb/dwc3/dwc3-qcom.c
index 7c40f3ffc054..b0a0351d2d8b 100644
--- a/drivers/usb/dwc3/dwc3-qcom.c
+++ b/drivers/usb/dwc3/dwc3-qcom.c
@@ -261,7 +261,8 @@ static int dwc3_qcom_interconnect_init(struct dwc3_qcom *qcom)
if (IS_ERR(qcom->icc_path_apps)) {
dev_err(dev, "failed to get apps-usb path: %ld\n",
PTR_ERR(qcom->icc_path_apps));
- return PTR_ERR(qcom->icc_path_apps);
+ ret = PTR_ERR(qcom->icc_path_apps);
+ goto put_path_ddr;
}
max_speed = usb_get_maximum_speed(&qcom->dwc3->dev);
@@ -274,16 +275,22 @@ static int dwc3_qcom_interconnect_init(struct dwc3_qcom *qcom)
}
if (ret) {
dev_err(dev, "failed to set bandwidth for usb-ddr path: %d\n", ret);
- return ret;
+ goto put_path_apps;
}
ret = icc_set_bw(qcom->icc_path_apps, APPS_USB_AVG_BW, APPS_USB_PEAK_BW);
if (ret) {
dev_err(dev, "failed to set bandwidth for apps-usb path: %d\n", ret);
- return ret;
+ goto put_path_apps;
}
return 0;
+
+put_path_apps:
+ icc_put(qcom->icc_path_apps);
+put_path_ddr:
+ icc_put(qcom->icc_path_ddr);
+ return ret;
}
/**
The patch below does not apply to the 6.0-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
97a48da1619b ("usb: dwc3: qcom: Fix memory leak in dwc3_qcom_interconnect_init")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 97a48da1619ba6bd42a0e5da0a03aa490a9496b1 Mon Sep 17 00:00:00 2001
From: Miaoqian Lin <linmq006(a)gmail.com>
Date: Tue, 6 Dec 2022 12:17:31 +0400
Subject: [PATCH] usb: dwc3: qcom: Fix memory leak in
dwc3_qcom_interconnect_init
of_icc_get() allocates resources for the path handle, so we should release
them when they are no longer needed, as dwc3_qcom_interconnect_exit() does.
Add icc_put() calls to the error handling paths to fix this.
Fixes: bea46b981515 ("usb: dwc3: qcom: Add interconnect support in dwc3 driver")
Cc: stable <stable(a)kernel.org>
Acked-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Signed-off-by: Miaoqian Lin <linmq006(a)gmail.com>
Link: https://lore.kernel.org/r/20221206081731.818107-1-linmq006@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/dwc3/dwc3-qcom.c b/drivers/usb/dwc3/dwc3-qcom.c
index 7c40f3ffc054..b0a0351d2d8b 100644
--- a/drivers/usb/dwc3/dwc3-qcom.c
+++ b/drivers/usb/dwc3/dwc3-qcom.c
@@ -261,7 +261,8 @@ static int dwc3_qcom_interconnect_init(struct dwc3_qcom *qcom)
if (IS_ERR(qcom->icc_path_apps)) {
dev_err(dev, "failed to get apps-usb path: %ld\n",
PTR_ERR(qcom->icc_path_apps));
- return PTR_ERR(qcom->icc_path_apps);
+ ret = PTR_ERR(qcom->icc_path_apps);
+ goto put_path_ddr;
}
max_speed = usb_get_maximum_speed(&qcom->dwc3->dev);
@@ -274,16 +275,22 @@ static int dwc3_qcom_interconnect_init(struct dwc3_qcom *qcom)
}
if (ret) {
dev_err(dev, "failed to set bandwidth for usb-ddr path: %d\n", ret);
- return ret;
+ goto put_path_apps;
}
ret = icc_set_bw(qcom->icc_path_apps, APPS_USB_AVG_BW, APPS_USB_PEAK_BW);
if (ret) {
dev_err(dev, "failed to set bandwidth for apps-usb path: %d\n", ret);
- return ret;
+ goto put_path_apps;
}
return 0;
+
+put_path_apps:
+ icc_put(qcom->icc_path_apps);
+put_path_ddr:
+ icc_put(qcom->icc_path_ddr);
+ return ret;
}
/**
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
62c73bfea048 ("usb: dwc3: Fix race between dwc3_set_mode and __dwc3_set_mode")
07903626d988 ("usb: dwc3: core: Do not perform GCTL_CORE_SOFTRESET during bootup")
afbd04e66e5d ("usb: dwc3: core: Deprecate GCTL.CORESOFTRESET")
f88359e1588b ("usb: dwc3: core: Do core softreset when switch mode")
f580170f135a ("usb: dwc3: Add splitdisable quirk for Hisilicon Kirin Soc")
dc336b19e82d ("usb: dwc3: core: do not queue work if dr_mode is not USB_DR_MODE_OTG")
c2cd3452d5f8 ("usb: dwc3: support continuous runtime PM with dual role")
a0a465569b45 ("usb: dwc3: remove generic PHY calibrate() calls")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 62c73bfea048e66168df09da6d3e4510ecda40bb Mon Sep 17 00:00:00 2001
From: Sven Peter <sven(a)svenpeter.dev>
Date: Mon, 28 Nov 2022 17:15:26 +0100
Subject: [PATCH] usb: dwc3: Fix race between dwc3_set_mode and __dwc3_set_mode
dwc->desired_dr_role is changed by dwc3_set_mode inside a spinlock but
then read by __dwc3_set_mode outside of that lock. This can lead to a
race condition when very quick successive role switch events happen:
CPU A
dwc3_set_mode(DWC3_GCTL_PRTCAP_HOST) // first role switch event
spin_lock_irqsave(&dwc->lock, flags);
dwc->desired_dr_role = mode; // DWC3_GCTL_PRTCAP_HOST
spin_unlock_irqrestore(&dwc->lock, flags);
queue_work(system_freezable_wq, &dwc->drd_work);
CPU B
__dwc3_set_mode
// ....
spin_lock_irqsave(&dwc->lock, flags);
// desired_dr_role is DWC3_GCTL_PRTCAP_HOST
dwc3_set_prtcap(dwc, dwc->desired_dr_role);
spin_unlock_irqrestore(&dwc->lock, flags);
CPU A
dwc3_set_mode(DWC3_GCTL_PRTCAP_DEVICE) // second event
spin_lock_irqsave(&dwc->lock, flags);
dwc->desired_dr_role = mode; // DWC3_GCTL_PRTCAP_DEVICE
spin_unlock_irqrestore(&dwc->lock, flags);
CPU B (continues running __dwc3_set_mode)
switch (dwc->desired_dr_role) { // DWC3_GCTL_PRTCAP_DEVICE
// ....
case DWC3_GCTL_PRTCAP_DEVICE:
// ....
ret = dwc3_gadget_init(dwc);
We then have DWC3_GCTL.DWC3_GCTL_PRTCAPDIR = DWC3_GCTL_PRTCAP_HOST and
dwc->current_dr_role = DWC3_GCTL_PRTCAP_HOST but initialized the
controller in device mode. It's also possible to get into a state
where both host and device are initialized at the same time.
Fix this race by creating a local copy of desired_dr_role inside
__dwc3_set_mode while holding dwc->lock.
Fixes: 41ce1456e1db ("usb: dwc3: core: make dwc3_set_mode() work properly")
Cc: stable <stable(a)kernel.org>
Acked-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Signed-off-by: Sven Peter <sven(a)svenpeter.dev>
Link: https://lore.kernel.org/r/20221128161526.79730-1-sven@svenpeter.dev
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
index 1f348bc867c2..fc38a8b13efa 100644
--- a/drivers/usb/dwc3/core.c
+++ b/drivers/usb/dwc3/core.c
@@ -122,21 +122,25 @@ static void __dwc3_set_mode(struct work_struct *work)
unsigned long flags;
int ret;
u32 reg;
+ u32 desired_dr_role;
mutex_lock(&dwc->mutex);
+ spin_lock_irqsave(&dwc->lock, flags);
+ desired_dr_role = dwc->desired_dr_role;
+ spin_unlock_irqrestore(&dwc->lock, flags);
pm_runtime_get_sync(dwc->dev);
if (dwc->current_dr_role == DWC3_GCTL_PRTCAP_OTG)
dwc3_otg_update(dwc, 0);
- if (!dwc->desired_dr_role)
+ if (!desired_dr_role)
goto out;
- if (dwc->desired_dr_role == dwc->current_dr_role)
+ if (desired_dr_role == dwc->current_dr_role)
goto out;
- if (dwc->desired_dr_role == DWC3_GCTL_PRTCAP_OTG && dwc->edev)
+ if (desired_dr_role == DWC3_GCTL_PRTCAP_OTG && dwc->edev)
goto out;
switch (dwc->current_dr_role) {
@@ -164,7 +168,7 @@ static void __dwc3_set_mode(struct work_struct *work)
*/
if (dwc->current_dr_role && ((DWC3_IP_IS(DWC3) ||
DWC3_VER_IS_PRIOR(DWC31, 190A)) &&
- dwc->desired_dr_role != DWC3_GCTL_PRTCAP_OTG)) {
+ desired_dr_role != DWC3_GCTL_PRTCAP_OTG)) {
reg = dwc3_readl(dwc->regs, DWC3_GCTL);
reg |= DWC3_GCTL_CORESOFTRESET;
dwc3_writel(dwc->regs, DWC3_GCTL, reg);
@@ -184,11 +188,11 @@ static void __dwc3_set_mode(struct work_struct *work)
spin_lock_irqsave(&dwc->lock, flags);
- dwc3_set_prtcap(dwc, dwc->desired_dr_role);
+ dwc3_set_prtcap(dwc, desired_dr_role);
spin_unlock_irqrestore(&dwc->lock, flags);
- switch (dwc->desired_dr_role) {
+ switch (desired_dr_role) {
case DWC3_GCTL_PRTCAP_HOST:
ret = dwc3_host_init(dwc);
if (ret) {
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
62c73bfea048 ("usb: dwc3: Fix race between dwc3_set_mode and __dwc3_set_mode")
07903626d988 ("usb: dwc3: core: Do not perform GCTL_CORE_SOFTRESET during bootup")
afbd04e66e5d ("usb: dwc3: core: Deprecate GCTL.CORESOFTRESET")
f88359e1588b ("usb: dwc3: core: Do core softreset when switch mode")
f580170f135a ("usb: dwc3: Add splitdisable quirk for Hisilicon Kirin Soc")
dc336b19e82d ("usb: dwc3: core: do not queue work if dr_mode is not USB_DR_MODE_OTG")
c2cd3452d5f8 ("usb: dwc3: support continuous runtime PM with dual role")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 62c73bfea048e66168df09da6d3e4510ecda40bb Mon Sep 17 00:00:00 2001
From: Sven Peter <sven(a)svenpeter.dev>
Date: Mon, 28 Nov 2022 17:15:26 +0100
Subject: [PATCH] usb: dwc3: Fix race between dwc3_set_mode and __dwc3_set_mode
dwc->desired_dr_role is changed by dwc3_set_mode inside a spinlock but
then read by __dwc3_set_mode outside of that lock. This can lead to a
race condition when very quick successive role switch events happen:
CPU A
dwc3_set_mode(DWC3_GCTL_PRTCAP_HOST) // first role switch event
spin_lock_irqsave(&dwc->lock, flags);
dwc->desired_dr_role = mode; // DWC3_GCTL_PRTCAP_HOST
spin_unlock_irqrestore(&dwc->lock, flags);
queue_work(system_freezable_wq, &dwc->drd_work);
CPU B
__dwc3_set_mode
// ....
spin_lock_irqsave(&dwc->lock, flags);
// desired_dr_role is DWC3_GCTL_PRTCAP_HOST
dwc3_set_prtcap(dwc, dwc->desired_dr_role);
spin_unlock_irqrestore(&dwc->lock, flags);
CPU A
dwc3_set_mode(DWC3_GCTL_PRTCAP_DEVICE) // second event
spin_lock_irqsave(&dwc->lock, flags);
dwc->desired_dr_role = mode; // DWC3_GCTL_PRTCAP_DEVICE
spin_unlock_irqrestore(&dwc->lock, flags);
CPU B (continues running __dwc3_set_mode)
switch (dwc->desired_dr_role) { // DWC3_GCTL_PRTCAP_DEVICE
// ....
case DWC3_GCTL_PRTCAP_DEVICE:
// ....
ret = dwc3_gadget_init(dwc);
We then have DWC3_GCTL.DWC3_GCTL_PRTCAPDIR = DWC3_GCTL_PRTCAP_HOST and
dwc->current_dr_role = DWC3_GCTL_PRTCAP_HOST but initialized the
controller in device mode. It's also possible to get into a state
where both host and device are initialized at the same time.
Fix this race by creating a local copy of desired_dr_role inside
__dwc3_set_mode while holding dwc->lock.
Fixes: 41ce1456e1db ("usb: dwc3: core: make dwc3_set_mode() work properly")
Cc: stable <stable(a)kernel.org>
Acked-by: Thinh Nguyen <Thinh.Nguyen(a)synopsys.com>
Signed-off-by: Sven Peter <sven(a)svenpeter.dev>
Link: https://lore.kernel.org/r/20221128161526.79730-1-sven@svenpeter.dev
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
index 1f348bc867c2..fc38a8b13efa 100644
--- a/drivers/usb/dwc3/core.c
+++ b/drivers/usb/dwc3/core.c
@@ -122,21 +122,25 @@ static void __dwc3_set_mode(struct work_struct *work)
unsigned long flags;
int ret;
u32 reg;
+ u32 desired_dr_role;
mutex_lock(&dwc->mutex);
+ spin_lock_irqsave(&dwc->lock, flags);
+ desired_dr_role = dwc->desired_dr_role;
+ spin_unlock_irqrestore(&dwc->lock, flags);
pm_runtime_get_sync(dwc->dev);
if (dwc->current_dr_role == DWC3_GCTL_PRTCAP_OTG)
dwc3_otg_update(dwc, 0);
- if (!dwc->desired_dr_role)
+ if (!desired_dr_role)
goto out;
- if (dwc->desired_dr_role == dwc->current_dr_role)
+ if (desired_dr_role == dwc->current_dr_role)
goto out;
- if (dwc->desired_dr_role == DWC3_GCTL_PRTCAP_OTG && dwc->edev)
+ if (desired_dr_role == DWC3_GCTL_PRTCAP_OTG && dwc->edev)
goto out;
switch (dwc->current_dr_role) {
@@ -164,7 +168,7 @@ static void __dwc3_set_mode(struct work_struct *work)
*/
if (dwc->current_dr_role && ((DWC3_IP_IS(DWC3) ||
DWC3_VER_IS_PRIOR(DWC31, 190A)) &&
- dwc->desired_dr_role != DWC3_GCTL_PRTCAP_OTG)) {
+ desired_dr_role != DWC3_GCTL_PRTCAP_OTG)) {
reg = dwc3_readl(dwc->regs, DWC3_GCTL);
reg |= DWC3_GCTL_CORESOFTRESET;
dwc3_writel(dwc->regs, DWC3_GCTL, reg);
@@ -184,11 +188,11 @@ static void __dwc3_set_mode(struct work_struct *work)
spin_lock_irqsave(&dwc->lock, flags);
- dwc3_set_prtcap(dwc, dwc->desired_dr_role);
+ dwc3_set_prtcap(dwc, desired_dr_role);
spin_unlock_irqrestore(&dwc->lock, flags);
- switch (dwc->desired_dr_role) {
+ switch (desired_dr_role) {
case DWC3_GCTL_PRTCAP_HOST:
ret = dwc3_host_init(dwc);
if (ret) {
Platforms based on the SDM845 SoC lock access to the EDAC registers in the
bootloader, so probing the EDAC driver will result in a crash. Hence,
disable the creation of the EDAC platform device on all SDM845 devices.
The issue has been observed on Lenovo Yoga C630 and DB845c.
Cc: <stable(a)vger.kernel.org> # 5.10
Reported-by: Steev Klimaszewski <steev(a)kali.org>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam(a)linaro.org>
---
drivers/soc/qcom/llcc-qcom.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/drivers/soc/qcom/llcc-qcom.c b/drivers/soc/qcom/llcc-qcom.c
index 7b7c5a38bac6..8d840702df50 100644
--- a/drivers/soc/qcom/llcc-qcom.c
+++ b/drivers/soc/qcom/llcc-qcom.c
@@ -1012,11 +1012,18 @@ static int qcom_llcc_probe(struct platform_device *pdev)
drv_data->ecc_irq = platform_get_irq_optional(pdev, 0);
- llcc_edac = platform_device_register_data(&pdev->dev,
- "qcom_llcc_edac", -1, drv_data,
- sizeof(*drv_data));
- if (IS_ERR(llcc_edac))
- dev_err(dev, "Failed to register llcc edac driver\n");
+ /*
+ * The platforms based on SDM845 SoC locks the access to EDAC registers
+ * in bootloader. So probing the EDAC driver will result in a crash.
+ * Hence, disable the creation of EDAC platform device on SDM845.
+ */
+ if (!of_device_is_compatible(dev->of_node, "qcom,sdm845-llcc")) {
+ llcc_edac = platform_device_register_data(&pdev->dev,
+ "qcom_llcc_edac", -1, drv_data,
+ sizeof(*drv_data));
+ if (IS_ERR(llcc_edac))
+ dev_err(dev, "Failed to register llcc edac driver\n");
+ }
return 0;
err:
--
2.25.1
From: Nathan Lynch <nathanl(a)linux.ibm.com>
[ Upstream commit ed2213bfb192ab51f09f12e9b49b5d482c6493f3 ]
rtas_os_term() is called during panic. Its behavior depends on a couple
of conditions in the /rtas node of the device tree, the traversal of
which entails locking and local IRQ state changes. If the kernel panics
while devtree_lock is held, rtas_os_term() as currently written could
hang.
Instead of discovering the relevant characteristics at panic time,
cache them in file-static variables at boot. Note the lookup for
"ibm,extended-os-term" is converted to of_property_read_bool() since it
is a boolean property, not an RTAS function token.
Signed-off-by: Nathan Lynch <nathanl(a)linux.ibm.com>
Reviewed-by: Nicholas Piggin <npiggin(a)gmail.com>
Reviewed-by: Andrew Donnellan <ajd(a)linux.ibm.com>
[mpe: Incorporate suggested change from Nick]
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20221118150751.469393-4-nathanl@linux.ibm.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
arch/powerpc/kernel/rtas.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index b492cb1c36fd..4c9ed28465b3 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -717,6 +717,7 @@ void __noreturn rtas_halt(void)
/* Must be in the RMO region, so we place it here */
static char rtas_os_term_buf[2048];
+static s32 ibm_os_term_token = RTAS_UNKNOWN_SERVICE;
void rtas_os_term(char *str)
{
@@ -728,14 +729,13 @@ void rtas_os_term(char *str)
* this property may terminate the partition which we want to avoid
* since it interferes with panic_timeout.
*/
- if (RTAS_UNKNOWN_SERVICE == rtas_token("ibm,os-term") ||
- RTAS_UNKNOWN_SERVICE == rtas_token("ibm,extended-os-term"))
+ if (ibm_os_term_token == RTAS_UNKNOWN_SERVICE)
return;
snprintf(rtas_os_term_buf, 2048, "OS panic: %s", str);
do {
- status = rtas_call(rtas_token("ibm,os-term"), 1, 1, NULL,
+ status = rtas_call(ibm_os_term_token, 1, 1, NULL,
__pa(rtas_os_term_buf));
} while (rtas_busy_delay(status));
@@ -1332,6 +1332,13 @@ void __init rtas_initialize(void)
no_entry = of_property_read_u32(rtas.dev, "linux,rtas-entry", &entry);
rtas.entry = no_entry ? rtas.base : entry;
+ /*
+ * Discover these now to avoid device tree lookups in the
+ * panic path.
+ */
+ if (of_property_read_bool(rtas.dev, "ibm,extended-os-term"))
+ ibm_os_term_token = rtas_token("ibm,os-term");
+
/* If RTAS was found, allocate the RMO buffer for it and look for
* the stop-self token if any
*/
--
2.35.1
From: Andre Przywara <andre.przywara(a)arm.com>
[ Upstream commit 0f607406525d25019dd9c498bcc0b42734fc59d5 ]
The USB PHY used in the Allwinner H616 SoC inherits some traits from its
various predecessors: it has four full PHYs like the H3, needs some
extra bits to be set like the H6, and puts SIDDQ on a different bit like
the A100. Plus it needs this weird PHY2 quirk.
Name all those properties in a new config struct and assign a new
compatible name to it.
Signed-off-by: Andre Przywara <andre.przywara(a)arm.com>
Reviewed-by: Samuel Holland <samuel(a)sholland.org>
Link: https://lore.kernel.org/r/20221031111358.3387297-5-andre.przywara@arm.com
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/phy/allwinner/phy-sun4i-usb.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/drivers/phy/allwinner/phy-sun4i-usb.c b/drivers/phy/allwinner/phy-sun4i-usb.c
index e5842e48a5e0..22938be176e7 100644
--- a/drivers/phy/allwinner/phy-sun4i-usb.c
+++ b/drivers/phy/allwinner/phy-sun4i-usb.c
@@ -973,6 +973,17 @@ static const struct sun4i_usb_phy_cfg sun50i_h6_cfg = {
.missing_phys = BIT(1) | BIT(2),
};
+static const struct sun4i_usb_phy_cfg sun50i_h616_cfg = {
+ .num_phys = 4,
+ .type = sun50i_h6_phy,
+ .disc_thresh = 3,
+ .phyctl_offset = REG_PHYCTL_A33,
+ .dedicated_clocks = true,
+ .phy0_dual_route = true,
+ .hci_phy_ctl_clear = PHY_CTL_SIDDQ,
+ .needs_phy2_siddq = true,
+};
+
static const struct of_device_id sun4i_usb_phy_of_match[] = {
{ .compatible = "allwinner,sun4i-a10-usb-phy", .data = &sun4i_a10_cfg },
{ .compatible = "allwinner,sun5i-a13-usb-phy", .data = &sun5i_a13_cfg },
@@ -987,6 +998,7 @@ static const struct of_device_id sun4i_usb_phy_of_match[] = {
{ .compatible = "allwinner,sun50i-a64-usb-phy",
.data = &sun50i_a64_cfg},
{ .compatible = "allwinner,sun50i-h6-usb-phy", .data = &sun50i_h6_cfg },
+ { .compatible = "allwinner,sun50i-h616-usb-phy", .data = &sun50i_h616_cfg },
{ },
};
MODULE_DEVICE_TABLE(of, sun4i_usb_phy_of_match);
--
2.35.1
Hi,
just a short note to report regular freezes with kernel 6.1.0 on a
haswell laptop quad core Intel Core i7-4750HQ (-MT MCP-) with integrated
graphics.
- system only freezes when launching the desktop environment (working on
a text console while having the sddm login screen up, without logging
in, does not seem to cause the issue);
- freezes happen a few seconds to a few minutes after getting to the
desktop environment (that uses opengl and composition). Freeze happens
both on X11 or Wayland.
- freeze seems to cause data loss (system not able to complete writes
when the freeze occurs, data structures on disk get corrupted, e.g.
system complained on broken btrfs snapshots made by timeshift-like app).
- system on freeze ceases responding to ping from the outside;
- upon reboot I cannot find any trace of any issue in the journal;
- on the same system booting kernels up to 6.0.14 is OK.
Seen using a distro kernel, but it should be fairly mainline (manjaro/arch).
Reported to the distro, but seems serious enough to report here too.
Thanks,
Sergio
Hi
Make us an offer on Original New sealed Box Cisco located in usa
C9400-LC-24XS
C9200L-48T-4G-A
C9200L-48T-4G-E
C9200L-48T-4X-E
WS-2960X 48-LPS-L New sealed box Cisco Qty 30
32GB2Rx4 PC4 2400T QTY: 100 $20 each
16GB 2RX8 PC4-3200AA-UB1-11 Qty 100 $18
...............................................
Take all memory for $1,400
4GB DDR3 DESKTOP 86PCS
4GB DDR4 DESKTOP 100PCS
4GB DDR4 LAPTOP 50PCS
8GB DDR3 DESKTOP 64PCS
8GB DDR4 DESKTOP 143PCS
8GB DDR4 LAPTOP 165 PCS.
16GB 2RX4 PC4-2133P-RBB-10 Qty 85
8GB DDR4 PC4-17000 CL15 260-PIN SODIMM Qty 190
We are looking for a buyer to move all @ $1000 USD
Regards,
Justin Gates
Server Rack Equipment
1343 No. 5 Road, Richmond,
British Columbia
V7A 4G1 Canada
Phone: +1 7783083945 | Fax: 778 308 4563
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 1993bf97992df2d560287f3c4120eda57426843d
Gitweb: https://git.kernel.org/tip/1993bf97992df2d560287f3c4120eda57426843d
Author: Masami Hiramatsu (Google) <mhiramat(a)kernel.org>
AuthorDate: Mon, 19 Dec 2022 23:35:10 +09:00
Committer: Peter Zijlstra <peterz(a)infradead.org>
CommitterDate: Tue, 27 Dec 2022 12:51:58 +01:00
x86/kprobes: Fix kprobes instruction boudary check with CONFIG_RETHUNK
Since the CONFIG_RETHUNK and CONFIG_SLS will use INT3 for stopping
speculative execution after RET instruction, kprobes always fails to
check the probed instruction boundary by decoding the function body if
the probed address is after such sequence. (Note that some conditional
code blocks will be placed after function return, if compiler decides
it is not on the hot path.)
This is because kprobes expects that kgdb put the INT3 there as a software
breakpoint that replaced the original instruction.
But these INT3s do not serve that purpose, so there is no need to recover
the original instruction.
To avoid this issue, kprobes checks whether the INT3 is owned by
kgdb or not, and if so, stop decoding and make it fail. The other
INT3 will come from CONFIG_RETHUNK/CONFIG_SLS and those can be
treated as a one-byte instruction.
Fixes: e463a09af2f0 ("x86: Add straight-line-speculation mitigation")
Suggested-by: Peter Zijlstra <peterz(a)infradead.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat(a)kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/167146051026.1374301.392728975473572291.stgit@dev…
---
arch/x86/kernel/kprobes/core.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 6629968..b36f3c3 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -37,6 +37,7 @@
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
+#include <linux/kgdb.h>
#include <linux/ftrace.h>
#include <linux/kasan.h>
#include <linux/moduleloader.h>
@@ -281,12 +282,15 @@ static int can_probe(unsigned long paddr)
if (ret < 0)
return 0;
+#ifdef CONFIG_KGDB
/*
- * Another debugging subsystem might insert this breakpoint.
- * In that case, we can't recover it.
+ * If there is a dynamically installed kgdb sw breakpoint,
+ * this function should not be probed.
*/
- if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE &&
+ kgdb_has_hit_break(addr))
return 0;
+#endif
addr += insn.length;
}
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: 63dc6325ff41ee9e570bde705ac34a39c5dbeb44
Gitweb: https://git.kernel.org/tip/63dc6325ff41ee9e570bde705ac34a39c5dbeb44
Author: Masami Hiramatsu (Google) <mhiramat(a)kernel.org>
AuthorDate: Mon, 19 Dec 2022 23:35:19 +09:00
Committer: Peter Zijlstra <peterz(a)infradead.org>
CommitterDate: Tue, 27 Dec 2022 12:51:58 +01:00
x86/kprobes: Fix optprobe optimization check with CONFIG_RETHUNK
Since the CONFIG_RETHUNK and CONFIG_SLS will use INT3 for stopping
speculative execution after function return, kprobe jump optimization
always fails on the functions with such INT3 inside the function body.
(It already checks the INT3 padding between functions, but not inside
the function)
To avoid this issue, in the same way as kprobes, check whether the INT3 comes
from kgdb or not, and if so, stop decoding and make it fail. The other
INT3 will come from CONFIG_RETHUNK/CONFIG_SLS and those can be
treated as a one-byte instruction.
Fixes: e463a09af2f0 ("x86: Add straight-line-speculation mitigation")
Suggested-by: Peter Zijlstra <peterz(a)infradead.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat(a)kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/167146051929.1374301.7419382929328081706.stgit@de…
---
arch/x86/kernel/kprobes/opt.c | 28 ++++++++--------------------
1 file changed, 8 insertions(+), 20 deletions(-)
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index e6b8c53..e57e07b 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -15,6 +15,7 @@
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
+#include <linux/kgdb.h>
#include <linux/ftrace.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
@@ -279,19 +280,6 @@ static int insn_is_indirect_jump(struct insn *insn)
return ret;
}
-static bool is_padding_int3(unsigned long addr, unsigned long eaddr)
-{
- unsigned char ops;
-
- for (; addr < eaddr; addr++) {
- if (get_kernel_nofault(ops, (void *)addr) < 0 ||
- ops != INT3_INSN_OPCODE)
- return false;
- }
-
- return true;
-}
-
/* Decode whole function to ensure any instructions don't jump into target */
static int can_optimize(unsigned long paddr)
{
@@ -334,15 +322,15 @@ static int can_optimize(unsigned long paddr)
ret = insn_decode_kernel(&insn, (void *)recovered_insn);
if (ret < 0)
return 0;
-
+#ifdef CONFIG_KGDB
/*
- * In the case of detecting unknown breakpoint, this could be
- * a padding INT3 between functions. Let's check that all the
- * rest of the bytes are also INT3.
+ * If there is a dynamically installed kgdb sw breakpoint,
+ * this function should not be probed.
*/
- if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
- return is_padding_int3(addr, paddr - offset + size) ? 1 : 0;
-
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE &&
+ kgdb_has_hit_break(addr))
+ return 0;
+#endif
/* Recover address */
insn.kaddr = (void *)addr;
insn.next_byte = (void *)(addr + insn.length);
The patch below does not apply to the 6.0-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
1c123c567fb1 ("bpf: Resolve fext program type when checking map compatibility")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1c123c567fb138ebd187480b7fc0610fcb0851f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke(a)redhat.com>
Date: Thu, 15 Dec 2022 00:02:53 +0100
Subject: [PATCH] bpf: Resolve fext program type when checking map
compatibility
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The bpf_prog_map_compatible() check makes sure that BPF program types are
not mixed inside BPF map types that can contain programs (tail call maps,
cpumaps and devmaps). It does this by setting the fields of the map->owner
struct to the values of the first program being checked against, and
rejecting any subsequent programs if the values don't match.
One of the values being set in the map owner struct is the program type,
and since the code did not resolve the prog type for fext programs, the map
owner type would be set to PROG_TYPE_EXT and subsequent loading of programs
of the target type into the map would fail.
This bug is seen in particular for XDP programs that are loaded as
PROG_TYPE_EXT using libxdp; these cannot insert programs into devmaps and
cpumaps because the check fails as described above.
Fix the bug by resolving the fext program type to its target program type
as elsewhere in the verifier.
v3:
- Add Yonghong's ACK
Fixes: f45d5b6ce2e8 ("bpf: generalise tail call map compatibility check")
Acked-by: Yonghong Song <yhs(a)fb.com>
Signed-off-by: Toke Høiland-Jørgensen <toke(a)redhat.com>
Link: https://lore.kernel.org/r/20221214230254.790066-1-toke@redhat.com
Signed-off-by: Martin KaFai Lau <martin.lau(a)kernel.org>
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7f98dec6e90f..b334f4ddc4d5 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2092,6 +2092,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
bool bpf_prog_map_compatible(struct bpf_map *map,
const struct bpf_prog *fp)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(fp);
bool ret;
if (fp->kprobe_override)
@@ -2102,12 +2103,12 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
/* There's no owner yet where we could check for
* compatibility.
*/
- map->owner.type = fp->type;
+ map->owner.type = prog_type;
map->owner.jited = fp->jited;
map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
ret = true;
} else {
- ret = map->owner.type == fp->type &&
+ ret = map->owner.type == prog_type &&
map->owner.jited == fp->jited &&
map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
1c123c567fb1 ("bpf: Resolve fext program type when checking map compatibility")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1c123c567fb138ebd187480b7fc0610fcb0851f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= <toke(a)redhat.com>
Date: Thu, 15 Dec 2022 00:02:53 +0100
Subject: [PATCH] bpf: Resolve fext program type when checking map
compatibility
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The bpf_prog_map_compatible() check makes sure that BPF program types are
not mixed inside BPF map types that can contain programs (tail call maps,
cpumaps and devmaps). It does this by setting the fields of the map->owner
struct to the values of the first program being checked against, and
rejecting any subsequent programs if the values don't match.
One of the values being set in the map owner struct is the program type,
and since the code did not resolve the prog type for fext programs, the map
owner type would be set to PROG_TYPE_EXT and subsequent loading of programs
of the target type into the map would fail.
This bug is seen in particular for XDP programs that are loaded as
PROG_TYPE_EXT using libxdp; these cannot insert programs into devmaps and
cpumaps because the check fails as described above.
Fix the bug by resolving the fext program type to its target program type
as elsewhere in the verifier.
v3:
- Add Yonghong's ACK
Fixes: f45d5b6ce2e8 ("bpf: generalise tail call map compatibility check")
Acked-by: Yonghong Song <yhs(a)fb.com>
Signed-off-by: Toke Høiland-Jørgensen <toke(a)redhat.com>
Link: https://lore.kernel.org/r/20221214230254.790066-1-toke@redhat.com
Signed-off-by: Martin KaFai Lau <martin.lau(a)kernel.org>
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7f98dec6e90f..b334f4ddc4d5 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2092,6 +2092,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
bool bpf_prog_map_compatible(struct bpf_map *map,
const struct bpf_prog *fp)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(fp);
bool ret;
if (fp->kprobe_override)
@@ -2102,12 +2103,12 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
/* There's no owner yet where we could check for
* compatibility.
*/
- map->owner.type = fp->type;
+ map->owner.type = prog_type;
map->owner.jited = fp->jited;
map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
ret = true;
} else {
- ret = map->owner.type == fp->type &&
+ ret = map->owner.type == prog_type &&
map->owner.jited == fp->jited &&
map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
}
tb_retimer_scan() returns error even when on-board retimers are found.
Fixes: 1e56c88adecc ("thunderbolt: Runtime resume USB4 port when retimers are scanned")
Cc: stable(a)vger.kernel.org
Signed-off-by: Utkarsh Patel <utkarsh.h.patel(a)intel.com>
---
Changes in V2:
1. Removed extra line between the Fixes tag and signed-off.
2. Added the tag for stable tree.
---
drivers/thunderbolt/retimer.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/thunderbolt/retimer.c b/drivers/thunderbolt/retimer.c
index 81252e31014a..6ebe7a2886ec 100644
--- a/drivers/thunderbolt/retimer.c
+++ b/drivers/thunderbolt/retimer.c
@@ -471,10 +471,9 @@ int tb_retimer_scan(struct tb_port *port, bool add)
break;
}
- if (!last_idx) {
- ret = 0;
+ ret = 0;
+ if (!last_idx)
goto out;
- }
/* Add on-board retimers if they do not exist already */
for (i = 1; i <= last_idx; i++) {
--
2.25.1
Hi Greg,
On Tue, Dec 27, 2022 at 04:00:37PM +0800, Ming Lei wrote:
> Hi Changhui,
>
> On Mon, Dec 26, 2022 at 11:11:44AM +0800, Changhui Zhong wrote:
> > Hello,
> > Below issue was triggered with ( v6.0.15-996-g988abd970566), pls help check it
>
> There isn't commit 988abd970566 in linux-6.0.y, so I guess the above
> build must integrate other patches not in 6.0.y
>
> From the source code in cki build[1], looks commit 80bd4a7aab4c ("blk-mq: move
> the srcu_struct used for quiescing to the tagset") is included, but
> commit 8537380bb988 ("blk-mq: skip non-mq queues in blk_mq_quiesce_queue")
> is missed, that is why this panic is triggered.
I just found that the patch blk-mq-move-the-srcu_struct-used-for-quiescing-to-th.patch
is queued in stable-queue/queue-6.0, but that patch depends on
commit 8537380bb988 ("blk-mq: skip non-mq queues in blk_mq_quiesce_queue"),
which needs to be added to queue-6.0 too.
Thanks,
Ming
The result of __dev_get_by_index() is not checked for NULL, and is then
passed to mlx5e_attach_encap(), where it gets dereferenced.
This patch backports commit <fe7738e> to correct the issue in 5.10 branch.
Found by Linux Verification Center (linuxtesting.org) with SVACE.
On Sat, Dec 24, 2022 at 7:48 AM Andreas Ziegler <br015(a)umbiko.net> wrote:
>
> -- Observed in, but not limited to, Linux 6.1.1
Wait, "but not limited to"? What does that mean? Are there more
versions affected?
-- Slade
The coreboot_table driver registers a coreboot bus while probing a
"coreboot_table" device representing the coreboot table memory region.
Probing this device (i.e., registering the bus) is a dependency for the
module_init() functions of any driver for this bus (e.g.,
memconsole-coreboot.c / memconsole_driver_init()).
With synchronous probe, this dependency works OK, as the link order in
the Makefile ensures coreboot_table_driver_init() (and thus,
coreboot_table_probe()) completes before a coreboot device driver tries
to add itself to the bus.
With asynchronous probe, however, coreboot_table_probe() may race with
memconsole_driver_init(), and so we're liable to hit one of these two:
1. coreboot_driver_register() eventually hits "[...] the bus was not
initialized.", and the memconsole driver fails to register; or
2. coreboot_driver_register() gets past #1, but still races with
bus_register() and hits some other undefined/crashing behavior (e.g.,
in driver_find() [1])
We can resolve this by registering the bus in our initcall, and only
deferring "device" work (scanning the coreboot memory region and
creating sub-devices) to probe().
[1] Example failure, using 'driver_async_probe=*' kernel command line:
[ 0.114217] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000010
...
[ 0.114307] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 6.1.0-rc1 #63
[ 0.114316] Hardware name: Google Scarlet (DT)
...
[ 0.114488] Call trace:
[ 0.114494] _raw_spin_lock+0x34/0x60
[ 0.114502] kset_find_obj+0x28/0x84
[ 0.114511] driver_find+0x30/0x50
[ 0.114520] driver_register+0x64/0x10c
[ 0.114528] coreboot_driver_register+0x30/0x3c
[ 0.114540] memconsole_driver_init+0x24/0x30
[ 0.114550] do_one_initcall+0x154/0x2e0
[ 0.114560] do_initcall_level+0x134/0x160
[ 0.114571] do_initcalls+0x60/0xa0
[ 0.114579] do_basic_setup+0x28/0x34
[ 0.114588] kernel_init_freeable+0xf8/0x150
[ 0.114596] kernel_init+0x2c/0x12c
[ 0.114607] ret_from_fork+0x10/0x20
[ 0.114624] Code: 5280002b 1100054a b900092a f9800011 (885ffc01)
[ 0.114631] ---[ end trace 0000000000000000 ]---
Fixes: b81e3140e412 ("firmware: coreboot: Make bus registration symmetric")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Brian Norris <briannorris(a)chromium.org>
---
Currently, get_maintainers.pl tells me Greg should pick this up. But I
CC the chrome-platform list too, since it seems reasonable for Google
folks (probably ChromeOS folks are most active here?) to maintain
Google/Chrome drivers.
Let me know if y'all would like this official, and I'll push out a
MAINTAINERS patch.
drivers/firmware/google/coreboot_table.c | 37 +++++++++++++++++++-----
1 file changed, 29 insertions(+), 8 deletions(-)
diff --git a/drivers/firmware/google/coreboot_table.c b/drivers/firmware/google/coreboot_table.c
index c52bcaa9def6..9ca21feb9d45 100644
--- a/drivers/firmware/google/coreboot_table.c
+++ b/drivers/firmware/google/coreboot_table.c
@@ -149,12 +149,8 @@ static int coreboot_table_probe(struct platform_device *pdev)
if (!ptr)
return -ENOMEM;
- ret = bus_register(&coreboot_bus_type);
- if (!ret) {
- ret = coreboot_table_populate(dev, ptr);
- if (ret)
- bus_unregister(&coreboot_bus_type);
- }
+ ret = coreboot_table_populate(dev, ptr);
+
memunmap(ptr);
return ret;
@@ -169,7 +165,6 @@ static int __cb_dev_unregister(struct device *dev, void *dummy)
static int coreboot_table_remove(struct platform_device *pdev)
{
bus_for_each_dev(&coreboot_bus_type, NULL, NULL, __cb_dev_unregister);
- bus_unregister(&coreboot_bus_type);
return 0;
}
@@ -199,6 +194,32 @@ static struct platform_driver coreboot_table_driver = {
.of_match_table = of_match_ptr(coreboot_of_match),
},
};
-module_platform_driver(coreboot_table_driver);
+
+static int __init coreboot_table_driver_init(void)
+{
+ int ret;
+
+ ret = bus_register(&coreboot_bus_type);
+ if (ret)
+ return ret;
+
+ ret = platform_driver_register(&coreboot_table_driver);
+ if (ret) {
+ bus_unregister(&coreboot_bus_type);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __exit coreboot_table_driver_exit(void)
+{
+ platform_driver_unregister(&coreboot_table_driver);
+ bus_unregister(&coreboot_bus_type);
+}
+
+module_init(coreboot_table_driver_init);
+module_exit(coreboot_table_driver_exit);
+
MODULE_AUTHOR("Google, Inc.");
MODULE_LICENSE("GPL");
--
2.38.0.413.g74048e4d9e-goog
Hi Sasha,
On Sun, Dec 25, 2022 at 10:14:37AM -0500, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> net: dpaa2: publish MAC stringset to ethtool -S even if MAC is missing
>
> to the 5.15-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> net-dpaa2-publish-mac-stringset-to-ethtool-s-even-if.patch
> and it can be found in the queue-5.15 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
Didn't we just discuss that this patch should be dropped from the stable
queues for 5.15, 6.0 and 6.1, and didn't you just say that you'll drop it?
https://lore.kernel.org/netdev/Y6ZH4YCuBSiPDMNd@sashalap/
From: "Tyler Hicks" <code(a)tyhicks.com>
The backport of commit 05c2224d4b04 ("KVM: selftests: Fix number of
pages for memory slot in memslot_modification_stress_test") broke the
build of the KVM selftest memslot_modification_stress_test.c source file
in two ways:
- Incorrectly assumed that max_t() was defined despite commit
5cf67a6051ea ("tools/include: Add _RET_IP_ and math definitions to
kernel.h") not being present
- Incorrectly assumed that kvm_vm struct members could be directly
accessed despite b530eba14c70 ("KVM: selftests: Get rid of
kvm_util_internal.h") not being present
Backport the first commit, as it is simple enough. Work around the lack
of the second commit by using the accessors to get to the kvm_vm struct
members.
Note that the linux-6.0.y backport of commit 05c2224d4b04 ("KVM:
selftests: Fix number of pages for memory slot in
memslot_modification_stress_test") is fine because the two prerequisite
commits, mentioned above, are both present in v6.0.
Tyler
Karolina Drobnik (1):
tools/include: Add _RET_IP_ and math definitions to kernel.h
Tyler Hicks (Microsoft) (1):
KVM: selftests: Fix build regression by using accessor function
tools/include/linux/kernel.h | 6 ++++++
.../selftests/kvm/memslot_modification_stress_test.c | 2 +-
2 files changed, 7 insertions(+), 1 deletion(-)
--
2.34.1
The patch titled
Subject: mm/shmem: restore SHMEM_HUGE_DENY precedence over MADV_COLLAPSE
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-shmem-restore-shmem_huge_deny-precedence-over-madv_collapse.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "Zach O'Keefe" <zokeefe(a)google.com>
Subject: mm/shmem: restore SHMEM_HUGE_DENY precedence over MADV_COLLAPSE
Date: Sat, 24 Dec 2022 00:20:35 -0800
SHMEM_HUGE_DENY is for emergency use by the admin, to disable allocation
of shmem huge pages if, for example, a dangerous bug is found in their
usage: see "deny" in Documentation/mm/transhuge.rst. An app using
madvise(,,MADV_COLLAPSE) should not be allowed to override it: restore its
precedence over shmem_huge_force.
Restore SHMEM_HUGE_DENY precedence over MADV_COLLAPSE.
Link: https://lkml.kernel.org/r/20221224082035.3197140-2-zokeefe@google.com
Fixes: 7c6c6cc4d3a2 ("mm/shmem: add flag to enforce shmem THP in hugepage_vma_check()")
Signed-off-by: Zach O'Keefe <zokeefe(a)google.com>
Suggested-by: Hugh Dickins <hughd(a)google.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
--- a/mm/shmem.c~mm-shmem-restore-shmem_huge_deny-precedence-over-madv_collapse
+++ a/mm/shmem.c
@@ -478,12 +478,10 @@ bool shmem_is_huge(struct vm_area_struct
if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) ||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
return false;
- if (shmem_huge_force)
- return true;
- if (shmem_huge == SHMEM_HUGE_FORCE)
- return true;
if (shmem_huge == SHMEM_HUGE_DENY)
return false;
+ if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
+ return true;
switch (SHMEM_SB(inode->i_sb)->huge) {
case SHMEM_HUGE_ALWAYS:
_
Patches currently in -mm which might be from zokeefe(a)google.com are
mm-madv_collapse-dont-expand-collapse-when-vm_end-is-past-requested-end.patch
mm-shmem-restore-shmem_huge_deny-precedence-over-madv_collapse.patch
The patch titled
Subject: mm/MADV_COLLAPSE: don't expand collapse when vm_end is past requested end
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-madv_collapse-dont-expand-collapse-when-vm_end-is-past-requested-end.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "Zach O'Keefe" <zokeefe(a)google.com>
Subject: mm/MADV_COLLAPSE: don't expand collapse when vm_end is past requested end
Date: Sat, 24 Dec 2022 00:20:34 -0800
MADV_COLLAPSE acts on one hugepage-aligned/sized region at a time, until
it has collapsed all eligible memory contained within the bounds supplied
by the user.
At the top of each hugepage iteration we (re)lock mmap_lock and
(re)validate the VMA for eligibility and update variables that might have
changed while mmap_lock was dropped. One thing that might occur is that
the VMA could be resized, and as such, we refetch vma->vm_end to make sure
we don't collapse past the end of the VMA's new end.
However, it's possible that when refetching vma->vm_end that we expand the
region acted on by MADV_COLLAPSE if vma->vm_end is greater than size+len
supplied by the user.
The consequence here is that we may attempt to collapse more memory than
requested, possibly yielding either "too much success" or "false failure"
user-visible results. An example of the former is if we MADV_COLLAPSE the
first 4MiB of a 2TiB mmap()'d file, the incorrect refetch would cause the
operation to block for much longer than anticipated as we attempt to
collapse the entire TiB region. An example of the latter is that applying
MADV_COLLAPSE to a 4MiB file mapped to the start of a 6MiB VMA will
successfully collapse the first 4MiB, then incorrectly attempt to collapse
the last hugepage-aligned/sized region -- fail (since readahead/page cache
lookup will fail) -- and report a failure to the user.
Don't expand the acted-on region when refetching vma->vm_end.
Link: https://lkml.kernel.org/r/20221224082035.3197140-1-zokeefe@google.com
Fixes: 4d24de9425f7 ("mm: MADV_COLLAPSE: refetch vm_end after reacquiring mmap_lock")
Signed-off-by: Zach O'Keefe <zokeefe(a)google.com>
Reported-by: Hugh Dickins <hughd(a)google.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
--- a/mm/khugepaged.c~mm-madv_collapse-dont-expand-collapse-when-vm_end-is-past-requested-end
+++ a/mm/khugepaged.c
@@ -2647,7 +2647,7 @@ int madvise_collapse(struct vm_area_stru
goto out_nolock;
}
- hend = vma->vm_end & HPAGE_PMD_MASK;
+ hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
}
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
_
Patches currently in -mm which might be from zokeefe(a)google.com are
mm-madv_collapse-dont-expand-collapse-when-vm_end-is-past-requested-end.patch
mm-shmem-restore-shmem_huge_deny-precedence-over-madv_collapse.patch
A recent development on the EFI front has resulted in guests having
their page tables baked in the firmware binary, and mapped into
the IPA space as part of a read-only memslot.
Not only is this legitimate, but it also results in added security,
so thumbs up. However, this clashes mildly with our handling of a S1PTW
as a write to correctly handle AF/DB updates to the S1 PTs, and results
in the guest taking an abort it won't recover from (the PTs mapping the
vectors will suffer from the same problem...).
So clearly our handling is... wrong.
Instead, switch to a two-pronged approach:
- On S1PTW translation fault, handle the fault as a read
- On S1PTW permission fault, handle the fault as a write
This is of no consequence to SW that *writes* to its PTs (the write
will trigger a non-S1PTW fault), and SW that uses RO PTs will not
use AF/DB anyway, as that'd be wrong.
Only in the case described in c4ad98e4b72c ("KVM: arm64: Assume write
fault on S1PTW permission fault on instruction fetch") do we end-up
with two back-to-back faults (page being evicted and faulted back).
I don't think this is a case worth optimising for.
Fixes: c4ad98e4b72c ("KVM: arm64: Assume write fault on S1PTW permission fault on instruction fetch")
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
arch/arm64/include/asm/kvm_emulate.h | 22 ++++++++++++++++++++--
1 file changed, 20 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 9bdba47f7e14..fd6ad8b21f85 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -373,8 +373,26 @@ static __always_inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu)
static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
{
- if (kvm_vcpu_abt_iss1tw(vcpu))
- return true;
+ if (kvm_vcpu_abt_iss1tw(vcpu)) {
+ /*
+ * Only a permission fault on a S1PTW should be
+ * considered as a write. Otherwise, page tables baked
+ * in a read-only memslot will result in an exception
+ * being delivered in the guest.
+ *
+ * The drawback is that we end-up fauling twice if the
+ * guest is using any of HW AF/DB: a translation fault
+ * to map the page containing the PT (read only at
+ * first), then a permission fault to allow the flags
+ * to be set.
+ */
+ switch (kvm_vcpu_trap_get_fault_type(vcpu)) {
+ case ESR_ELx_FSC_PERM:
+ return true;
+ default:
+ return false;
+ }
+ }
if (kvm_vcpu_trap_is_iabt(vcpu))
return false;
--
2.34.1
The following changes since commit 830b3c68c1fb1e9176028d02ef86f3cf76aa2476:
Linux 6.1 (2022-12-11 14:15:18 -0800)
are available in the Git repository at:
https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git tags/for_linus
for you to fetch changes up to 98dd6b2ef50d6f7876606a86c8d8a767c9fef6f5:
virtio_blk: mark all zone fields LE (2022-12-22 14:32:36 -0500)
Note: merging this upstream results in a conflict
between commit:
de4eda9de2d9 ("use less confusing names for iov_iter direction initializers")
from Linus' tree and commit:
("virtio/vsock: replace virtio_vsock_pkt with sk_buff")
from this tree.
This resolution below fixes it up, due to Stephen Rothwell
diff --cc drivers/vhost/vsock.c
index cd6f7776013a,830bc823addc..000000000000
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@@ -165,8 -157,9 +157,9 @@@ vhost_transport_do_send_pkt(struct vhos
break;
}
- iov_iter_init(&iov_iter, READ, &vq->iov[out], in, iov_len);
+ iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[out], in, iov_len);
- payload_len = pkt->len - pkt->off;
+ payload_len = skb->len;
+ hdr = virtio_vsock_hdr(skb);
/* If the packet is greater than the space available in the
* buffer, we split it using multiple buffers.
@@@ -366,18 -340,21 +340,22 @@@ vhost_vsock_alloc_skb(struct vhost_virt
return NULL;
}
- pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
- if (!pkt)
+ len = iov_length(vq->iov, out);
+
+ /* len contains both payload and hdr */
+ skb = virtio_vsock_alloc_skb(len, GFP_KERNEL);
+ if (!skb)
return NULL;
- iov_iter_init(&iov_iter, WRITE, vq->iov, out, len);
+ len = iov_length(vq->iov, out);
+ iov_iter_init(&iov_iter, ITER_SOURCE, vq->iov, out, len);
- nbytes = copy_from_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
- if (nbytes != sizeof(pkt->hdr)) {
+ hdr = virtio_vsock_hdr(skb);
+ nbytes = copy_from_iter(hdr, sizeof(*hdr), &iov_iter);
+ if (nbytes != sizeof(*hdr)) {
vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n",
- sizeof(pkt->hdr), nbytes);
- kfree(pkt);
+ sizeof(*hdr), nbytes);
+ kfree_skb(skb);
return NULL;
}
It can also be found in linux-next, see next-20221220.
----------------------------------------------------------------
virtio,vhost,vdpa: features, fixes, cleanups
zoned block device support
lifetime stats support (for virtio devices backed by memory supporting that)
vsock rework to use skbuffs
ifcvf features provisioning
new SolidNET DPU driver
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
----------------------------------------------------------------
Alvaro Karsz (5):
Add SolidRun vendor id
New PCI quirk for SolidRun SNET DPU.
virtio: vdpa: new SolidNET DPU driver.
virtio_blk: add VIRTIO_BLK_F_LIFETIME feature support
virtio: vdpa: fix snprintf size argument in snet_vdpa driver
Angus Chen (2):
virtio_pci: modify ENOENT to EINVAL
virtio_blk: use UINT_MAX instead of -1U
Bobby Eshleman (1):
virtio/vsock: replace virtio_vsock_pkt with sk_buff
Cindy Lu (2):
vhost_vdpa: fix the crash in unmap a large memory
vdpa_sim_net: should not drop the multicast/broadcast packet
Colin Ian King (1):
RDMA/mlx5: remove variable i
Davidlohr Bueso (2):
tools/virtio: remove stray characters
tools/virtio: remove smp_read_barrier_depends()
Dawei Li (1):
virtio: Implementing attribute show with sysfs_emit
Dmitry Fomichev (2):
virtio-blk: use a helper to handle request queuing errors
virtio-blk: add support for zoned block devices
Eli Cohen (8):
vdpa/mlx5: Fix rule forwarding VLAN to TIR
vdpa/mlx5: Return error on vlan ctrl commands if not supported
vdpa/mlx5: Fix wrong mac address deletion
vdpa/mlx5: Avoid using reslock in event_handler
vdpa/mlx5: Avoid overwriting CVQ iotlb
vdpa/mlx5: Move some definitions to a new header file
vdpa/mlx5: Add debugfs subtree
vdpa/mlx5: Add RX counters to debugfs
Eugenio Pérez (1):
vdpa_sim_net: Offer VIRTIO_NET_F_STATUS
Harshit Mogalapalli (1):
vduse: Validate vq_num in vduse_validate_config()
Jason Wang (2):
vdpa: conditionally fill max max queue pair for stats
vdpasim: fix memory leak when freeing IOTLBs
Michael S. Tsirkin (3):
virtio_blk: temporary variable type tweak
virtio_blk: zone append in header type tweak
virtio_blk: mark all zone fields LE
Michael Sammler (1):
virtio_pmem: populate numa information
Rafael Mendonca (1):
virtio_blk: Fix signedness bug in virtblk_prep_rq()
Ricardo Cañuelo (2):
tools/virtio: initialize spinlocks in vring_test.c
docs: driver-api: virtio: virtio on Linux
Rong Wang (1):
vdpa/vp_vdpa: fix kfree a wrong pointer in vp_vdpa_remove
Shaomin Deng (1):
tools: Delete the unneeded semicolon after curly braces
Shaoqin Huang (2):
virtio_pci: use helper function is_power_of_2()
virtio_ring: use helper function is_power_of_2()
Si-Wei Liu (1):
vdpa: merge functionally duplicated dev_features attributes
Stefano Garzarella (4):
vringh: fix range used in iotlb_translate()
vhost: fix range used in translate_desc()
vhost-vdpa: fix an iotlb memory leak
vdpa_sim: fix vringh initialization in vdpasim_queue_ready()
Wei Yongjun (1):
virtio-crypto: fix memory leak in virtio_crypto_alg_skcipher_close_session()
Yuan Can (1):
vhost/vsock: Fix error handling in vhost_vsock_init()
Zhu Lingshan (12):
vDPA/ifcvf: decouple hw features manipulators from the adapter
vDPA/ifcvf: decouple config space ops from the adapter
vDPA/ifcvf: alloc the mgmt_dev before the adapter
vDPA/ifcvf: decouple vq IRQ releasers from the adapter
vDPA/ifcvf: decouple config IRQ releaser from the adapter
vDPA/ifcvf: decouple vq irq requester from the adapter
vDPA/ifcvf: decouple config/dev IRQ requester and vectors allocator from the adapter
vDPA/ifcvf: ifcvf_request_irq works on ifcvf_hw
vDPA/ifcvf: manage ifcvf_hw in the mgmt_dev
vDPA/ifcvf: allocate the adapter in dev_add()
vDPA/ifcvf: retire ifcvf_private_to_vf
vDPA/ifcvf: implement features provisioning
ruanjinjie (1):
vdpa_sim: fix possible memory leak in vdpasim_net_init() and vdpasim_blk_init()
wangjianli (1):
tools/virtio: Variable type completion
Documentation/driver-api/index.rst | 1 +
Documentation/driver-api/virtio/index.rst | 11 +
Documentation/driver-api/virtio/virtio.rst | 144 +++
.../driver-api/virtio/writing_virtio_drivers.rst | 197 ++++
MAINTAINERS | 6 +
drivers/block/virtio_blk.c | 522 ++++++++-
.../crypto/virtio/virtio_crypto_skcipher_algs.c | 3 +-
drivers/nvdimm/virtio_pmem.c | 11 +-
drivers/pci/quirks.c | 8 +
drivers/vdpa/Kconfig | 22 +
drivers/vdpa/Makefile | 1 +
drivers/vdpa/ifcvf/ifcvf_base.c | 32 +-
drivers/vdpa/ifcvf/ifcvf_base.h | 10 +-
drivers/vdpa/ifcvf/ifcvf_main.c | 162 ++-
drivers/vdpa/mlx5/Makefile | 2 +-
drivers/vdpa/mlx5/core/mlx5_vdpa.h | 5 +-
drivers/vdpa/mlx5/core/mr.c | 46 +-
drivers/vdpa/mlx5/net/debug.c | 152 +++
drivers/vdpa/mlx5/net/mlx5_vnet.c | 252 +++--
drivers/vdpa/mlx5/net/mlx5_vnet.h | 94 ++
drivers/vdpa/solidrun/Makefile | 6 +
drivers/vdpa/solidrun/snet_hwmon.c | 188 ++++
drivers/vdpa/solidrun/snet_main.c | 1111 ++++++++++++++++++++
drivers/vdpa/solidrun/snet_vdpa.h | 196 ++++
drivers/vdpa/vdpa.c | 11 +-
drivers/vdpa/vdpa_sim/vdpa_sim.c | 7 +-
drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 4 +-
drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 8 +-
drivers/vdpa/vdpa_user/vduse_dev.c | 3 +
drivers/vdpa/virtio_pci/vp_vdpa.c | 2 +-
drivers/vhost/vdpa.c | 52 +-
drivers/vhost/vhost.c | 4 +-
drivers/vhost/vringh.c | 5 +-
drivers/vhost/vsock.c | 224 ++--
drivers/virtio/virtio.c | 12 +-
drivers/virtio/virtio_pci_modern.c | 4 +-
drivers/virtio/virtio_ring.c | 2 +-
include/linux/pci_ids.h | 2 +
include/linux/virtio_config.h | 8 +-
include/linux/virtio_vsock.h | 126 ++-
include/uapi/linux/vdpa.h | 4 +-
include/uapi/linux/virtio_blk.h | 133 +++
include/uapi/linux/virtio_blk_ioctl.h | 44 +
net/vmw_vsock/virtio_transport.c | 149 +--
net/vmw_vsock/virtio_transport_common.c | 420 ++++----
net/vmw_vsock/vsock_loopback.c | 51 +-
tools/virtio/ringtest/main.h | 37 +-
tools/virtio/virtio-trace/trace-agent-ctl.c | 2 +-
tools/virtio/virtio_test.c | 2 +-
tools/virtio/vringh_test.c | 2 +
50 files changed, 3661 insertions(+), 839 deletions(-)
create mode 100644 Documentation/driver-api/virtio/index.rst
create mode 100644 Documentation/driver-api/virtio/virtio.rst
create mode 100644 Documentation/driver-api/virtio/writing_virtio_drivers.rst
create mode 100644 drivers/vdpa/mlx5/net/debug.c
create mode 100644 drivers/vdpa/mlx5/net/mlx5_vnet.h
create mode 100644 drivers/vdpa/solidrun/Makefile
create mode 100644 drivers/vdpa/solidrun/snet_hwmon.c
create mode 100644 drivers/vdpa/solidrun/snet_main.c
create mode 100644 drivers/vdpa/solidrun/snet_vdpa.h
create mode 100644 include/uapi/linux/virtio_blk_ioctl.h
MADV_COLLAPSE acts on one hugepage-aligned/sized region at a time, until
it has collapsed all eligible memory contained within the bounds
supplied by the user.
At the top of each hugepage iteration we (re)lock mmap_lock and
(re)validate the VMA for eligibility and update variables that might
have changed while mmap_lock was dropped. One thing that might occur,
is that the VMA could be resized, and as such, we refetch vma->vm_end
to make sure we don't collapse past the end of the VMA's new end.
However, it's possible that when refetching vma->vm_end that we expand the
region acted on by MADV_COLLAPSE if vma->vm_end is greater than size+len
supplied by the user.
The consequence here is that we may attempt to collapse more memory than
requested, possibly yielding either "too much success" or "false
failure" user-visible results. An example of the former is if we
MADV_COLLAPSE the first 4MiB of a 2TiB mmap()'d file, the incorrect
refetch would cause the operation to block for much longer than
anticipated as we attempt to collapse the entire TiB region. An example
of the latter is that applying MADV_COLLAPSE to a 4MiB file mapped to the
start of a 6MiB VMA will successfully collapse the first 4MiB, then
incorrectly attempt to collapse the last hugepage-aligned/sized region
-- fail (since readahead/page cache lookup will fail) -- and report a
failure to the user.
Don't expand the acted-on region when refetching vma->vm_end.
Fixes: 4d24de9425f7 ("mm: MADV_COLLAPSE: refetch vm_end after reacquiring mmap_lock")
Reported-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Zach O'Keefe <zokeefe(a)google.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: stable(a)vger.kernel.org
---
v2->v3: Add 'Cc: stable(a)vger.kernel.org' as per stable-kernel-rules.
v1->v2: Updated changelog to make clear what user-visible issues this
patch addresses, as well as to make the case for backporting (Andrew
Morton).
While there aren't any stability risks, without this patch there exist
trivial examples where MADV_COLLAPSE won't work; as such, this should be
backported to stable 6.1.X to make MADV_COLLAPSE dependable in such
cases.
v1: https://lore.kernel.org/linux-mm/CAAa6QmRx_b2UCJWE2XZ3=3c3-_N3R4cDGX6Wm4OT7…
---
mm/khugepaged.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 5cb401aa2b9d..b4d2ec0a94ed 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2649,7 +2649,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
goto out_nolock;
}
- hend = vma->vm_end & HPAGE_PMD_MASK;
+ hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
}
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
--
2.39.0.314.g84b9a713c41-goog
MADV_COLLAPSE acts on one hugepage-aligned/sized region at a time, until
it has collapsed all eligible memory contained within the bounds
supplied by the user.
At the top of each hugepage iteration we (re)lock mmap_lock and
(re)validate the VMA for eligibility and update variables that might
have changed while mmap_lock was dropped. One thing that might occur,
is that the VMA could be resized, and as such, we refetch vma->vm_end
to make sure we don't collapse past the end of the VMA's new end.
However, it's possible that when refetching vma->vm_end that we expand the
region acted on by MADV_COLLAPSE if vma->vm_end is greater than size+len
supplied by the user.
The consequence here is that we may attempt to collapse more memory than
requested, possibly yielding either "too much success" or "false
failure" user-visible results. An example of the former is if we
MADV_COLLAPSE the first 4MiB of a 2TiB mmap()'d file, the incorrect
refetch would cause the operation to block for much longer than
anticipated as we attempt to collapse the entire TiB region. An example
of the latter is that applying MADV_COLLAPSE to a 4MiB file mapped to the
start of a 6MiB VMA will successfully collapse the first 4MiB, then
incorrectly attempt to collapse the last hugepage-aligned/sized region
-- fail (since readahead/page cache lookup will fail) -- and report a
failure to the user.
Don't expand the acted-on region when refetching vma->vm_end.
Fixes: 4d24de9425f7 ("mm: MADV_COLLAPSE: refetch vm_end after reacquiring mmap_lock")
Reported-by: Hugh Dickins <hughd(a)google.com>
Signed-off-by: Zach O'Keefe <zokeefe(a)google.com>
Cc: Yang Shi <shy828301(a)gmail.com>
---
v1->v2 : Updated changelog to make clear what user-visible issues this
patch addresses, as well as to make the case for backporting (Andrew
Morton).
While there aren't any stability risks, without this patch there exist
trivial examples where MADV_COLLAPSE won't work; as such, this should be
backported to stable 6.1.X to make MADV_COLLAPSE dependable in such
cases.
v1: https://lore.kernel.org/linux-mm/CAAa6QmRx_b2UCJWE2XZ3=3c3-_N3R4cDGX6Wm4OT7…
---
mm/khugepaged.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 5cb401aa2b9d..b4d2ec0a94ed 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2649,7 +2649,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
goto out_nolock;
}
- hend = vma->vm_end & HPAGE_PMD_MASK;
+ hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
}
mmap_assert_locked(mm);
memset(cc->node_load, 0, sizeof(cc->node_load));
--
2.39.0.314.g84b9a713c41-goog
From: Yang Yingliang <yangyingliang(a)huawei.com>
[ Upstream commit 1662cea4623f75d8251adf07370bbaa958f0355d ]
Inject fault while loading module, kset_register() may fail.
If it fails, the kset.kobj.name allocated by kobject_set_name()
which must be called before a call to kset_register() may be
leaked, since refcount of kobj was set in kset_init().
To mitigate this, we free the name in kset_register() when an
error is encountered, i.e. when kset_register() returns an error.
A kset may be embedded in a larger structure which may be dynamically
allocated in callers, it needs to be freed in ktype.release() or error
path in callers, in this case, we can not call kset_put() in kset_register(),
or it will cause double free, so just call kfree_const() to free the
name and set it to NULL to avoid accessing bad pointer in callers.
With this fix, the callers don't need to care about freeing the name
and may call kset_put() if kset_register() fails.
Suggested-by: Luben Tuikov <luben.tuikov(a)amd.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
Reviewed-by: <luben.tuikov(a)amd.com>
Link: https://lore.kernel.org/r/20221025071549.1280528-1-yangyingliang@huawei.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
lib/kobject.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/lib/kobject.c b/lib/kobject.c
index bbbb067de8ec..be01d49abb62 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -806,6 +806,9 @@ EXPORT_SYMBOL_GPL(kobj_sysfs_ops);
/**
* kset_register - initialize and add a kset.
* @k: kset.
+ *
+ * NOTE: On error, the kset.kobj.name allocated by() kobj_set_name()
+ * is freed, it can not be used any more.
*/
int kset_register(struct kset *k)
{
@@ -816,8 +819,12 @@ int kset_register(struct kset *k)
kset_init(k);
err = kobject_add_internal(&k->kobj);
- if (err)
+ if (err) {
+ kfree_const(k->kobj.name);
+ /* Set it to NULL to avoid accessing bad pointer in callers. */
+ k->kobj.name = NULL;
return err;
+ }
kobject_uevent(&k->kobj, KOBJ_ADD);
return 0;
}
--
2.35.1
From: Yang Yingliang <yangyingliang(a)huawei.com>
[ Upstream commit 1662cea4623f75d8251adf07370bbaa958f0355d ]
Inject fault while loading module, kset_register() may fail.
If it fails, the kset.kobj.name allocated by kobject_set_name()
which must be called before a call to kset_register() may be
leaked, since refcount of kobj was set in kset_init().
To mitigate this, we free the name in kset_register() when an
error is encountered, i.e. when kset_register() returns an error.
A kset may be embedded in a larger structure which may be dynamically
allocated in callers, it needs to be freed in ktype.release() or error
path in callers, in this case, we can not call kset_put() in kset_register(),
or it will cause double free, so just call kfree_const() to free the
name and set it to NULL to avoid accessing bad pointer in callers.
With this fix, the callers don't need to care about freeing the name
and may call kset_put() if kset_register() fails.
Suggested-by: Luben Tuikov <luben.tuikov(a)amd.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
Reviewed-by: <luben.tuikov(a)amd.com>
Link: https://lore.kernel.org/r/20221025071549.1280528-1-yangyingliang@huawei.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
lib/kobject.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/lib/kobject.c b/lib/kobject.c
index bbbb067de8ec..be01d49abb62 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -806,6 +806,9 @@ EXPORT_SYMBOL_GPL(kobj_sysfs_ops);
/**
* kset_register - initialize and add a kset.
* @k: kset.
+ *
+ * NOTE: On error, the kset.kobj.name allocated by() kobj_set_name()
+ * is freed, it can not be used any more.
*/
int kset_register(struct kset *k)
{
@@ -816,8 +819,12 @@ int kset_register(struct kset *k)
kset_init(k);
err = kobject_add_internal(&k->kobj);
- if (err)
+ if (err) {
+ kfree_const(k->kobj.name);
+ /* Set it to NULL to avoid accessing bad pointer in callers. */
+ k->kobj.name = NULL;
return err;
+ }
kobject_uevent(&k->kobj, KOBJ_ADD);
return 0;
}
--
2.35.1
From: Yang Yingliang <yangyingliang(a)huawei.com>
[ Upstream commit 1662cea4623f75d8251adf07370bbaa958f0355d ]
Inject fault while loading module, kset_register() may fail.
If it fails, the kset.kobj.name allocated by kobject_set_name()
which must be called before a call to kset_register() may be
leaked, since refcount of kobj was set in kset_init().
To mitigate this, we free the name in kset_register() when an
error is encountered, i.e. when kset_register() returns an error.
A kset may be embedded in a larger structure which may be dynamically
allocated in callers, it needs to be freed in ktype.release() or error
path in callers, in this case, we can not call kset_put() in kset_register(),
or it will cause double free, so just call kfree_const() to free the
name and set it to NULL to avoid accessing bad pointer in callers.
With this fix, the callers don't need to care about freeing the name
and may call kset_put() if kset_register() fails.
Suggested-by: Luben Tuikov <luben.tuikov(a)amd.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
Reviewed-by: <luben.tuikov(a)amd.com>
Link: https://lore.kernel.org/r/20221025071549.1280528-1-yangyingliang@huawei.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
lib/kobject.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/lib/kobject.c b/lib/kobject.c
index 97d86dc17c42..1eb1230a2d28 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -821,6 +821,9 @@ EXPORT_SYMBOL_GPL(kobj_sysfs_ops);
/**
* kset_register - initialize and add a kset.
* @k: kset.
+ *
+ * NOTE: On error, the kset.kobj.name allocated by() kobj_set_name()
+ * is freed, it can not be used any more.
*/
int kset_register(struct kset *k)
{
@@ -831,8 +834,12 @@ int kset_register(struct kset *k)
kset_init(k);
err = kobject_add_internal(&k->kobj);
- if (err)
+ if (err) {
+ kfree_const(k->kobj.name);
+ /* Set it to NULL to avoid accessing bad pointer in callers. */
+ k->kobj.name = NULL;
return err;
+ }
kobject_uevent(&k->kobj, KOBJ_ADD);
return 0;
}
--
2.35.1
From: Yang Yingliang <yangyingliang(a)huawei.com>
[ Upstream commit 1662cea4623f75d8251adf07370bbaa958f0355d ]
Inject fault while loading module, kset_register() may fail.
If it fails, the kset.kobj.name allocated by kobject_set_name()
which must be called before a call to kset_register() may be
leaked, since refcount of kobj was set in kset_init().
To mitigate this, we free the name in kset_register() when an
error is encountered, i.e. when kset_register() returns an error.
A kset may be embedded in a larger structure which may be dynamically
allocated in callers, it needs to be freed in ktype.release() or error
path in callers, in this case, we can not call kset_put() in kset_register(),
or it will cause double free, so just call kfree_const() to free the
name and set it to NULL to avoid accessing bad pointer in callers.
With this fix, the callers don't need to care about freeing the name
and may call kset_put() if kset_register() fails.
Suggested-by: Luben Tuikov <luben.tuikov(a)amd.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
Reviewed-by: <luben.tuikov(a)amd.com>
Link: https://lore.kernel.org/r/20221025071549.1280528-1-yangyingliang@huawei.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
lib/kobject.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/lib/kobject.c b/lib/kobject.c
index 0c6d17503a11..3ce572d7c26d 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -869,6 +869,9 @@ EXPORT_SYMBOL_GPL(kobj_sysfs_ops);
/**
* kset_register() - Initialize and add a kset.
* @k: kset.
+ *
+ * NOTE: On error, the kset.kobj.name allocated by kobject_set_name()
+ * is freed, it can not be used any more.
*/
int kset_register(struct kset *k)
{
@@ -879,8 +882,12 @@ int kset_register(struct kset *k)
kset_init(k);
err = kobject_add_internal(&k->kobj);
- if (err)
+ if (err) {
+ kfree_const(k->kobj.name);
+ /* Set it to NULL to avoid accessing bad pointer in callers. */
+ k->kobj.name = NULL;
return err;
+ }
kobject_uevent(&k->kobj, KOBJ_ADD);
return 0;
}
--
2.35.1
From: Yang Yingliang <yangyingliang(a)huawei.com>
[ Upstream commit 1662cea4623f75d8251adf07370bbaa958f0355d ]
Inject fault while loading module, kset_register() may fail.
If it fails, the kset.kobj.name allocated by kobject_set_name()
which must be called before a call to kset_register() may be
leaked, since refcount of kobj was set in kset_init().
To mitigate this, we free the name in kset_register() when an
error is encountered, i.e. when kset_register() returns an error.
A kset may be embedded in a larger structure which may be dynamically
allocated in callers, it needs to be freed in ktype.release() or error
path in callers, in this case, we can not call kset_put() in kset_register(),
or it will cause double free, so just call kfree_const() to free the
name and set it to NULL to avoid accessing bad pointer in callers.
With this fix, the callers don't need to care about freeing the name
and may call kset_put() if kset_register() fails.
Suggested-by: Luben Tuikov <luben.tuikov(a)amd.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
Reviewed-by: <luben.tuikov(a)amd.com>
Link: https://lore.kernel.org/r/20221025071549.1280528-1-yangyingliang@huawei.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
lib/kobject.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/lib/kobject.c b/lib/kobject.c
index ea53b30cf483..743e629d60d2 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -866,6 +866,9 @@ EXPORT_SYMBOL_GPL(kobj_sysfs_ops);
/**
* kset_register() - Initialize and add a kset.
* @k: kset.
+ *
+ * NOTE: On error, the kset.kobj.name allocated by kobject_set_name()
+ * is freed, it can not be used any more.
*/
int kset_register(struct kset *k)
{
@@ -876,8 +879,12 @@ int kset_register(struct kset *k)
kset_init(k);
err = kobject_add_internal(&k->kobj);
- if (err)
+ if (err) {
+ kfree_const(k->kobj.name);
+ /* Set it to NULL to avoid accessing bad pointer in callers. */
+ k->kobj.name = NULL;
return err;
+ }
kobject_uevent(&k->kobj, KOBJ_ADD);
return 0;
}
--
2.35.1
From: Yang Yingliang <yangyingliang(a)huawei.com>
[ Upstream commit 1662cea4623f75d8251adf07370bbaa958f0355d ]
Inject fault while loading module, kset_register() may fail.
If it fails, the kset.kobj.name allocated by kobject_set_name()
which must be called before a call to kset_register() may be
leaked, since refcount of kobj was set in kset_init().
To mitigate this, we free the name in kset_register() when an
error is encountered, i.e. when kset_register() returns an error.
A kset may be embedded in a larger structure which may be dynamically
allocated in callers, it needs to be freed in ktype.release() or error
path in callers, in this case, we can not call kset_put() in kset_register(),
or it will cause double free, so just call kfree_const() to free the
name and set it to NULL to avoid accessing bad pointer in callers.
With this fix, the callers don't need to care about freeing the name
and may call kset_put() if kset_register() fails.
Suggested-by: Luben Tuikov <luben.tuikov(a)amd.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
Reviewed-by: <luben.tuikov(a)amd.com>
Link: https://lore.kernel.org/r/20221025071549.1280528-1-yangyingliang@huawei.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
lib/kobject.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/lib/kobject.c b/lib/kobject.c
index ea53b30cf483..743e629d60d2 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -866,6 +866,9 @@ EXPORT_SYMBOL_GPL(kobj_sysfs_ops);
/**
* kset_register() - Initialize and add a kset.
* @k: kset.
+ *
+ * NOTE: On error, the kset.kobj.name allocated by kobject_set_name()
+ * is freed, it can not be used any more.
*/
int kset_register(struct kset *k)
{
@@ -876,8 +879,12 @@ int kset_register(struct kset *k)
kset_init(k);
err = kobject_add_internal(&k->kobj);
- if (err)
+ if (err) {
+ kfree_const(k->kobj.name);
+ /* Set it to NULL to avoid accessing bad pointer in callers. */
+ k->kobj.name = NULL;
return err;
+ }
kobject_uevent(&k->kobj, KOBJ_ADD);
return 0;
}
--
2.35.1
From: Yang Yingliang <yangyingliang(a)huawei.com>
[ Upstream commit 1662cea4623f75d8251adf07370bbaa958f0355d ]
Inject fault while loading module, kset_register() may fail.
If it fails, the kset.kobj.name allocated by kobject_set_name()
which must be called before a call to kset_register() may be
leaked, since refcount of kobj was set in kset_init().
To mitigate this, we free the name in kset_register() when an
error is encountered, i.e. when kset_register() returns an error.
A kset may be embedded in a larger structure which may be dynamically
allocated in callers, it needs to be freed in ktype.release() or error
path in callers, in this case, we can not call kset_put() in kset_register(),
or it will cause double free, so just call kfree_const() to free the
name and set it to NULL to avoid accessing bad pointer in callers.
With this fix, the callers don't need to care about freeing the name
and may call kset_put() if kset_register() fails.
Suggested-by: Luben Tuikov <luben.tuikov(a)amd.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
Reviewed-by: <luben.tuikov(a)amd.com>
Link: https://lore.kernel.org/r/20221025071549.1280528-1-yangyingliang@huawei.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
lib/kobject.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/lib/kobject.c b/lib/kobject.c
index 5f0e71ab292c..0f9cc0b93d99 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -834,6 +834,9 @@ EXPORT_SYMBOL_GPL(kobj_sysfs_ops);
/**
* kset_register() - Initialize and add a kset.
* @k: kset.
+ *
+ * NOTE: On error, the kset.kobj.name allocated by kobject_set_name()
+ * is freed, it can not be used any more.
*/
int kset_register(struct kset *k)
{
@@ -844,8 +847,12 @@ int kset_register(struct kset *k)
kset_init(k);
err = kobject_add_internal(&k->kobj);
- if (err)
+ if (err) {
+ kfree_const(k->kobj.name);
+ /* Set it to NULL to avoid accessing bad pointer in callers. */
+ k->kobj.name = NULL;
return err;
+ }
kobject_uevent(&k->kobj, KOBJ_ADD);
return 0;
}
--
2.35.1
The patch titled
Subject: mm/userfaultfd: enable writenotify while userfaultfd-wp is enabled for a VMA
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-userfaultfd-enable-writenotify-while-userfaultfd-wp-is-enabled-for-a-vma.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: David Hildenbrand <david(a)redhat.com>
Subject: mm/userfaultfd: enable writenotify while userfaultfd-wp is enabled for a VMA
Date: Fri, 9 Dec 2022 09:09:12 +0100
Currently, we don't enable writenotify when enabling userfaultfd-wp on a
shared writable mapping (for now only shmem and hugetlb). The consequence
is that vma->vm_page_prot will still include write permissions, to be set
as default for all PTEs that get remapped (e.g., mprotect(), NUMA hinting,
page migration, ...).
So far, vma->vm_page_prot is assumed to be a safe default, meaning that we
only add permissions (e.g., mkwrite) but not remove permissions (e.g.,
wrprotect). For example, when enabling softdirty tracking, we enable
writenotify. With uffd-wp on shared mappings, that changed. More details
on vma->vm_page_prot semantics were summarized in [1].
This is problematic for uffd-wp: we'd have to manually check for uffd-wp
PTEs/PMDs and manually write-protect PTEs/PMDs, which is error prone.
Prone to such issues is any code that uses vma->vm_page_prot to set PTE
permissions: primarily pte_modify() and mk_pte().
Instead, let's enable writenotify such that PTEs/PMDs/... will be mapped
write-protected as default and we will only allow selected PTEs that are
definitely safe to be mapped without write-protection (see
can_change_pte_writable()) to be writable. In the future, we might want
to enable write-bit recovery -- e.g., can_change_pte_writable() -- at more
locations, for example, also when removing uffd-wp protection.
This fixes two known cases:
(a) remove_migration_pte() mapping uffd-wp'ed PTEs writable, resulting
in uffd-wp not triggering on write access.
(b) do_numa_page() / do_huge_pmd_numa_page() mapping uffd-wp'ed PTEs/PMDs
writable, resulting in uffd-wp not triggering on write access.
Note that do_numa_page() / do_huge_pmd_numa_page() can be reached even
without NUMA hinting (which currently doesn't seem to be applicable to
shmem), for example, by using uffd-wp with a PROT_WRITE shmem VMA. On
such a VMA, userfaultfd-wp is currently non-functional.
Note that when enabling userfaultfd-wp, there is no need to walk page
tables to enforce the new default protection for the PTEs: we know that
they cannot be uffd-wp'ed yet, because that can only happen after enabling
uffd-wp for the VMA in general.
Also note that this makes mprotect() on ranges with uffd-wp'ed PTEs not
accidentally set the write bit -- which would result in uffd-wp not
triggering on later write access. This commit makes uffd-wp on shmem
behave just like uffd-wp on anonymous memory in that regard, even though,
mixing mprotect with uffd-wp is controversial.
[1] https://lkml.kernel.org/r/92173bad-caa3-6b43-9d1e-9a471fdbc184@redhat.com
Link: https://lkml.kernel.org/r/20221209080912.7968-1-david@redhat.com
Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Reported-by: Ives van Hoorne <ives(a)codesandbox.io>
Debugged-by: Peter Xu <peterx(a)redhat.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Alistair Popple <apopple(a)nvidia.com>
Cc: Mike Rapoport <rppt(a)linux.vnet.ibm.com>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/userfaultfd.c | 28 ++++++++++++++++++++++------
mm/mmap.c | 4 ++++
2 files changed, 26 insertions(+), 6 deletions(-)
--- a/fs/userfaultfd.c~mm-userfaultfd-enable-writenotify-while-userfaultfd-wp-is-enabled-for-a-vma
+++ a/fs/userfaultfd.c
@@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(s
return ctx->features & UFFD_FEATURE_INITIALIZED;
}
+static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
+
+ vma->vm_flags = flags;
+ /*
+ * For shared mappings, we want to enable writenotify while
+ * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
+ * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
+ */
+ if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
+ vma_set_page_prot(vma);
+}
+
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
@@ -618,7 +633,8 @@ static void userfaultfd_event_wait_compl
for_each_vma(vmi, vma) {
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- vma->vm_flags &= ~__VM_UFFD_FLAGS;
+ userfaultfd_set_vm_flags(vma,
+ vma->vm_flags & ~__VM_UFFD_FLAGS);
}
}
mmap_write_unlock(mm);
@@ -652,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struc
octx = vma->vm_userfaultfd_ctx.ctx;
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- vma->vm_flags &= ~__VM_UFFD_FLAGS;
+ userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
return 0;
}
@@ -733,7 +749,7 @@ void mremap_userfaultfd_prep(struct vm_a
} else {
/* Drop uffd context if remap feature not enabled */
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- vma->vm_flags &= ~__VM_UFFD_FLAGS;
+ userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
}
}
@@ -895,7 +911,7 @@ static int userfaultfd_release(struct in
prev = vma;
}
- vma->vm_flags = new_flags;
+ userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
mmap_write_unlock(mm);
@@ -1463,7 +1479,7 @@ static int userfaultfd_register(struct u
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
- vma->vm_flags = new_flags;
+ userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx.ctx = ctx;
if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
@@ -1648,7 +1664,7 @@ static int userfaultfd_unregister(struct
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
- vma->vm_flags = new_flags;
+ userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
skip:
--- a/mm/mmap.c~mm-userfaultfd-enable-writenotify-while-userfaultfd-wp-is-enabled-for-a-vma
+++ a/mm/mmap.c
@@ -1524,6 +1524,10 @@ int vma_wants_writenotify(struct vm_area
if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
return 1;
+ /* Do we need write faults for uffd-wp tracking? */
+ if (userfaultfd_wp(vma))
+ return 1;
+
/* Specialty mapping? */
if (vm_flags & VM_PFNMAP)
return 0;
_
Patches currently in -mm which might be from david(a)redhat.com are
mm-swap-fix-swp_pfn_bits-with-config_phys_addr_t_64bit-on-32bit.patch
mm-swap-fix-swp_pfn_bits-with-config_phys_addr_t_64bit-on-32bit-v2.patch
mm-swap-fix-swp_pfn_bits-with-config_phys_addr_t_64bit-on-32bit-fix.patch
mm-userfaultfd-enable-writenotify-while-userfaultfd-wp-is-enabled-for-a-vma.patch
selftests-vm-add-ksm-unmerge-tests.patch
mm-pagewalk-dont-trigger-test_walk-in-walk_page_vma.patch
selftests-vm-add-test-to-measure-madv_unmergeable-performance.patch
mm-ksm-simplify-break_ksm-to-not-rely-on-vm_fault_write.patch
mm-remove-vm_fault_write.patch
mm-ksm-fix-ksm-cow-breaking-with-userfaultfd-wp-via-fault_flag_unshare.patch
mm-pagewalk-add-walk_page_range_vma.patch
mm-ksm-convert-break_ksm-to-use-walk_page_range_vma.patch
mm-gup-remove-foll_migration.patch
mm-gup_test-fix-pin_longterm_test_read-with-highmem.patch
selftests-vm-madv_populate-fix-missing-madv_populate_readwrite-definitions.patch
selftests-vm-cow-fix-compile-warning-on-32bit.patch
selftests-vm-ksm_functional_tests-fixes-for-32bit.patch
Hello stable team and Greg,
There are 3 commits in Linus' tree for the rtc driver which should be
merged against stable 6.0.y (they're already in 6.1 / 6.1.y).
Without the first two, an x86-64 machine might panic during boot (Mel saw
a 50% chance of panic at boot - 5 out of 10 tries - and my experience
was identical).
https://lore.kernel.org/linux-acpi/20221010141630.zfzi7mk7zvnmclzy@techsing…
And after applying the first two, the kernel will not compile anymore
on non-ACPI platforms, so you need a third one.
The first two commits:
commit 4919d3eb2ec0ee364f7e3cf2d99646c1b224fae8
Author: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
Date: Wed Oct 12 20:07:01 2022 +0200
rtc: cmos: Fix event handler registration ordering issue
commit 0782b66ed2fbb035dda76111df0954515e417b24
Author: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
Date: Tue Oct 18 18:09:31 2022 +0200
rtc: cmos: Fix wake alarm breakage
And the third one:
commit db4e955ae333567dea02822624106c0b96a2f84f
Author: Alexandre Belloni <alexandre.belloni(a)bootlin.com>
Date: Tue Oct 18 22:35:11 2022 +0200
rtc: cmos: fix build on non-ACPI platforms
Cheers,
--
Mathieu Chouquet-Stringer me(a)mathieu.digital
The sun itself sees not till heaven clears.
-- William Shakespeare --
Function pointer ki_complete() expects 'long' as its second
argument, but we pass integer from ffs_user_copy_worker. This
might cause a CFI failure, as ki_complete is an indirect call
with mismatched prototype. Fix this by typecasting the second
argument to long.
Cc: <stable(a)vger.kernel.org> # 5.15
Signed-off-by: Prashanth K <quic_prashk(a)quicinc.com>
---
drivers/usb/gadget/function/f_fs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 73dc10a7..9c26561 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -835,7 +835,7 @@ static void ffs_user_copy_worker(struct work_struct *work)
kthread_unuse_mm(io_data->mm);
}
- io_data->kiocb->ki_complete(io_data->kiocb, ret);
+ io_data->kiocb->ki_complete(io_data->kiocb, (long)ret);
if (io_data->ffs->ffs_eventfd && !kiocb_has_eventfd)
eventfd_signal(io_data->ffs->ffs_eventfd, 1);
--
2.7.4
In case runtime services are not supported or have been disabled the
runtime services workqueue will never have been allocated.
Do not try to destroy the workqueue unconditionally in the unlikely
event that EFI initialisation fails to avoid dereferencing a NULL
pointer.
Fixes: 98086df8b70c ("efi: add missed destroy_workqueue when efisubsys_init fails")
Cc: stable(a)vger.kernel.org
Cc: Li Heng <liheng40(a)huawei.com>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/firmware/efi/efi.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 09716eebe8ac..a2b0cbc8741c 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -394,8 +394,8 @@ static int __init efisubsys_init(void)
efi_kobj = kobject_create_and_add("efi", firmware_kobj);
if (!efi_kobj) {
pr_err("efi: Firmware registration failed.\n");
- destroy_workqueue(efi_rts_wq);
- return -ENOMEM;
+ error = -ENOMEM;
+ goto err_destroy_wq;
}
if (efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE |
@@ -443,7 +443,10 @@ static int __init efisubsys_init(void)
err_put:
kobject_put(efi_kobj);
efi_kobj = NULL;
- destroy_workqueue(efi_rts_wq);
+err_destroy_wq:
+ if (efi_rts_wq)
+ destroy_workqueue(efi_rts_wq);
+
return error;
}
--
2.37.4
Congratulation,
We will like to inform you that your e-mail address has won the sum of
£400.000.00
from monthly British National Lottery Promotion held this 5 th October
2022. Your e-mail address was chosen from this promotion as one of
the lucky e-mail address through our
computer ballot system in British national lottery.
OPEN THE ATTACHED FILE FOR YOUR CLAIM.
From: Chris Wilson <chris(a)chris-wilson.co.uk>
After applying an engine reset, on some platforms like Jasperlake, we
occasionally detect that the engine state is not cleared until shortly
after the resume. As we try to resume the engine with volatile internal
state, the first request fails with a spurious CS event (it looks like
it reports a lite-restore to the hung context, instead of the expected
idle->active context switch).
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: stable(a)vger.kernel.org
Cc: Mika Kuoppala <mika.kuoppala(a)linux.intel.com>
Signed-off-by: Andi Shyti <andi.shyti(a)linux.intel.com>
---
drivers/gpu/drm/i915/gt/intel_reset.c | 34 ++++++++++++++++++++++-----
1 file changed, 28 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index ffde89c5835a4..88dfc0c5316ff 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -268,6 +268,7 @@ static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask,
static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
{
struct intel_uncore *uncore = gt->uncore;
+ int loops = 2;
int err;
/*
@@ -275,18 +276,39 @@ static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
* for fifo space for the write or forcewake the chip for
* the read
*/
- intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);
+ do {
+ intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);
- /* Wait for the device to ack the reset requests */
- err = __intel_wait_for_register_fw(uncore,
- GEN6_GDRST, hw_domain_mask, 0,
- 500, 0,
- NULL);
+ /*
+ * Wait for the device to ack the reset requests.
+ *
+ * On some platforms, e.g. Jasperlake, we see that the
+ * engine register state is not cleared until shortly after
+ * GDRST reports completion, causing a failure as we try
+ * to immediately resume while the internal state is still
+ * in flux. If we immediately repeat the reset, the second
+ * reset appears to serialise with the first, and since
+ * it is a no-op, the registers should retain their reset
+ * value. However, there is still a concern that upon
+ * leaving the second reset, the internal engine state
+ * is still in flux and not ready for resuming.
+ */
+ err = __intel_wait_for_register_fw(uncore, GEN6_GDRST,
+ hw_domain_mask, 0,
+ 2000, 0,
+ NULL);
+ } while (err == 0 && --loops);
if (err)
GT_TRACE(gt,
"Wait for 0x%08x engines reset failed\n",
hw_domain_mask);
+ /*
+ * As we have observed that the engine state is still volatile
+ * after GDRST is acked, impose a small delay to let everything settle.
+ */
+ udelay(50);
+
return err;
}
--
2.38.1
The list of rules on what kind of patches are accepted, and which ones
are not into the “-stable” tree, did not mention anything about new
features and let the reader use their own judgement. One may be under the
impression that new features are not accepted at all, but that's not true:
new features are not accepted unless they fix a reported problem.
Update documentation with missing rule.
Link: https://lore.kernel.org/lkml/fc60e8da-1187-ca2b-1aa8-28e01ea2769a@linaro.or…
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)linaro.org>
---
Documentation/process/stable-kernel-rules.rst | 1 +
1 file changed, 1 insertion(+)
diff --git a/Documentation/process/stable-kernel-rules.rst b/Documentation/process/stable-kernel-rules.rst
index 2fd8aa593a28..266290fab1d9 100644
--- a/Documentation/process/stable-kernel-rules.rst
+++ b/Documentation/process/stable-kernel-rules.rst
@@ -22,6 +22,7 @@ Rules on what kind of patches are accepted, and which ones are not, into the
maintainer and include an addendum linking to a bugzilla entry if it
exists and additional information on the user-visible impact.
- New device IDs and quirks are also accepted.
+ - New features are not accepted unless they fix a reported problem.
- No "theoretical race condition" issues, unless an explanation of how the
race can be exploited is also provided.
- It cannot contain any "trivial" fixes in it (spelling changes,
--
2.34.1
We have to update the uffd-wp SWP PTE bit independent of the type of
migration entry. Currently, if we're unlucky and we want to install/clear
the uffd-wp bit just while we're migrating a read-only mapped hugetlb page,
we would fail to set/clear the uffd-wp bit.
Further, if we're processing a readable-exclusive
migration entry and neither want to set or clear the uffd-wp bit, we
could currently end up losing the uffd-wp bit. Note that the same would
hold for writable migrating entries, however, having a writable
migration entry with the uffd-wp bit set would already mean that
something went wrong.
Note that the change from !is_readable_migration_entry ->
writable_migration_entry is harmless and actually cleaner, as raised by
Miaohe Lin and discussed in [1].
[1] https://lkml.kernel.org/r/90dd6a93-4500-e0de-2bf0-bf522c311b0c@huawei.com
Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: David Hildenbrand <david(a)redhat.com>
---
mm/hugetlb.c | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3a94f519304f..9552a6d1a281 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6516,10 +6516,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
} else if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
struct page *page = pfn_swap_entry_to_page(entry);
+ pte_t newpte = pte;
- if (!is_readable_migration_entry(entry)) {
- pte_t newpte;
-
+ if (is_writable_migration_entry(entry)) {
if (PageAnon(page))
entry = make_readable_exclusive_migration_entry(
swp_offset(entry));
@@ -6527,13 +6526,15 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
entry = make_readable_migration_entry(
swp_offset(entry));
newpte = swp_entry_to_pte(entry);
- if (uffd_wp)
- newpte = pte_swp_mkuffd_wp(newpte);
- else if (uffd_wp_resolve)
- newpte = pte_swp_clear_uffd_wp(newpte);
- set_huge_pte_at(mm, address, ptep, newpte);
pages++;
}
+
+ if (uffd_wp)
+ newpte = pte_swp_mkuffd_wp(newpte);
+ else if (uffd_wp_resolve)
+ newpte = pte_swp_clear_uffd_wp(newpte);
+ if (!pte_same(pte, newpte))
+ set_huge_pte_at(mm, address, ptep, newpte);
} else if (unlikely(is_pte_marker(pte))) {
/* No other markers apply for now. */
WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
--
2.38.1
There are two problematic cases when stumbling over a PTE marker in
hugetlb_change_protection():
(1) We protect an uffd-wp PTE marker a second time using uffd-wp: we will
end up in the "!huge_pte_none(pte)" case and mess up the PTE marker.
(2) We unprotect a uffd-wp PTE marker: we will similarly end up in the
"!huge_pte_none(pte)" case even though we cleared the PTE, because
the "pte" variable is stale. We'll mess up the PTE marker.
For example, if we later stumble over such a "wrongly modified" PTE marker,
we'll treat it like a present PTE that maps some garbage page.
This can, for example, be triggered by mapping a memfd backed by huge
pages, registering uffd-wp, uffd-wp'ing an unmapped page and (a)
uffd-wp'ing it a second time; or (b) uffd-unprotecting it; or (c)
unregistering uffd-wp. Then, if we trigger fallocate(FALLOC_FL_PUNCH_HOLE)
on that file range, we will run into a VM_BUG_ON:
[ 195.039560] page:00000000ba1f2987 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x0
[ 195.039565] flags: 0x7ffffc0001000(reserved|node=0|zone=0|lastcpupid=0x1fffff)
[ 195.039568] raw: 0007ffffc0001000 ffffe742c0000008 ffffe742c0000008 0000000000000000
[ 195.039569] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
[ 195.039569] page dumped because: VM_BUG_ON_PAGE(compound && !PageHead(page))
[ 195.039573] ------------[ cut here ]------------
[ 195.039574] kernel BUG at mm/rmap.c:1346!
[ 195.039579] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
[ 195.039581] CPU: 7 PID: 4777 Comm: qemu-system-x86 Not tainted 6.0.12-200.fc36.x86_64 #1
[ 195.039583] Hardware name: LENOVO 20WNS1F81N/20WNS1F81N, BIOS N35ET50W (1.50 ) 09/15/2022
[ 195.039584] RIP: 0010:page_remove_rmap+0x45b/0x550
[ 195.039588] Code: [...]
[ 195.039589] RSP: 0018:ffffbc03c3633ba8 EFLAGS: 00010292
[ 195.039591] RAX: 0000000000000040 RBX: ffffe742c0000000 RCX: 0000000000000000
[ 195.039592] RDX: 0000000000000002 RSI: ffffffff8e7aac1a RDI: 00000000ffffffff
[ 195.039592] RBP: 0000000000000001 R08: 0000000000000000 R09: ffffbc03c3633a08
[ 195.039593] R10: 0000000000000003 R11: ffffffff8f146328 R12: ffff9b04c42754b0
[ 195.039594] R13: ffffffff8fcc6328 R14: ffffbc03c3633c80 R15: ffff9b0484ab9100
[ 195.039595] FS: 00007fc7aaf68640(0000) GS:ffff9b0bbf7c0000(0000) knlGS:0000000000000000
[ 195.039596] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 195.039597] CR2: 000055d402c49110 CR3: 0000000159392003 CR4: 0000000000772ee0
[ 195.039598] PKRU: 55555554
[ 195.039599] Call Trace:
[ 195.039600] <TASK>
[ 195.039602] __unmap_hugepage_range+0x33b/0x7d0
[ 195.039605] unmap_hugepage_range+0x55/0x70
[ 195.039608] hugetlb_vmdelete_list+0x77/0xa0
[ 195.039611] hugetlbfs_fallocate+0x410/0x550
[ 195.039612] ? _raw_spin_unlock_irqrestore+0x23/0x40
[ 195.039616] vfs_fallocate+0x12e/0x360
[ 195.039618] __x64_sys_fallocate+0x40/0x70
[ 195.039620] do_syscall_64+0x58/0x80
[ 195.039623] ? syscall_exit_to_user_mode+0x17/0x40
[ 195.039624] ? do_syscall_64+0x67/0x80
[ 195.039626] entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 195.039628] RIP: 0033:0x7fc7b590651f
[ 195.039653] Code: [...]
[ 195.039654] RSP: 002b:00007fc7aaf66e70 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 195.039655] RAX: ffffffffffffffda RBX: 0000558ef4b7f370 RCX: 00007fc7b590651f
[ 195.039656] RDX: 0000000018000000 RSI: 0000000000000003 RDI: 000000000000000c
[ 195.039657] RBP: 0000000008000000 R08: 0000000000000000 R09: 0000000000000073
[ 195.039658] R10: 0000000008000000 R11: 0000000000000293 R12: 0000000018000000
[ 195.039658] R13: 00007fb8bbe00000 R14: 000000000000000c R15: 0000000000001000
[ 195.039661] </TASK>
Fix it by not going into the "!huge_pte_none(pte)" case if we stumble
over an exclusive marker. spin_unlock() + continue would get the job
done.
However, instead, make it clearer that there are no fall-through
statements: we process each case (hwpoison, migration, marker, !none, none)
and then unlock the page table to continue with the next PTE. Let's
avoid "continue" statements and use a single spin_unlock() at the end.
Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: David Hildenbrand <david(a)redhat.com>
---
mm/hugetlb.c | 21 +++++++--------------
1 file changed, 7 insertions(+), 14 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 77f36e3681e3..3a94f519304f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6512,10 +6512,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
}
pte = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
- spin_unlock(ptl);
- continue;
- }
- if (unlikely(is_hugetlb_entry_migration(pte))) {
+ /* Nothing to do. */
+ } else if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
struct page *page = pfn_swap_entry_to_page(entry);
@@ -6536,18 +6534,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
set_huge_pte_at(mm, address, ptep, newpte);
pages++;
}
- spin_unlock(ptl);
- continue;
- }
- if (unlikely(pte_marker_uffd_wp(pte))) {
- /*
- * This is changing a non-present pte into a none pte,
- * no need for huge_ptep_modify_prot_start/commit().
- */
+ } else if (unlikely(is_pte_marker(pte))) {
+ /* No other markers apply for now. */
+ WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
if (uffd_wp_resolve)
+ /* Safe to modify directly (non-present->none). */
huge_pte_clear(mm, address, ptep, psize);
- }
- if (!huge_pte_none(pte)) {
+ } else if (!huge_pte_none(pte)) {
pte_t old_pte;
unsigned int shift = huge_page_shift(hstate_vma(vma));
--
2.38.1
Since commit 07ec77a1d4e8 ("sched: Allow task CPU affinity to be
restricted on asymmetric systems"), the setting and clearing of
user_cpus_ptr are done under pi_lock for arm64 architecture. However,
dup_user_cpus_ptr() accesses user_cpus_ptr without any lock
protection. Since sched_setaffinity() can be invoked from another
process, the process being modified may be undergoing fork() at
the same time. When racing with the clearing of user_cpus_ptr in
__set_cpus_allowed_ptr_locked(), it can lead to use-after-free and
possibly double-free in arm64 kernel.
Commit 8f9ea86fdf99 ("sched: Always preserve the user requested
cpumask") fixes this problem as user_cpus_ptr, once set, will never
be cleared in a task's lifetime. However, this bug was re-introduced
in commit 851a723e45d1 ("sched: Always clear user_cpus_ptr in
do_set_cpus_allowed()") which allows the clearing of user_cpus_ptr in
do_set_cpus_allowed(). This time, it will affect all arches.
Fix this bug by always clearing the user_cpus_ptr of the newly
cloned/forked task before the copying process starts and check the
user_cpus_ptr state of the source task under pi_lock.
Note to stable, this patch won't be applicable to stable releases.
Just copy the new dup_user_cpus_ptr() function over.
Fixes: 07ec77a1d4e8 ("sched: Allow task CPU affinity to be restricted on asymmetric systems")
Fixes: 851a723e45d1 ("sched: Always clear user_cpus_ptr in do_set_cpus_allowed()")
CC: stable(a)vger.kernel.org
Reported-by: David Wang 王标 <wangbiao3(a)xiaomi.com>
Signed-off-by: Waiman Long <longman(a)redhat.com>
---
kernel/sched/core.c | 34 +++++++++++++++++++++++++++++-----
1 file changed, 29 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 25b582b6ee5f..b93d030b9fd5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2612,19 +2612,43 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
int node)
{
+ cpumask_t *user_mask;
unsigned long flags;
- if (!src->user_cpus_ptr)
+ /*
+ * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's
+ * may differ by now due to racing.
+ */
+ dst->user_cpus_ptr = NULL;
+
+ /*
+ * This check is racy and losing the race is a valid situation.
+ * It is not worth the extra overhead of taking the pi_lock on
+ * every fork/clone.
+ */
+ if (data_race(!src->user_cpus_ptr))
return 0;
- dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
- if (!dst->user_cpus_ptr)
+ user_mask = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
+ if (!user_mask)
return -ENOMEM;
- /* Use pi_lock to protect content of user_cpus_ptr */
+ /*
+ * Use pi_lock to protect content of user_cpus_ptr
+ *
+ * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
+ * do_set_cpus_allowed().
+ */
raw_spin_lock_irqsave(&src->pi_lock, flags);
- cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ if (src->user_cpus_ptr) {
+ swap(dst->user_cpus_ptr, user_mask);
+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ }
raw_spin_unlock_irqrestore(&src->pi_lock, flags);
+
+ if (unlikely(user_mask))
+ kfree(user_mask);
+
return 0;
}
--
2.31.1
The patch titled
Subject: mm/khugepaged: fix collapse_pte_mapped_thp() to allow anon_vma
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-khugepaged-fix-collapse_pte_mapped_thp-to-allow-anon_vma.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Hugh Dickins <hughd(a)google.com>
Subject: mm/khugepaged: fix collapse_pte_mapped_thp() to allow anon_vma
Date: Thu, 22 Dec 2022 12:41:50 -0800 (PST)
uprobe_write_opcode() uses collapse_pte_mapped_thp() to restore huge pmd,
when removing a breakpoint from hugepage text: vma->anon_vma is always set
in that case, so undo the prohibition. And MADV_COLLAPSE ought to be able
to collapse some page tables in a vma which happens to have anon_vma set
from CoWing elsewhere.
Is anon_vma lock required? Almost not: if any page other than expected
subpage of the non-anon huge page is found in the page table, collapse is
aborted without making any change. However, it is possible that an anon
page was CoWed from this extent in another mm or vma, in which case a
concurrent lookup might look here: so keep it away while clearing pmd (but
perhaps we shall go back to using pmd_lock() there in future).
Note that collapse_pte_mapped_thp() is exceptional in freeing a page table
without having cleared its ptes: I'm uneasy about that, and had thought
pte_clear()ing appropriate; but exclusive i_mmap lock does fix the
problem, and we would have to move the mmu_notification if clearing those
ptes.
What this fixes is not a dangerous instability. But I suggest Cc stable
because uprobes "healing" has regressed in that way, so this should follow
8d3c106e19e8 into those stable releases where it was backported (and may
want adjustment there - I'll supply backports as needed).
Link: https://lkml.kernel.org/r/b740c9fb-edba-92ba-59fb-7a5592e5dfc@google.com
Fixes: 8d3c106e19e8 ("mm/khugepaged: take the right locks for page table retraction")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Zach O'Keefe <zokeefe(a)google.com>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: <stable(a)vger.kernel.org> [5.4+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
--- a/mm/khugepaged.c~mm-khugepaged-fix-collapse_pte_mapped_thp-to-allow-anon_vma
+++ a/mm/khugepaged.c
@@ -1460,14 +1460,6 @@ int collapse_pte_mapped_thp(struct mm_st
if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
return SCAN_VMA_CHECK;
- /*
- * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings
- * that got written to. Without this, we'd have to also lock the
- * anon_vma if one exists.
- */
- if (vma->anon_vma)
- return SCAN_VMA_CHECK;
-
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
if (userfaultfd_wp(vma))
return SCAN_PTE_UFFD_WP;
@@ -1567,8 +1559,14 @@ int collapse_pte_mapped_thp(struct mm_st
}
/* step 4: remove pte entries */
+ /* we make no change to anon, but protect concurrent anon page lookup */
+ if (vma->anon_vma)
+ anon_vma_lock_write(vma->anon_vma);
+
collapse_and_free_pmd(mm, vma, haddr, pmd);
+ if (vma->anon_vma)
+ anon_vma_unlock_write(vma->anon_vma);
i_mmap_unlock_write(vma->vm_file->f_mapping);
maybe_install_pmd:
_
Patches currently in -mm which might be from hughd(a)google.com are
mm-khugepaged-fix-collapse_pte_mapped_thp-to-allow-anon_vma.patch
The patch titled
Subject: mm/hugetlb: fix uffd-wp handling for migration entries in hugetlb_change_protection()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-hugetlb-fix-uffd-wp-handling-for-migration-entries-in-hugetlb_change_protection.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: David Hildenbrand <david(a)redhat.com>
Subject: mm/hugetlb: fix uffd-wp handling for migration entries in hugetlb_change_protection()
Date: Thu, 22 Dec 2022 21:55:11 +0100
We have to update the uffd-wp SWP PTE bit independent of the type of
migration entry. Currently, if we're unlucky and we want to install/clear
the uffd-wp bit just while we're migrating a read-only mapped hugetlb
page, we would fail to set/clear the uffd-wp bit.
Further, if we're processing a readable-exclusive migration entry and
neither want to set or clear the uffd-wp bit, we could currently end up
losing the uffd-wp bit. Note that the same would hold for writable
migration entries; however, having a writable migration entry with the
uffd-wp bit set would already mean that something went wrong.
Note that the change from !is_readable_migration_entry ->
writable_migration_entry is harmless and actually cleaner, as raised by
Miaohe Lin and discussed in [1].
[1] https://lkml.kernel.org/r/90dd6a93-4500-e0de-2bf0-bf522c311b0c@huawei.com
Link: https://lkml.kernel.org/r/20221222205511.675832-3-david@redhat.com
Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
--- a/mm/hugetlb.c~mm-hugetlb-fix-uffd-wp-handling-for-migration-entries-in-hugetlb_change_protection
+++ a/mm/hugetlb.c
@@ -6662,10 +6662,9 @@ unsigned long hugetlb_change_protection(
} else if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
struct page *page = pfn_swap_entry_to_page(entry);
+ pte_t newpte = pte;
- if (!is_readable_migration_entry(entry)) {
- pte_t newpte;
-
+ if (is_writable_migration_entry(entry)) {
if (PageAnon(page))
entry = make_readable_exclusive_migration_entry(
swp_offset(entry));
@@ -6673,13 +6672,15 @@ unsigned long hugetlb_change_protection(
entry = make_readable_migration_entry(
swp_offset(entry));
newpte = swp_entry_to_pte(entry);
- if (uffd_wp)
- newpte = pte_swp_mkuffd_wp(newpte);
- else if (uffd_wp_resolve)
- newpte = pte_swp_clear_uffd_wp(newpte);
- set_huge_pte_at(mm, address, ptep, newpte);
pages++;
}
+
+ if (uffd_wp)
+ newpte = pte_swp_mkuffd_wp(newpte);
+ else if (uffd_wp_resolve)
+ newpte = pte_swp_clear_uffd_wp(newpte);
+ if (!pte_same(pte, newpte))
+ set_huge_pte_at(mm, address, ptep, newpte);
} else if (unlikely(is_pte_marker(pte))) {
/* No other markers apply for now. */
WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
_
Patches currently in -mm which might be from david(a)redhat.com are
mm-hugetlb-fix-pte-marker-handling-in-hugetlb_change_protection.patch
mm-hugetlb-fix-uffd-wp-handling-for-migration-entries-in-hugetlb_change_protection.patch
The patch titled
Subject: mm/hugetlb: fix PTE marker handling in hugetlb_change_protection()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-hugetlb-fix-pte-marker-handling-in-hugetlb_change_protection.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: David Hildenbrand <david(a)redhat.com>
Subject: mm/hugetlb: fix PTE marker handling in hugetlb_change_protection()
Date: Thu, 22 Dec 2022 21:55:10 +0100
Patch series "mm/hugetlb: uffd-wp fixes for hugetlb_change_protection()".
Playing with virtio-mem and background snapshots (using uffd-wp) on
hugetlb in QEMU, I managed to trigger a VM_BUG_ON(). Looking into the
details, hugetlb_change_protection() seems to not handle uffd-wp correctly
in all cases.
Patch #1 fixes my test case. I don't have reproducers for patch #2, as it
requires running into migration entries.
I have not yet checked in detail whether !hugetlb code requires similar care.
This patch (of 2):
There are two problematic cases when stumbling over a PTE marker in
hugetlb_change_protection():
(1) We protect an uffd-wp PTE marker a second time using uffd-wp: we will
end up in the "!huge_pte_none(pte)" case and mess up the PTE marker.
(2) We unprotect a uffd-wp PTE marker: we will similarly end up in the
"!huge_pte_none(pte)" case even though we cleared the PTE, because
the "pte" variable is stale. We'll mess up the PTE marker.
For example, if we later stumble over such a "wrongly modified" PTE marker,
we'll treat it like a present PTE that maps some garbage page.
This can, for example, be triggered by mapping a memfd backed by huge
pages, registering uffd-wp, uffd-wp'ing an unmapped page and (a)
uffd-wp'ing it a second time; or (b) uffd-unprotecting it; or (c)
unregistering uffd-wp. Then, if we trigger fallocate(FALLOC_FL_PUNCH_HOLE)
on that file range, we will run into a VM_BUG_ON:
[ 195.039560] page:00000000ba1f2987 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x0
[ 195.039565] flags: 0x7ffffc0001000(reserved|node=0|zone=0|lastcpupid=0x1fffff)
[ 195.039568] raw: 0007ffffc0001000 ffffe742c0000008 ffffe742c0000008 0000000000000000
[ 195.039569] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
[ 195.039569] page dumped because: VM_BUG_ON_PAGE(compound && !PageHead(page))
[ 195.039573] ------------[ cut here ]------------
[ 195.039574] kernel BUG at mm/rmap.c:1346!
[ 195.039579] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
[ 195.039581] CPU: 7 PID: 4777 Comm: qemu-system-x86 Not tainted 6.0.12-200.fc36.x86_64 #1
[ 195.039583] Hardware name: LENOVO 20WNS1F81N/20WNS1F81N, BIOS N35ET50W (1.50 ) 09/15/2022
[ 195.039584] RIP: 0010:page_remove_rmap+0x45b/0x550
[ 195.039588] Code: [...]
[ 195.039589] RSP: 0018:ffffbc03c3633ba8 EFLAGS: 00010292
[ 195.039591] RAX: 0000000000000040 RBX: ffffe742c0000000 RCX: 0000000000000000
[ 195.039592] RDX: 0000000000000002 RSI: ffffffff8e7aac1a RDI: 00000000ffffffff
[ 195.039592] RBP: 0000000000000001 R08: 0000000000000000 R09: ffffbc03c3633a08
[ 195.039593] R10: 0000000000000003 R11: ffffffff8f146328 R12: ffff9b04c42754b0
[ 195.039594] R13: ffffffff8fcc6328 R14: ffffbc03c3633c80 R15: ffff9b0484ab9100
[ 195.039595] FS: 00007fc7aaf68640(0000) GS:ffff9b0bbf7c0000(0000) knlGS:0000000000000000
[ 195.039596] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 195.039597] CR2: 000055d402c49110 CR3: 0000000159392003 CR4: 0000000000772ee0
[ 195.039598] PKRU: 55555554
[ 195.039599] Call Trace:
[ 195.039600] <TASK>
[ 195.039602] __unmap_hugepage_range+0x33b/0x7d0
[ 195.039605] unmap_hugepage_range+0x55/0x70
[ 195.039608] hugetlb_vmdelete_list+0x77/0xa0
[ 195.039611] hugetlbfs_fallocate+0x410/0x550
[ 195.039612] ? _raw_spin_unlock_irqrestore+0x23/0x40
[ 195.039616] vfs_fallocate+0x12e/0x360
[ 195.039618] __x64_sys_fallocate+0x40/0x70
[ 195.039620] do_syscall_64+0x58/0x80
[ 195.039623] ? syscall_exit_to_user_mode+0x17/0x40
[ 195.039624] ? do_syscall_64+0x67/0x80
[ 195.039626] entry_SYSCALL_64_after_hwframe+0x63/0xcd
[ 195.039628] RIP: 0033:0x7fc7b590651f
[ 195.039653] Code: [...]
[ 195.039654] RSP: 002b:00007fc7aaf66e70 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 195.039655] RAX: ffffffffffffffda RBX: 0000558ef4b7f370 RCX: 00007fc7b590651f
[ 195.039656] RDX: 0000000018000000 RSI: 0000000000000003 RDI: 000000000000000c
[ 195.039657] RBP: 0000000008000000 R08: 0000000000000000 R09: 0000000000000073
[ 195.039658] R10: 0000000008000000 R11: 0000000000000293 R12: 0000000018000000
[ 195.039658] R13: 00007fb8bbe00000 R14: 000000000000000c R15: 0000000000001000
[ 195.039661] </TASK>
Fix it by not going into the "!huge_pte_none(pte)" case if we stumble over
an exclusive marker. spin_unlock() + continue would get the job done.
However, instead, make it clearer that there are no fall-through
statements: we process each case (hwpoison, migration, marker, !none,
none) and then unlock the page table to continue with the next PTE. Let's
avoid "continue" statements and use a single spin_unlock() at the end.
Link: https://lkml.kernel.org/r/20221222205511.675832-1-david@redhat.com
Link: https://lkml.kernel.org/r/20221222205511.675832-2-david@redhat.com
Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes")
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
--- a/mm/hugetlb.c~mm-hugetlb-fix-pte-marker-handling-in-hugetlb_change_protection
+++ a/mm/hugetlb.c
@@ -6658,10 +6658,8 @@ unsigned long hugetlb_change_protection(
}
pte = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
- spin_unlock(ptl);
- continue;
- }
- if (unlikely(is_hugetlb_entry_migration(pte))) {
+ /* Nothing to do. */
+ } else if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
struct page *page = pfn_swap_entry_to_page(entry);
@@ -6682,18 +6680,13 @@ unsigned long hugetlb_change_protection(
set_huge_pte_at(mm, address, ptep, newpte);
pages++;
}
- spin_unlock(ptl);
- continue;
- }
- if (unlikely(pte_marker_uffd_wp(pte))) {
- /*
- * This is changing a non-present pte into a none pte,
- * no need for huge_ptep_modify_prot_start/commit().
- */
+ } else if (unlikely(is_pte_marker(pte))) {
+ /* No other markers apply for now. */
+ WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
if (uffd_wp_resolve)
+ /* Safe to modify directly (non-present->none). */
huge_pte_clear(mm, address, ptep, psize);
- }
- if (!huge_pte_none(pte)) {
+ } else if (!huge_pte_none(pte)) {
pte_t old_pte;
unsigned int shift = huge_page_shift(hstate_vma(vma));
_
Patches currently in -mm which might be from david(a)redhat.com are
mm-hugetlb-fix-pte-marker-handling-in-hugetlb_change_protection.patch
mm-hugetlb-fix-uffd-wp-handling-for-migration-entries-in-hugetlb_change_protection.patch
Since commit 07ec77a1d4e8 ("sched: Allow task CPU affinity to be
restricted on asymmetric systems"), the setting and clearing of
user_cpus_ptr are done under pi_lock for arm64 architecture. However,
dup_user_cpus_ptr() accesses user_cpus_ptr without any lock
protection. When racing with the clearing of user_cpus_ptr in
__set_cpus_allowed_ptr_locked(), it can lead to use-after-free and
double-free in arm64 kernel.
Commit 8f9ea86fdf99 ("sched: Always preserve the user requested
cpumask") fixes this problem as user_cpus_ptr, once set, will never
be cleared in a task's lifetime. However, this bug was re-introduced
in commit 851a723e45d1 ("sched: Always clear user_cpus_ptr in
do_set_cpus_allowed()") which allows the clearing of user_cpus_ptr in
do_set_cpus_allowed(). This time, it will affect all arches.
Fix this bug by always clearing the user_cpus_ptr of the newly
cloned/forked task before the copying process starts and check the
user_cpus_ptr state of the source task under pi_lock.
Note to stable, this patch won't be applicable to stable releases.
Just copy the new dup_user_cpus_ptr() function over.
Fixes: 07ec77a1d4e8 ("sched: Allow task CPU affinity to be restricted on asymmetric systems")
Fixes: 851a723e45d1 ("sched: Always clear user_cpus_ptr in do_set_cpus_allowed()")
CC: stable(a)vger.kernel.org
Reported-by: David Wang 王标 <wangbiao3(a)xiaomi.com>
Signed-off-by: Waiman Long <longman(a)redhat.com>
---
kernel/sched/core.c | 34 +++++++++++++++++++++++++++++-----
1 file changed, 29 insertions(+), 5 deletions(-)
[v2: Use data_race() macro as suggested by Will]
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78b2d5cabcc5..57e5932f81a9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2612,19 +2612,43 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
int node)
{
+ cpumask_t *user_mask;
unsigned long flags;
- if (!src->user_cpus_ptr)
+ /*
+ * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's
+ * may differ by now due to racing.
+ */
+ dst->user_cpus_ptr = NULL;
+
+ /*
+ * This check is racy and losing the race is a valid situation.
+ * It is not worth the extra overhead of taking the pi_lock on
+ * every fork/clone.
+ */
+ if (data_race(!src->user_cpus_ptr))
return 0;
- dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
- if (!dst->user_cpus_ptr)
+ user_mask = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
+ if (!user_mask)
return -ENOMEM;
- /* Use pi_lock to protect content of user_cpus_ptr */
+ /*
+ * Use pi_lock to protect content of user_cpus_ptr
+ *
+ * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
+ * do_set_cpus_allowed().
+ */
raw_spin_lock_irqsave(&src->pi_lock, flags);
- cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ if (src->user_cpus_ptr) {
+ swap(dst->user_cpus_ptr, user_mask);
+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ }
raw_spin_unlock_irqrestore(&src->pi_lock, flags);
+
+ if (unlikely(user_mask))
+ kfree(user_mask);
+
return 0;
}
--
2.31.1