The patch below does not apply to the 6.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.13.y
git checkout FETCH_HEAD
git cherry-pick -x 8802766324e1f5d414a81ac43365c20142e85603
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025021855-snugly-hacked-a8fa@gregkh' --subject-prefix 'PATCH 6.13.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8802766324e1f5d414a81ac43365c20142e85603 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Wed, 12 Feb 2025 13:46:46 +0000
Subject: [PATCH] io_uring/kbuf: reallocate buf lists on upgrade
IORING_REGISTER_PBUF_RING can reuse an old struct io_buffer_list if it
was created for legacy selected buffer and has been emptied. It violates
the requirement that most of the field should stay stable after publish.
Always reallocate it instead.
Cc: stable(a)vger.kernel.org
Reported-by: Pumpkin Chang <pumpkin(a)devco.re>
Fixes: 2fcabce2d7d34 ("io_uring: disallow mixed provided buffer group registrations")
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 04bf493eecae..8e72de7712ac 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -415,6 +415,13 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
}
}
+static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+{
+ scoped_guard(mutex, &ctx->mmap_lock)
+ WARN_ON_ONCE(xa_erase(&ctx->io_bl_xa, bl->bgid) != bl);
+ io_put_bl(ctx, bl);
+}
+
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
@@ -636,12 +643,13 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
/* if mapped buffer ring OR classic exists, don't allow */
if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list))
return -EEXIST;
- } else {
- free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
- if (!bl)
- return -ENOMEM;
+ io_destroy_bl(ctx, bl);
}
+ free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl)
+ return -ENOMEM;
+
mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT;
ring_size = flex_array_size(br, bufs, reg.ring_entries);
From: Shu Han <ebpqwerty472123(a)gmail.com>
commit ea7e2d5e49c05e5db1922387b09ca74aa40f46e2 upstream.
The remap_file_pages syscall handler calls do_mmap() directly, which
doesn't contain the LSM security check. And if the process has called
personality(READ_IMPLIES_EXEC) before and remap_file_pages() is called for
RW pages, this will actually result in remapping the pages to RWX,
bypassing a W^X policy enforced by SELinux.
So we should check prot by security_mmap_file LSM hook in the
remap_file_pages syscall handler before do_mmap() is called. Otherwise, it
potentially permits an attacker to bypass a W^X policy enforced by
SELinux.
The bypass is similar to CVE-2016-10044, which bypass the same thing via
AIO and can be found in [1].
The PoC:
$ cat > test.c
int main(void) {
size_t pagesz = sysconf(_SC_PAGE_SIZE);
int mfd = syscall(SYS_memfd_create, "test", 0);
const char *buf = mmap(NULL, 4 * pagesz, PROT_READ | PROT_WRITE,
MAP_SHARED, mfd, 0);
unsigned int old = syscall(SYS_personality, 0xffffffff);
syscall(SYS_personality, READ_IMPLIES_EXEC | old);
syscall(SYS_remap_file_pages, buf, pagesz, 0, 2, 0);
syscall(SYS_personality, old);
// show the RWX page exists even if W^X policy is enforced
int fd = open("/proc/self/maps", O_RDONLY);
unsigned char buf2[1024];
while (1) {
int ret = read(fd, buf2, 1024);
if (ret <= 0) break;
write(1, buf2, ret);
}
close(fd);
}
$ gcc test.c -o test
$ ./test | grep rwx
7f1836c34000-7f1836c35000 rwxs 00002000 00:01 2050 /memfd:test (deleted)
Link: https://project-zero.issues.chromium.org/issues/42452389 [1]
Cc: stable(a)vger.kernel.org
Signed-off-by: Shu Han <ebpqwerty472123(a)gmail.com>
Acked-by: Stephen Smalley <stephen.smalley.work(a)gmail.com>
[PM: subject line tweaks]
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
Signed-off-by: Pratyush Yadav <ptyadav(a)amazon.de>
---
mm/mmap.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/mm/mmap.c b/mm/mmap.c
index 9f76625a1743..2c17eb840e44 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3078,8 +3078,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
}
file = get_file(vma->vm_file);
+ ret = security_mmap_file(vma->vm_file, prot, flags);
+ if (ret)
+ goto out_fput;
ret = do_mmap(vma->vm_file, start, size,
prot, flags, pgoff, &populate, NULL);
+out_fput:
fput(file);
out:
mmap_write_unlock(mm);
--
2.47.1
[BUG]
When running generic/417 with a btrfs whose block size < page size
(subpage cases), it always fails.
And the following minimal reproducer is more than enough to trigger it
reliably:
workload()
{
mkfs.btrfs -s 4k -f $dev > /dev/null
dmesg -C
mount $dev $mnt
$fsstree_dir/src/dio-invalidate-cache -r -b 4096 -n 3 -i 1 -f $mnt/diotest
ret=$?
umount $mnt
stop_trace
if [ $ret -ne 0 ]; then
fail
fi
}
for (( i = 0; i < 1024; i++)); do
echo "=== $i/$runtime ==="
workload
done
[CAUSE]
With extra trace printk added to the following functions:
- btrfs_buffered_write()
* Which folio is touched
* The file offset (start) where the buffered write is at
* How many bytes are copied
* The content of the write (the first 2 bytes)
- submit_one_sector()
* Which folio is touched
* The position inside the folio
- pagecache_isize_extended()
* The parameters of the function itself
* The parameters of the folio_zero_range()
Which are enough to show the problem:
22.158114: btrfs_buffered_write: folio pos=0 start=0 copied=4096 content=0x0101
22.158161: submit_one_sector: r/i=5/257 folio=0 pos=0 content=0x0101
22.158609: btrfs_buffered_write: folio pos=0 start=4096 copied=4096 content=0x0101
22.158634: btrfs_buffered_write: folio pos=0 start=8192 copied=4096 content=0x0101
22.158650: pagecache_isize_extended: folio=0 from=4096 to=8192 bsize=4096 zero off=4096 len=8192
22.158682: submit_one_sector: r/i=5/257 folio=0 pos=4096 content=0x0000
22.158686: submit_one_sector: r/i=5/257 folio=0 pos=8192 content=0x0101
The tool dio-invalidate-cache will start 3 threads, each doing a buffered
write with 0x01 at 4096 * i (i is 0, 1 ,2), do a fsync, then do a direct read,
and compare the read buffer with the write buffer.
Note that all 3 btrfs_buffered_write() are writing the correct 0x01 into
the page cache.
But at submit_one_sector(), at file offset 4096, the content is zeroed
out, mostly by pagecache_isize_extended().
The race happens like this:
Thread A is writing into range [4K, 8K).
Thread B is writing into range [8K, 12k).
Thread A | Thread B
-------------------------------------+------------------------------------
btrfs_buffered_write() | btrfs_buffered_write()
|- old_isize = 4K; | |- old_isize = 4096;
|- btrfs_inode_lock() | |
|- write into folio range [4K, 8K) | |
|- pagecache_isize_extended() | |
| extend isize from 4096 to 8192 | |
| no folio_zero_range() called | |
|- btrfs_inode_lock() | |
| |- btrfs_inode_lock()
| |- write into folio range [8K, 12K)
| |- pagecache_isize_extended()
| | calling folio_zero_range(4K, 8K)
| | This is caused by the old_isize is
| | grabbed too early, without any
| | inode lock.
| |- btrfs_inode_unlock()
The @old_isize is grabbed without inode lock, causing race between two
buffered write threads and making pagecache_isize_extended() to zero
range which is still containing cached data.
And this is only affecting subpage btrfs, because for regular blocksize
== page size case, the function pagecache_isize_extended() will do
nothing if the block size >= page size.
[FIX]
Grab the old isize with inode lock hold.
This means each buffered write thread will have a stable view of the
old inode size, thus avoid the above race.
Cc: stable(a)vger.kernel.org
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
---
fs/btrfs/file.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fd90855fe717..896dc03689d6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1090,7 +1090,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
u64 lockend;
size_t num_written = 0;
ssize_t ret;
- loff_t old_isize = i_size_read(inode);
+ loff_t old_isize;
unsigned int ilock_flags = 0;
const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
@@ -1103,6 +1103,13 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
if (ret < 0)
return ret;
+ /*
+ * We can only trust the isize with inode lock hold, or it can race with
+ * other buffered writes and cause incorrect call of
+ * pagecache_isize_extended() to overwrite existing data.
+ */
+ old_isize = i_size_read(inode);
+
ret = generic_write_checks(iocb, i);
if (ret <= 0)
goto out;
--
2.48.1
We've had instances of drivers returning invalid values from gpio_chip
calbacks. In several cases these return values would be propagated to
user-space and confuse programs that only expect 0 or negative errnos
from ioctl()s. Let's sanitize the return values of callbacks and make
sure we don't allow anyone see invalid ones.
The first patch checks the return values of get_direction() in kernel
where needed and is a backportable fix.
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski(a)linaro.org>
---
Bartosz Golaszewski (8):
gpiolib: check the return value of gpio_chip::get_direction()
gpiolib: sanitize the return value of gpio_chip::request()
gpiolib: sanitize the return value of gpio_chip::set_config()
gpiolib: sanitize the return value of gpio_chip::get()
gpiolib: sanitize the return value of gpio_chip::get_multiple()
gpiolib: sanitize the return value of gpio_chip::direction_output()
gpiolib: sanitize the return value of gpio_chip::direction_input()
gpiolib: sanitize the return value of gpio_chip::get_direction()
drivers/gpio/gpiolib.c | 144 +++++++++++++++++++++++++++++++++++---------
include/linux/gpio/driver.h | 6 +-
2 files changed, 120 insertions(+), 30 deletions(-)
---
base-commit: a13f6e0f405ed0d3bcfd37c692c7d7fa3c052154
change-id: 20241212-gpio-sanitize-retvals-f5f4e0d6f57d
Best regards,
--
Bartosz Golaszewski <bartosz.golaszewski(a)linaro.org>
According to the chip manual, the I2C register access type of
Loongson-2K2000/LS7A is "B", so we can only access registers in byte
form (readb/writeb).
Although Loongson-2K0500/Loongson-2K1000 do not have similar
constraints, register accesses in byte form also behave correctly.
Also, in hardware, the frequency division registers are defined as two
separate registers (high 8-bit and low 8-bit), so we just access them
directly as bytes.
Cc: stable(a)vger.kernel.org
Fixes: 015e61f0bffd ("i2c: ls2x: Add driver for Loongson-2K/LS7A I2C controller")
Co-developed-by: Hongliang Wang <wanghongliang(a)loongson.cn>
Signed-off-by: Hongliang Wang <wanghongliang(a)loongson.cn>
Signed-off-by: Binbin Zhou <zhoubinbin(a)loongson.cn>
---
V2:
- Add a comment to prevent from changing that back to 16-bit write.
Link to V1:
https://lore.kernel.org/all/20250218111133.3058590-1-zhoubinbin@loongson.cn/
drivers/i2c/busses/i2c-ls2x.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/drivers/i2c/busses/i2c-ls2x.c b/drivers/i2c/busses/i2c-ls2x.c
index 8821cac3897b..61377693a4d6 100644
--- a/drivers/i2c/busses/i2c-ls2x.c
+++ b/drivers/i2c/busses/i2c-ls2x.c
@@ -10,6 +10,7 @@
* Rewritten for mainline by Binbin Zhou <zhoubinbin(a)loongson.cn>
*/
+#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/completion.h>
#include <linux/device.h>
@@ -26,7 +27,8 @@
#include <linux/units.h>
/* I2C Registers */
-#define I2C_LS2X_PRER 0x0 /* Freq Division Register(16 bits) */
+#define I2C_LS2X_PRER_LO 0x0 /* Freq Division Low Byte Register */
+#define I2C_LS2X_PRER_HI 0x1 /* Freq Division High Byte Register */
#define I2C_LS2X_CTR 0x2 /* Control Register */
#define I2C_LS2X_TXR 0x3 /* Transport Data Register */
#define I2C_LS2X_RXR 0x3 /* Receive Data Register */
@@ -93,6 +95,7 @@ static irqreturn_t ls2x_i2c_isr(int this_irq, void *dev_id)
*/
static void ls2x_i2c_adjust_bus_speed(struct ls2x_i2c_priv *priv)
{
+ u16 val;
struct i2c_timings *t = &priv->i2c_t;
struct device *dev = priv->adapter.dev.parent;
u32 acpi_speed = i2c_acpi_find_bus_speed(dev);
@@ -104,9 +107,14 @@ static void ls2x_i2c_adjust_bus_speed(struct ls2x_i2c_priv *priv)
else
t->bus_freq_hz = LS2X_I2C_FREQ_STD;
- /* Calculate and set i2c frequency. */
- writew(LS2X_I2C_PCLK_FREQ / (5 * t->bus_freq_hz) - 1,
- priv->base + I2C_LS2X_PRER);
+ /*
+ * According to the chip manual, we can only access the registers as bytes,
+ * otherwise the high bits will be truncated.
+ * So set the I2C frequency with a sequential writeb instead of writew.
+ */
+ val = LS2X_I2C_PCLK_FREQ / (5 * t->bus_freq_hz) - 1;
+ writeb(FIELD_GET(GENMASK(7, 0), val), priv->base + I2C_LS2X_PRER_LO);
+ writeb(FIELD_GET(GENMASK(15, 8), val), priv->base + I2C_LS2X_PRER_HI);
}
static void ls2x_i2c_init(struct ls2x_i2c_priv *priv)
base-commit: 7e45b505e699f4c80aa8bf79b4ea2a5f5a66bb51
--
2.47.1
On the arm64 platform with 4K base page config, SECTION_SIZE_BITS is set
to 27, making one section 128M. The related page struct which vmemmap
points to is 2M then.
Commit c1cc1552616d ("arm64: MMU initialisation") optimizes the
vmemmap to populate at the PMD section level which was suitable
initially since hot plug granule is always one section(128M). However,
commit ba72b4c8cf60 ("mm/sparsemem: support sub-section hotplug")
introduced a 2M(SUBSECTION_SIZE) hot plug granule, which disrupted the
existing arm64 assumptions.
The first problem is that if start or end is not aligned to a section
boundary, such as when a subsection is hot added, populating the entire
section is wasteful.
The next problem is if we hotplug something that spans part of 128 MiB
section (subsections, let's call it memblock1), and then hotplug something
that spans another part of a 128 MiB section(subsections, let's call it
memblock2), and subsequently unplug memblock1, vmemmap_free() will clear
the entire PMD entry which also supports memblock2 even though memblock2
is still active.
Assuming hotplug/unplug sizes are guaranteed to be symmetric. Do the
fix similar to x86-64: populate to pages levels if start/end is not aligned
with section boundary.
Cc: <stable(a)vger.kernel.org> # v5.4+
Fixes: ba72b4c8cf60 ("mm/sparsemem: support sub-section hotplug")
Signed-off-by: Zhenhua Huang <quic_zhenhuah(a)quicinc.com>
---
Hi Catalin and David,
Following our latest discussion, I've updated the patch for your review.
I also removed Catalin's review tag since I've made significant modifications.
arch/arm64/mm/mmu.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index b4df5bc5b1b8..de05ccf47f21 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1177,8 +1177,11 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
struct vmem_altmap *altmap)
{
WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
+ /* [start, end] should be within one section */
+ WARN_ON(end - start > PAGES_PER_SECTION * sizeof(struct page));
- if (!IS_ENABLED(CONFIG_ARM64_4K_PAGES))
+ if (!IS_ENABLED(CONFIG_ARM64_4K_PAGES) ||
+ (end - start < PAGES_PER_SECTION * sizeof(struct page)))
return vmemmap_populate_basepages(start, end, node, altmap);
else
return vmemmap_populate_hugepages(start, end, node, altmap);
--
2.25.1
From: Ge Yang <yangge1116(a)126.com>
Since the introduction of commit c77c0a8ac4c52 ("mm/hugetlb: defer freeing
of huge pages if in non-task context"), which supports deferring the
freeing of hugetlb pages, the allocation of contiguous memory through
cma_alloc() may fail probabilistically.
In the CMA allocation process, if it is found that the CMA area is occupied
by in-use hugetlb folios, these in-use hugetlb folios need to be migrated
to another location. When there are no available hugetlb folios in the
free hugetlb pool during the migration of in-use hugetlb folios, new folios
are allocated from the buddy system. A temporary state is set on the newly
allocated folio. Upon completion of the hugetlb folio migration, the
temporary state is transferred from the new folios to the old folios.
Normally, when the old folios with the temporary state are freed, it is
directly released back to the buddy system. However, due to the deferred
freeing of hugetlb pages, the PageBuddy() check fails, ultimately leading
to the failure of cma_alloc().
Here is a simplified call trace illustrating the process:
cma_alloc()
->__alloc_contig_migrate_range() // Migrate in-use hugetlb folios
->unmap_and_move_huge_page()
->folio_putback_hugetlb() // Free old folios
->test_pages_isolated()
->__test_page_isolated_in_pageblock()
->PageBuddy(page) // Check if the page is in buddy
To resolve this issue, we have implemented a function named
wait_for_freed_hugetlb_folios(). This function ensures that the hugetlb
folios are properly released back to the buddy system after their migration
is completed. By invoking wait_for_freed_hugetlb_folios() before calling
PageBuddy(), we ensure that PageBuddy() will succeed.
Fixes: c77c0a8ac4c52 ("mm/hugetlb: defer freeing of huge pages if in non-task context")
Signed-off-by: Ge Yang <yangge1116(a)126.com>
Cc: <stable(a)vger.kernel.org>
---
V4:
- add a check to determine if hpage_freelist is empty suggested by David
V3:
- adjust code and message suggested by Muchun and David
V2:
- flush all folios at once suggested by David
include/linux/hugetlb.h | 5 +++++
mm/hugetlb.c | 8 ++++++++
mm/page_isolation.c | 10 ++++++++++
3 files changed, 23 insertions(+)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6c6546b..0c54b3a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -697,6 +697,7 @@ bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m);
int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
+void wait_for_freed_hugetlb_folios(void);
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr, bool cow_from_owner);
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
@@ -1092,6 +1093,10 @@ static inline int replace_free_hugepage_folios(unsigned long start_pfn,
return 0;
}
+static inline void wait_for_freed_hugetlb_folios(void)
+{
+}
+
static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr,
bool cow_from_owner)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 30bc34d..8801dbc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2955,6 +2955,14 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
return ret;
}
+void wait_for_freed_hugetlb_folios(void)
+{
+ if (llist_empty(&hpage_freelist))
+ return;
+
+ flush_work(&free_hpage_work);
+}
+
typedef enum {
/*
* For either 0/1: we checked the per-vma resv map, and one resv
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 8ed53ee0..b2fc526 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -615,6 +615,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
int ret;
/*
+ * Due to the deferred freeing of hugetlb folios, the hugepage folios may
+ * not immediately release to the buddy system. This can cause PageBuddy()
+ * to fail in __test_page_isolated_in_pageblock(). To ensure that the
+ * hugetlb folios are properly released back to the buddy system, we
+ * invoke the wait_for_freed_hugetlb_folios() function to wait for the
+ * release to complete.
+ */
+ wait_for_freed_hugetlb_folios();
+
+ /*
* Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free
* pages are not aligned to pageblock_nr_pages.
* Then we just check migratetype first.
--
2.7.4