When bfqq is shared by multiple processes it can happen that one of the
processes gets moved to a different cgroup (or just starts submitting IO
for different cgroup). In case that happens we need to split the merged
bfqq as otherwise we will have IO for multiple cgroups in one bfqq and
we will just account IO time to wrong entities etc.
Similarly if the bfqq is scheduled to merge with another bfqq but the
merge didn't happen yet, cancel the merge as it need not be valid
anymore.
CC: stable(a)vger.kernel.org
Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support")
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
block/bfq-cgroup.c | 36 +++++++++++++++++++++++++++++++++---
block/bfq-iosched.c | 2 +-
block/bfq-iosched.h | 1 +
3 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 420eda2589c0..9352f3cc2377 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -743,9 +743,39 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
}
if (sync_bfqq) {
- entity = &sync_bfqq->entity;
- if (entity->sched_data != &bfqg->sched_data)
- bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
+ if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) {
+ /* We are the only user of this bfqq, just move it */
+ if (sync_bfqq->entity.sched_data != &bfqg->sched_data)
+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
+ } else {
+ struct bfq_queue *bfqq;
+
+ /*
+ * The queue was merged to a different queue. Check
+ * that the merge chain still belongs to the same
+ * cgroup.
+ */
+ for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq)
+ if (bfqq->entity.sched_data !=
+ &bfqg->sched_data)
+ break;
+ if (bfqq) {
+ /*
+ * Some queue changed cgroup so the merge is
+ * not valid anymore. We cannot easily just
+ * cancel the merge (by clearing new_bfqq) as
+ * there may be other processes using this
+ * queue and holding refs to all queues below
+ * sync_bfqq->new_bfqq. Similarly if the merge
+ * already happened, we need to detach from
+ * bfqq now so that we cannot merge bio to a
+ * request from the old cgroup.
+ */
+ bfq_put_cooperator(sync_bfqq);
+ bfq_release_process_ref(bfqd, sync_bfqq);
+ bic_set_bfqq(bic, NULL, 1);
+ }
+ }
}
return bfqg;
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 7d00b21ebe5d..89fe3f85eb3c 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5315,7 +5315,7 @@ static void bfq_put_stable_ref(struct bfq_queue *bfqq)
bfq_put_queue(bfqq);
}
-static void bfq_put_cooperator(struct bfq_queue *bfqq)
+void bfq_put_cooperator(struct bfq_queue *bfqq)
{
struct bfq_queue *__bfqq, *next;
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 3b83e3d1c2e5..a56763045d19 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -979,6 +979,7 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool compensate, enum bfqq_expiration reason);
void bfq_put_queue(struct bfq_queue *bfqq);
+void bfq_put_cooperator(struct bfq_queue *bfqq);
void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_schedule_dispatch(struct bfq_data *bfqd);
--
2.34.1
This is the start of the stable review cycle for the 4.9.328 release.
There are 42 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Thu, 15 Sep 2022 14:03:27 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.9.328-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.9.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.9.328-rc1
NeilBrown <neilb(a)suse.de>
SUNRPC: use _bh spinlocking on ->transport_lock
Yang Ling <gnaygnil(a)gmail.com>
MIPS: loongson32: ls1c: Fix hang during startup
Johan Hovold <johan+linaro(a)kernel.org>
usb: dwc3: fix PHY disable sequence
Toke Høiland-Jørgensen <toke(a)toke.dk>
sch_sfb: Also store skb len before calling child enqueue
Neal Cardwell <ncardwell(a)google.com>
tcp: fix early ETIMEDOUT after spurious non-SACK RTO
Dan Carpenter <dan.carpenter(a)oracle.com>
tipc: fix shift wrapping bug in map_get()
Toke Høiland-Jørgensen <toke(a)toke.dk>
sch_sfb: Don't assume the skb is still around after enqueueing to child
David Leadbeater <dgl(a)dgl.cx>
netfilter: nf_conntrack_irc: Fix forged IP logic
Harsh Modi <harshmodi(a)google.com>
netfilter: br_netfilter: Drop dst references before setting.
Isaac J. Manjarres <isaacmanjarres(a)google.com>
driver core: Don't probe devices after bus_type.match() probe deferral
Sreekanth Reddy <sreekanth.reddy(a)broadcom.com>
scsi: mpt3sas: Fix use-after-free warning
Dongxiang Ke <kdx.glider(a)gmail.com>
ALSA: usb-audio: Fix an out-of-bounds bug in __snd_usb_parse_audio_interface()
Pattara Teerapong <pteerapong(a)chromium.org>
ALSA: aloop: Fix random zeros in capture data when using jiffies timer
Tasos Sahanidis <tasos(a)tasossah.com>
ALSA: emu10k1: Fix out of bounds access in snd_emu10k1_pcm_channel_alloc()
Yang Yingliang <yangyingliang(a)huawei.com>
fbdev: chipsfb: Add missing pci_disable_device() in chipsfb_pci_init()
Helge Deller <deller(a)gmx.de>
parisc: Add runtime check to prevent PA2.0 kernels on PA1.x machines
Li Qiong <liqiong(a)nfschina.com>
parisc: ccio-dma: Handle kmalloc failure in ccio_init_resources()
Zhenneng Li <lizhenneng(a)kylinos.cn>
drm/radeon: add a force flush to delay work when radeon
Yee Lee <yee.lee(a)mediatek.com>
Revert "mm: kmemleak: take a full lowmem check in kmemleak_*_phys()"
Linus Torvalds <torvalds(a)linux-foundation.org>
fs: only do a memory barrier for the first set_buffer_uptodate()
Takashi Iwai <tiwai(a)suse.de>
ALSA: seq: Fix data-race at module auto-loading
Takashi Iwai <tiwai(a)suse.de>
ALSA: seq: oss: Fix data-race for max_midi_devs access
Miquel Raynal <miquel.raynal(a)bootlin.com>
net: mac802154: Fix a condition in the receive path
Siddh Raman Pant <code(a)siddh.me>
wifi: mac80211: Don't finalize CSA in IBSS mode if state is disconnected
Krishna Kurapati <quic_kriskura(a)quicinc.com>
usb: gadget: mass_storage: Fix cdrom data transfers on MAC-OS
Alan Stern <stern(a)rowland.harvard.edu>
USB: core: Prevent nested device-reset calls
Josh Poimboeuf <jpoimboe(a)kernel.org>
s390: fix nospec table alignments
Gerald Schaefer <gerald.schaefer(a)linux.ibm.com>
s390/hugetlb: fix prepare_hugepage_range() check for 2 GB hugepages
Witold Lipieta <witold.lipieta(a)thaumatec.com>
usb-storage: Add ignore-residue quirk for NXP PN7462AU
Thierry GUIBERT <thierry.guibert(a)croix-rouge.fr>
USB: cdc-acm: Add Icom PMR F3400 support (0c26:0020)
Slark Xiao <slark_xiao(a)163.com>
USB: serial: option: add support for Cinterion MV32-WA/WB RmNet mode
Yan Xinyu <sdlyyxy(a)bupt.edu.cn>
USB: serial: option: add support for OPPO R11 diag port
Johan Hovold <johan(a)kernel.org>
USB: serial: cp210x: add Decagon UCA device id
Mathias Nyman <mathias.nyman(a)linux.intel.com>
xhci: Add grace period after xHC start to prevent premature runtime suspend.
Armin Wolf <W_Armin(a)gmx.de>
hwmon: (gpio-fan) Fix array out of bounds access
Niek Nooijens <niek.nooijens(a)omron.com>
USB: serial: ftdi_sio: add Omron CS1W-CIF31 device id
Helge Deller <deller(a)gmx.de>
vt: Clear selection before changing the font
Dan Carpenter <dan.carpenter(a)oracle.com>
staging: rtl8712: fix use after free bugs
Shenwei Wang <shenwei.wang(a)nxp.com>
serial: fsl_lpuart: RS485 RTS polariy is inverse
Dan Carpenter <dan.carpenter(a)oracle.com>
wifi: cfg80211: debugfs: fix return type in ht40allow_map_read()
Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
platform/x86: pmc_atom: Fix SLP_TYPx bitfield mask
Letu Ren <fantasquex(a)gmail.com>
fbdev: fb_pm2fb: Avoid potential divide by zero error
-------------
Diffstat:
Makefile | 4 +--
arch/mips/loongson32/ls1c/board.c | 1 -
arch/parisc/kernel/head.S | 43 +++++++++++++++++++++++++++-
arch/s390/include/asm/hugetlb.h | 6 ++--
arch/s390/kernel/vmlinux.lds.S | 1 +
arch/x86/include/asm/pmc_atom.h | 6 ++--
arch/x86/platform/atom/pmc_atom.c | 2 +-
drivers/base/dd.c | 10 +++++++
drivers/gpu/drm/radeon/radeon_device.c | 3 ++
drivers/hwmon/gpio-fan.c | 3 ++
drivers/parisc/ccio-dma.c | 11 +++++--
drivers/scsi/mpt3sas/mpt3sas_scsih.c | 2 +-
drivers/staging/rtl8712/rtl8712_cmd.c | 36 -----------------------
drivers/tty/serial/fsl_lpuart.c | 4 +--
drivers/tty/vt/vt.c | 12 +++++---
drivers/usb/class/cdc-acm.c | 3 ++
drivers/usb/core/hub.c | 10 +++++++
drivers/usb/dwc3/core.c | 20 ++++++-------
drivers/usb/gadget/function/storage_common.c | 6 ++--
drivers/usb/host/xhci-hub.c | 11 +++++++
drivers/usb/host/xhci.c | 4 ++-
drivers/usb/host/xhci.h | 2 +-
drivers/usb/serial/cp210x.c | 1 +
drivers/usb/serial/ftdi_sio.c | 2 ++
drivers/usb/serial/ftdi_sio_ids.h | 6 ++++
drivers/usb/serial/option.c | 11 +++++++
drivers/usb/storage/unusual_devs.h | 7 +++++
drivers/video/fbdev/chipsfb.c | 1 +
drivers/video/fbdev/pm2fb.c | 5 ++++
include/linux/buffer_head.h | 11 +++++++
include/linux/usb.h | 2 ++
mm/kmemleak.c | 8 +++---
net/bridge/br_netfilter_hooks.c | 2 ++
net/bridge/br_netfilter_ipv6.c | 1 +
net/ipv4/tcp_input.c | 25 +++++++++++-----
net/mac80211/ibss.c | 4 +++
net/mac802154/rx.c | 2 +-
net/netfilter/nf_conntrack_irc.c | 5 ++--
net/sched/sch_sfb.c | 13 +++++----
net/sunrpc/xprt.c | 4 +--
net/tipc/monitor.c | 2 +-
net/wireless/debugfs.c | 3 +-
sound/core/seq/oss/seq_oss_midi.c | 2 ++
sound/core/seq/seq_clientmgr.c | 12 ++++----
sound/drivers/aloop.c | 7 +++--
sound/pci/emu10k1/emupcm.c | 2 +-
sound/usb/stream.c | 2 +-
47 files changed, 236 insertions(+), 104 deletions(-)
Syzbot reported an issue with ext4 extents. The reproducer creates
a corrupted ext4 fs image in memory, and mounts it as a loop device.
It invokes the ext4_cache_extents() and ext4_find_extent(), which
eventually triggers a BUG() in ext4_es_end() causing a kernel crash.
It triggers on mainline, and every kernel version back to v4.14.
Add a call to ext4_ext_check_inode() in ext4_find_extent() to prevent
the crash.
To: "Theodore Ts'o" <tytso(a)mit.edu>
Cc: "Andreas Dilger" <adilger.kernel(a)dilger.ca>
Cc: <linux-ext4(a)vger.kernel.org>
Cc: <linux-kernel(a)vger.kernel.org>
Cc: <stable(a)vger.kernel.org>
Link: https://syzkaller.appspot.com/bug?id=641e7a4b900015c5d7a729d6cc1fba7a928a88…
Reported-by: syzbot+a22dc4b0744ac658ed9b(a)syzkaller.appspotmail.com
Signed-off-by: Tadeusz Struk <tadeusz.struk(a)linaro.org>
---
fs/ext4/extents.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5235974126bd..c7b5a11e1abc 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -897,6 +897,12 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
goto err;
}
+ ret = ext4_ext_check_inode(inode);
+ if (ret) {
+ EXT4_ERROR_INODE(inode, "inode has invalid extent");
+ goto err;
+ }
+
if (path) {
ext4_ext_drop_refs(path);
if (depth > path[0].p_maxdepth) {
--
2.37.3
From: Alexander Sverdlin <alexander.sverdlin(a)nokia.com>
Erase can be zeroed in spi_nor_parse_4bait() or
spi_nor_init_non_uniform_erase_map(). In practice it happened with
mt25qu256a, which supports 4K, 32K, 64K erases with 3b address commands,
but only 4K and 64K erase with 4b address commands.
Fixes: dc92843159a7 ("mtd: spi-nor: fix erase_type array to indicate current map conf")
Cc: stable(a)vger.kernel.org
Signed-off-by: Alexander Sverdlin <alexander.sverdlin(a)nokia.com>
---
Changes in v2:
erase->opcode -> erase->size
drivers/mtd/spi-nor/core.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index 88dd090..183ea9d 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -1400,6 +1400,8 @@ spi_nor_find_best_erase_type(const struct spi_nor_erase_map *map,
continue;
erase = &map->erase_type[i];
+ if (!erase->size)
+ continue;
/* Alignment is not mandatory for overlaid regions */
if (region->offset & SNOR_OVERLAID_REGION &&
--
2.10.2
The second (UID) strcmp in acpi_dev_hid_uid_match considers
"0" and "00" different, which can prevent device registration.
Have the AMD IOMMU driver's ivrs_acpihid parsing code remove
any leading zeroes to make the UID strcmp succeed. Now users
can safely specify "AMDxxxxx:00" or "AMDxxxxx:0" and expect
the same behaviour.
Fixes: ca3bf5d47cec ("iommu/amd: Introduces ivrs_acpihid kernel parameter")
Signed-off-by: Kim Phillips <kim.phillips(a)amd.com>
Cc: stable(a)vger.kernel.org
Cc: Suravee Suthikulpanit <Suravee.Suthikulpanit(a)amd.com>
Cc: Joerg Roedel <jroedel(a)suse.de>
---
v2: no changes
drivers/iommu/amd/init.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index fdc642362c14..ef0e1a4b5a11 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -3471,6 +3471,13 @@ static int __init parse_ivrs_acpihid(char *str)
return 1;
}
+ /*
+ * Ignore leading zeroes after ':', so e.g., AMDI0095:00
+ * will match AMDI0095:0 in the second strcmp in acpi_dev_hid_uid_match
+ */
+ while (*uid == '0' && *(uid + 1))
+ uid++;
+
i = early_acpihid_map_size++;
memcpy(early_acpihid_map[i].hid, hid, strlen(hid));
memcpy(early_acpihid_map[i].uid, uid, strlen(uid));
--
2.34.1
It is a bit unclear to us why that's helping, but it does and unbreaks
suspend/resume on a lot of GPUs without any known drawbacks.
Cc: stable(a)vger.kernel.org # v5.15+
Closes: https://gitlab.freedesktop.org/drm/nouveau/-/issues/156
Signed-off-by: Karol Herbst <kherbst(a)redhat.com>
---
drivers/gpu/drm/nouveau/nouveau_bo.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 35bb0bb3fe61..126b3c6e12f9 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -822,6 +822,15 @@ nouveau_bo_move_m2mf(struct ttm_buffer_object *bo, int evict,
if (ret == 0) {
ret = nouveau_fence_new(chan, false, &fence);
if (ret == 0) {
+ /* TODO: figure out a better solution here
+ *
+ * wait on the fence here explicitly as going through
+ * ttm_bo_move_accel_cleanup somehow doesn't seem to do it.
+ *
+ * Without this the operation can timeout and we'll fallback to a
+ * software copy, which might take several minutes to finish.
+ */
+ nouveau_fence_wait(fence, false, false);
ret = ttm_bo_move_accel_cleanup(bo,
&fence->base,
evict, false,
--
2.37.1
Hello,
I was profiling the 5.10 kernel and comparing it to 4.14. On a system with 64 virtual CPUs and 256 GiB of RAM, I am observing a significant drop in IO performance. Using the following FIO with the script "sudo ftest_write.sh <dev_name>" in attachment, I saw FIO iops result drop from 22K to less than 1K.
The script simply mounts the 16 GiB EXT4 volume (max IOPS 64000K) with the mount options "-o noatime,nodiratime,data=ordered", then runs fio with 2048 writing threads and a file size of 28800000, using { --name=16kb_rand_write_only_2048_jobs --directory=/rdsdbdata1 --rw=randwrite --ioengine=sync --buffered=1 --bs=16k --max-jobs=2048 --numjobs=2048 --runtime=60 --time_based --thread --filesize=28800000 --fsync=1 --group_reporting }.
My analysis is that the degradation was introduced by commit {244adf6426ee31a83f397b700d964cff12a247d3} and that the issue is contention on rsv_conversion_wq. The simplest option is to increase the journal size, but that introduces more operational complexity. Another option is to apply the following change from the attachment "allow more ext4-rsv-conversion workqueue.patch":
From 27e1b0e14275a281b3529f6a60c7b23a81356751 Mon Sep 17 00:00:00 2001
From: davinalu <davinalu(a)amazon.com>
Date: Fri, 23 Sep 2022 00:43:53 +0000
Subject: [PATCH] allow more ext4-rsv-conversion workqueue to speedup fio writing
---
fs/ext4/super.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a0af833f7da7..6b34298cdc3b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4963,7 +4963,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* concurrency isn't really necessary. Limit it to 1.
*/
EXT4_SB(sb)->rsv_conversion_wq =
- alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+ alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM |
+ WQ_UNBOUND | __WQ_ORDERED, 0);
if (!EXT4_SB(sb)->rsv_conversion_wq) {
printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
ret = -ENOMEM;
My thinking is: based on alloc_workqueue(), a max_active of 1 combined with WQ_UNBOUND implies "__WQ_ORDERED", so I added that flag explicitly.
I am not sure whether we need "__WQ_ORDERED" or not. Without "__WQ_ORDERED" it also appears to work on my testbed, but I added it since there is not much difference in fio throughput on my testbed with or without "__WQ_ORDERED".
From my understanding and observation: with dioread_nolock and delalloc both enabled, bio_endio() and ext4_writepages() will trigger this workqueue to run ext4_do_flush_completed_IO(). It looks like the workqueue processes updates one at a time: in EXT4's extents.c, the io_end->list_vec list only has one io_end_vec each time. So if BIO performance is high and we have only one thread doing the EXT4 flush, that thread becomes a bottleneck. If I understand correctly, the "ext4-rsv-conversion" workqueue is mainly for updating the EXT4_IO_END_UNWRITTEN extent blocks (which only exist when the dioread_nolock and delalloc options are set) and the extent status. Am I correct?
This works on my test system and passes xfstests, but will it cause any corruption in ext4 extent block updates? I am also not sure about its effect on journal transaction updates.
Can you tell me what I will break if this change is made?
Thanks
Davina
commit 573ae4f13f630d6660008f1974c0a8a29c30e18a upstream.
With special lengths supplied by user space, tee_shm_register() has
an integer overflow when calculating the number of pages covered by a
supplied user space memory region.
This may cause pin_user_pages_fast() to do a NULL pointer dereference.
Fix this by adding an explicit call to access_ok() in
tee_ioctl_shm_register() to catch an invalid user space address early.
Fixes: 033ddf12bcf5 ("tee: add register user memory")
Cc: stable(a)vger.kernel.org # 5.4
Cc: stable(a)vger.kernel.org # 5.10
Reported-by: Nimish Mishra <neelam.nimish(a)gmail.com>
Reported-by: Anirban Chakraborty <ch.anirban00727(a)gmail.com>
Reported-by: Debdeep Mukhopadhyay <debdeep.mukhopadhyay(a)gmail.com>
Suggested-by: Jerome Forissier <jerome.forissier(a)linaro.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
[JW: backport to stable 5.4 and 5.10 + update commit message]
Signed-off-by: Jens Wiklander <jens.wiklander(a)linaro.org>
---
drivers/tee/tee_core.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c
index a7ccd4d2bd10..2db144d2d26f 100644
--- a/drivers/tee/tee_core.c
+++ b/drivers/tee/tee_core.c
@@ -182,6 +182,9 @@ tee_ioctl_shm_register(struct tee_context *ctx,
if (data.flags)
return -EINVAL;
+ if (!access_ok((void __user *)(unsigned long)data.addr, data.length))
+ return -EFAULT;
+
shm = tee_shm_register(ctx, data.addr, data.length,
TEE_SHM_DMA_BUF | TEE_SHM_USER_MAPPED);
if (IS_ERR(shm))
--
2.31.1
If the starting position of our insert range happens to be in the hole
between two ext4_extent_idx, the lblk of every ext4_extent in the
previous ext4_extent_idx is less than start. The "extent" variable is
then advanced past the boundary of the extent array, and the following
UAF is triggered:
==================================================================
BUG: KASAN: use-after-free in ext4_ext_shift_extents+0x257/0x790
Read of size 4 at addr ffff88819807a008 by task fallocate/8010
CPU: 3 PID: 8010 Comm: fallocate Tainted: G E 5.10.0+ #492
Call Trace:
dump_stack+0x7d/0xa3
print_address_description.constprop.0+0x1e/0x220
kasan_report.cold+0x67/0x7f
ext4_ext_shift_extents+0x257/0x790
ext4_insert_range+0x5b6/0x700
ext4_fallocate+0x39e/0x3d0
vfs_fallocate+0x26f/0x470
ksys_fallocate+0x3a/0x70
__x64_sys_fallocate+0x4f/0x60
do_syscall_64+0x33/0x40
entry_SYSCALL_64_after_hwframe+0x44/0xa9
==================================================================
For right shifts, we can divide them into the following situations:
1. When the first ee_block of ext4_extent_idx is greater than or equal to
start, make right shifts directly from the first ee_block.
1) If it is greater than start, we need to continue searching in the
previous ext4_extent_idx.
2) If it is equal to start, we can exit the loop (iterator=NULL).
2. When the first ee_block of ext4_extent_idx is less than start, then
traverse from the last extent to find the first extent whose ee_block
is less than start.
1) If extent is still the last extent after traversal, it means that
the last ee_block of ext4_extent_idx is less than start, that is,
start is located in the hole between idx and (idx+1), so we can
exit the loop directly (break) without right shifts.
2) Otherwise, make right shifts at the corresponding position of the
found extent, and then exit the loop (iterator=NULL).
Fixes: 331573febb6a ("ext4: Add support FALLOC_FL_INSERT_RANGE for fallocate")
Cc: stable(a)vger.kernel.org # v4.2+
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
---
V1->V2:
Initialize "ret" after the "again:" label to avoid return value mismatch.
Refactoring reduces cycles and makes code more readable.
fs/ext4/extents.c | 18 +++++++++++++-----
1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c148bb97b527..39c9f87de0be 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5179,6 +5179,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
* and it is decreased till we reach start.
*/
again:
+ ret = 0;
if (SHIFT == SHIFT_LEFT)
iterator = &start;
else
@@ -5222,14 +5223,21 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
ext4_ext_get_actual_len(extent);
} else {
extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
- if (le32_to_cpu(extent->ee_block) > 0)
+ if (le32_to_cpu(extent->ee_block) > start)
*iterator = le32_to_cpu(extent->ee_block) - 1;
- else
- /* Beginning is reached, end of the loop */
+ else if (le32_to_cpu(extent->ee_block) == start)
iterator = NULL;
- /* Update path extent in case we need to stop */
- while (le32_to_cpu(extent->ee_block) < start)
+ else {
+ extent = EXT_LAST_EXTENT(path[depth].p_hdr);
+ while (le32_to_cpu(extent->ee_block) >= start)
+ extent--;
+
+ if (extent == EXT_LAST_EXTENT(path[depth].p_hdr))
+ break;
+
extent++;
+ iterator = NULL;
+ }
path[depth].p_ext = extent;
}
ret = ext4_ext_shift_path_extents(path, shift, inode,
--
2.31.1