My local syzbot instance hit a memory leak in usb_set_configuration().
The problem was a usb interface reference that was never put. In case of
errors after usb_get_intf() the reference should be put to correctly free
the memory allocated for this interface.
Fixes: ec16dae5453e ("V4L/DVB (7019): V4L: add support for Syntek DC1125 webcams")
Cc: stable(a)vger.kernel.org
Signed-off-by: Pavel Skripkin <paskripkin(a)gmail.com>
---
drivers/media/usb/stkwebcam/stk-webcam.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/media/usb/stkwebcam/stk-webcam.c b/drivers/media/usb/stkwebcam/stk-webcam.c
index a45d464427c4..0e231e576dc3 100644
--- a/drivers/media/usb/stkwebcam/stk-webcam.c
+++ b/drivers/media/usb/stkwebcam/stk-webcam.c
@@ -1346,7 +1346,7 @@ static int stk_camera_probe(struct usb_interface *interface,
if (!dev->isoc_ep) {
pr_err("Could not find isoc-in endpoint\n");
err = -ENODEV;
- goto error;
+ goto error_put;
}
dev->vsettings.palette = V4L2_PIX_FMT_RGB565;
dev->vsettings.mode = MODE_VGA;
@@ -1359,10 +1359,12 @@ static int stk_camera_probe(struct usb_interface *interface,
err = stk_register_video_device(dev);
if (err)
- goto error;
+ goto error_put;
return 0;
+error_put:
+ usb_put_intf(interface);
error:
v4l2_ctrl_handler_free(hdl);
v4l2_device_unregister(&dev->v4l2_dev);
--
2.32.0
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA512
I'm announcing the release of the 5.4.130 kernel.
All users of the 5.4 kernel series must upgrade.
The updated 5.4.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-5.4.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
Thanks,
Sasha
- ------------
Makefile | 2 +-
drivers/gpio/Kconfig | 2 ++
drivers/gpu/drm/nouveau/nouveau_bo.c | 4 ++--
drivers/infiniband/hw/mlx5/flow.c | 8 ++++++++
drivers/scsi/sr.c | 2 ++
security/integrity/platform_certs/load_uefi.c | 5 -----
6 files changed, 15 insertions(+), 8 deletions(-)
Christian König (1):
drm/nouveau: fix dma_address check for CPU/GPU sync
Johannes Berg (1):
gpio: AMD8111 and TQMX86 require HAS_IOPORT_MAP
ManYi Li (1):
scsi: sr: Return appropriate error code when disk is ejected
Mark Bloch (1):
RDMA/mlx5: Block FDB rules when not in switchdev mode
Sasha Levin (1):
Linux 5.4.130
YueHaibing (1):
x86/efi: remove unused variables
-----BEGIN PGP SIGNATURE-----
iQIzBAEBCgAdFiEE4n5dijQDou9mhzu83qZv95d3LNwFAmDlopkACgkQ3qZv95d3
LNxXEg//cOzICk7j6/qzijKVxiuJC3YBJ99U05fap7EjShpomDrxs9q4yhbOeBAn
Q7GdF8XDa4si/32T0fAojLKLWMvGN6gwjIm1mKFPcSte2lLiO3KnlQubjdo2dwJi
VFtmkMd+Op+77WhzFBpEYRiFisnzg4376eQIp/rzxC64faVkbp8mobtq0KCTIvqv
OT05R1Z/DMR2pl5vI1ho592oVvl4FbWWZJRcTc05RAzdPpOsFqmJZkIua5Q0GQzo
iovxwsVz7cH7RMcXIQ65emuNFIFlB1qfKfA1VR/WPxuj/gmtOJth2mU0+BO5qSYK
9z1mbCUFjEDjgI2dBshNzlKIERoo6tExWiy6BOmuQyf5KTzJLpaLtLXvxygak6GB
FT3rGmcV85ec4wAgHeFgBeEJWyMgmgiNopx0intVkNwhx7Ulsg3aB8vrMwn+QJSN
2kPA/rip8VEignB2wzw0o/0yiAp2tjszcpwTFVIjo7JnfzjDyhzANdslEFu+yjUm
FomT+mXe/yGfoKHl8YqXYaVntu/YGbocd2pGg5337zwFimR88YfVpfAezTmnrg8L
8xI8q7v3b8vcXzxCbLE801tMuU1gKFj9m7/DAWp+F/ak9jV0M7lelnpGF3lr4DDP
3MU6J+Y4faiC2LT6KcRaBkoPXPXVdYjH9yYA+l8jwaAbDK3fFCE=
=60Aw
-----END PGP SIGNATURE-----
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA512
I'm announcing the release of the 5.10.48 kernel.
All users of the 5.10 kernel series must upgrade.
The updated 5.10.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-5.10.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
Thanks,
Sasha
- ------------
Makefile | 2 +-
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu/mmu.c | 1 +
drivers/gpio/Kconfig | 2 ++
drivers/gpio/gpio-mxc.c | 2 +-
drivers/gpu/drm/nouveau/nouveau_bo.c | 4 ++--
drivers/infiniband/hw/mlx5/fs.c | 7 +++++++
drivers/scsi/sr.c | 2 ++
8 files changed, 17 insertions(+), 4 deletions(-)
Christian König (1):
drm/nouveau: fix dma_address check for CPU/GPU sync
Johannes Berg (1):
gpio: AMD8111 and TQMX86 require HAS_IOPORT_MAP
Loic Poulain (1):
gpio: mxc: Fix disabled interrupt wake-up support
ManYi Li (1):
scsi: sr: Return appropriate error code when disk is ejected
Mark Bloch (1):
RDMA/mlx5: Block FDB rules when not in switchdev mode
Sasha Levin (1):
Linux 5.10.48
Sean Christopherson (1):
Revert "KVM: x86/mmu: Drop kvm_mmu_extended_role.cr4_la57 hack"
-----BEGIN PGP SIGNATURE-----
iQIzBAEBCgAdFiEE4n5dijQDou9mhzu83qZv95d3LNwFAmDloocACgkQ3qZv95d3
LNxeRQ/9HzdDMiAxylshAGkKq+cx9Wxk3LdksMUWwfskZKnCs5rwCWT3zSafGDdX
bePEtdVsFGBEnKc0SZ2O/kR0plG967vGUSllB27QYa/e+QTCQt2JIn4X8J+s64hj
XS1Z39+sXYy6YNjam2DgPrG9nEDJLNkxO7E/4hhcPhToDXKtn2Zk+ZBpp3qVPOJa
cW/OXRSm0dlDWvDs8ELOzLoPpI67BMXUI+GhROZOgPP97aADbC2Tucy8QtAJilFa
jb5a8vZhvYIzmnKLcKNc+bACpbPtNyRq4glO/AeGlQHCIyZe06D5+fGQNU5r5oCy
DaViYl6sx/TpHHqSBcTnMmhCIbo5KPYvyMiYKlhEieAQDbuQX7VZSVRo2IANOqm3
yvX3A7vRRF5Pbvg2dGhubLlhB9tsP67/j9AZ14B5IIjapQAoLct19F1W5C7ochHr
CNyUAmusfneXyLb9ZOW1GRiP1oSiOB5NZMF380ObSqxQUP0YoH7A/H9YMGiiScyl
5jZzJ/bbQzQIjz72eyVca88wEWtpJynQDRTqRRbm3OPZ3C6OSXsOIB0X7RRfM/EF
PMMS7hH5INPvpsMvurgyHsUtlDm8JBe3nQUchf3bhqPHsRVJqwfO9IpCjwym1/o9
g7gVvaSl5airLyVJf8k4Q8WMNywkD92g8xE9Al51Rk3dn4uNYJk=
=4Xvv
-----END PGP SIGNATURE-----
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA512
I'm announcing the release of the 5.12.15 kernel.
All users of the 5.12 kernel series must upgrade.
The updated 5.12.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-5.12.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
Thanks,
Sasha
- ------------
Makefile | 2 +-
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu/mmu.c | 1 +
drivers/gpio/Kconfig | 2 ++
drivers/gpio/gpio-mxc.c | 2 +-
drivers/gpu/drm/nouveau/nouveau_bo.c | 4 ++--
drivers/s390/crypto/vfio_ap_ops.c | 10 ----------
drivers/scsi/sr.c | 2 ++
8 files changed, 10 insertions(+), 14 deletions(-)
Christian König (1):
drm/nouveau: fix dma_address check for CPU/GPU sync
Johannes Berg (1):
gpio: AMD8111 and TQMX86 require HAS_IOPORT_MAP
Loic Poulain (1):
gpio: mxc: Fix disabled interrupt wake-up support
ManYi Li (1):
scsi: sr: Return appropriate error code when disk is ejected
Sasha Levin (1):
Linux 5.12.15
Sean Christopherson (1):
Revert "KVM: x86/mmu: Drop kvm_mmu_extended_role.cr4_la57 hack"
Tony Krowiak (1):
s390/vfio-ap: clean up mdev resources when remove callback invoked
-----BEGIN PGP SIGNATURE-----
iQIzBAEBCgAdFiEE4n5dijQDou9mhzu83qZv95d3LNwFAmDlonYACgkQ3qZv95d3
LNxTRQ/+KvXQgHRTDTcMYUU3hj6OkXbR2t/oz2lsWKdXFPqirN9TF5D0S7El1Xh7
PKhQmvozwKiJhfwE3J0QYmIkce5rm/DL2tcvrBHo4O1L883/xv5rd4St2JBHJ4ns
B3BgJ7VwdsxtVU8d2Rgogpxawwh4Tf1peSL08UwXgh3rF4554O9N6OuuSydFX552
Bd01LBkdmEhj0Wm1ma1WF5u86Pk/W8BYhtrUlcmpydAMnz3s5X459e8wSbKbP7Mp
vtPOx9MltRaqKRcyVH1Yb5Ryy6Q/TBKWtFfB9hnlMcU7VYAVCRxKxEFuMhia7ZES
DcmF8XRQx9+mnbvOlRXv0BMVSHbfbg9OjfZklCKJ6kTcyTODvCMx395UBVxNbiMB
pvc3XNUxx8bmO/LwzfP+7Bw74AQumpVMnS4aJkffaOPRxvep32NJ2QZUgU/nIs2Y
9EaToL+srZljcnCfR/KmCvpBbAuJp6cEcrd3SxhuSj42/1O/xa8tM+DflyoDUNXu
/EUbSt2ncGdcGy0DmLyIkV31YcWosdS9LIrqAjozbJXjaEdvfk9l0XxOvgb0udJF
MIEjrSmz7voMkd+8D1RjejzztcbCXNx8OhJNreOvCWaRrVSGv8+qtFOcKDAEKc4Q
dML5t7hGeusLY+KQ592vdGG0RAUpka6q/q48Pp70/jF0PS4GyhU=
=gaYx
-----END PGP SIGNATURE-----
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA512
I'm announcing the release of the 5.13.1 kernel.
All users of the 5.13 kernel series must upgrade.
The updated 5.13.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-5.13.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
Thanks,
Sasha
- ------------
Makefile | 2 +-
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu/mmu.c | 1 +
mm/page_alloc.c | 2 +-
4 files changed, 4 insertions(+), 2 deletions(-)
Mel Gorman (1):
mm/page_alloc: correct return value of populated elements if bulk array is populated
Sasha Levin (1):
Linux 5.13.1
Sean Christopherson (1):
Revert "KVM: x86/mmu: Drop kvm_mmu_extended_role.cr4_la57 hack"
-----BEGIN PGP SIGNATURE-----
iQIzBAEBCgAdFiEE4n5dijQDou9mhzu83qZv95d3LNwFAmDlolkACgkQ3qZv95d3
LNyHdw//Yv2YwPIPDRntmmSPZE2E0MwOZu+W2B+OoKJHJ8MJeYpMwoNwObwqoa7u
7oxHuiUgQ7iTBcvDdvwLCDAKZkIa6sm5WAb3x5qzmvQNYIAWsQrsHo42A1yhMLIT
YMuo7GQkn5E9WNs+aOGrI38Ea91Ckdd5GcwsKRcXuY30phvnzTEVoXKdLzFJFOWk
O/8nbDT1X+msLxndzwW/Vl4AAiaYbnQpzSyhFiswD4r0jYL5yg1jc2UaWUxSP+yj
mbgZ2QQAHh1dvE4rzPyl6tmFhVDg+yiT6UO1f7tont8FA4dUdJ4sJDW2QINKXRxf
RwEY6fsU9kcpH2IfZa25Fi2E+kesKEeT0a/uCGJOYmZZkXfPtnlNHE4FVlKdtlRh
q761ApqR5JGOXSE08iSxTMXJNsbyGxG4CYP8vbUhYm/wIJbOqBCF9DOkH7sr5MA4
TyWuG4uexfmE00dWfljuAma07Sd4F+mILsC/55O9lQt80aiSRUsuoFM6e7FQmBgj
g7uSEzm476z/4B+v1ewjPfT6gw4KGahV9a6EfSztm/JdvotFaImkjdYneGuSkmaF
56rt3ZF+7LD05mK9/nQd713F2hOD6yVzyIiWiSzO9250qRMb3SR0xjaUC9iKpuQb
/VkcKU+yMLbZnZwoBONedk/gfnKpvjzo/9EBBWOLyln0FWVuBdA=
=QsF8
-----END PGP SIGNATURE-----
This is the start of the stable review cycle for the 5.10.48 release.
There are 7 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Wed 07 Jul 2021 10:59:49 AM UTC.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git/…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.10.y
and the diffstat can be found below.
Thanks,
Sasha
-------------
Pseudo-Shortlog of commits:
Christian König (1):
drm/nouveau: fix dma_address check for CPU/GPU sync
Johannes Berg (1):
gpio: AMD8111 and TQMX86 require HAS_IOPORT_MAP
Loic Poulain (1):
gpio: mxc: Fix disabled interrupt wake-up support
ManYi Li (1):
scsi: sr: Return appropriate error code when disk is ejected
Mark Bloch (1):
RDMA/mlx5: Block FDB rules when not in switchdev mode
Sasha Levin (1):
Linux 5.10.48-rc1
Sean Christopherson (1):
Revert "KVM: x86/mmu: Drop kvm_mmu_extended_role.cr4_la57 hack"
Makefile | 4 ++--
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu/mmu.c | 1 +
drivers/gpio/Kconfig | 2 ++
drivers/gpio/gpio-mxc.c | 2 +-
drivers/gpu/drm/nouveau/nouveau_bo.c | 4 ++--
drivers/infiniband/hw/mlx5/fs.c | 7 +++++++
drivers/scsi/sr.c | 2 ++
8 files changed, 18 insertions(+), 5 deletions(-)
--
2.30.2
This is the start of the stable review cycle for the 5.12.15 release.
There are 7 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Wed 07 Jul 2021 10:59:20 AM UTC.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git/…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.12.y
and the diffstat can be found below.
Thanks,
Sasha
-------------
Pseudo-Shortlog of commits:
Christian König (1):
drm/nouveau: fix dma_address check for CPU/GPU sync
Johannes Berg (1):
gpio: AMD8111 and TQMX86 require HAS_IOPORT_MAP
Loic Poulain (1):
gpio: mxc: Fix disabled interrupt wake-up support
ManYi Li (1):
scsi: sr: Return appropriate error code when disk is ejected
Sasha Levin (1):
Linux 5.12.15-rc1
Sean Christopherson (1):
Revert "KVM: x86/mmu: Drop kvm_mmu_extended_role.cr4_la57 hack"
Tony Krowiak (1):
s390/vfio-ap: clean up mdev resources when remove callback invoked
Makefile | 4 ++--
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu/mmu.c | 1 +
drivers/gpio/Kconfig | 2 ++
drivers/gpio/gpio-mxc.c | 2 +-
drivers/gpu/drm/nouveau/nouveau_bo.c | 4 ++--
drivers/s390/crypto/vfio_ap_ops.c | 10 ----------
drivers/scsi/sr.c | 2 ++
8 files changed, 11 insertions(+), 15 deletions(-)
--
2.30.2
This is the start of the stable review cycle for the 5.13.1 release.
There are 2 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Wed 07 Jul 2021 10:49:46 AM UTC.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git/…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.13.y
and the diffstat can be found below.
Thanks,
Sasha
-------------
Pseudo-Shortlog of commits:
Sasha Levin (1):
Linux 5.13.1-rc1
Sean Christopherson (1):
Revert "KVM: x86/mmu: Drop kvm_mmu_extended_role.cr4_la57 hack"
Makefile | 4 ++--
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu/mmu.c | 1 +
3 files changed, 4 insertions(+), 2 deletions(-)
--
2.30.2
We use the async_delalloc_pages mechanism to make sure that we've
completed our async work before trying to continue our delalloc
flushing. The reason for this is we need to see any ordered extents
that were created by our delalloc flushing. However we're waking up
before we do the submit work, which is before we create the ordered
extents. This is a pretty wide race window where we could potentially
think there are no ordered extents and thus exit shrink_delalloc
prematurely. Fix this by waking us up after we've done the work to
create ordered extents.
cc: stable(a)vger.kernel.org
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
---
fs/btrfs/inode.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b1f02e3fea5d..e388153c4ae4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1290,11 +1290,6 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
- /* atomic_sub_return implies a barrier */
- if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
- 5 * SZ_1M)
- cond_wake_up_nomb(&fs_info->async_submit_wait);
-
/*
* ->inode could be NULL if async_chunk_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
@@ -1303,6 +1298,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)
*/
if (async_chunk->inode)
submit_compressed_extents(async_chunk);
+
+ /* atomic_sub_return implies a barrier */
+ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+ 5 * SZ_1M)
+ cond_wake_up_nomb(&fs_info->async_submit_wait);
}
static noinline void async_cow_free(struct btrfs_work *work)
--
2.26.3
We have been hitting some early ENOSPC issues in production with more
recent kernels, and I tracked it down to us simply not flushing delalloc
as aggressively as we should be. With tracing I was seeing us failing
all tickets with all of the block rsvs at or around 0, with very little
pinned space, but still around 120MiB of outstanding bytes_may_used.
Upon further investigation I saw that we were flushing around 14 pages
per shrink call for delalloc, despite having around 2GiB of delalloc
outstanding.
Consider the example of an 8-way machine, all CPUs trying to create a
file in parallel, which at the time of this commit requires 5 items to
do. Assuming a 16k leaf size, we have 10MiB of total metadata reclaim
size waiting on reservations. Now assume we have 128MiB of delalloc
outstanding. With our current math we would set items to 20, and then
set to_reclaim to 20 * 256k, or 5MiB.
Assuming that we went through this loop all 3 times, for both
FLUSH_DELALLOC and FLUSH_DELALLOC_WAIT, and then did the full loop
twice, we'd only flush 60MiB of the 128MiB delalloc space. This could
leave a fair bit of delalloc reservations still hanging around by the
time we go to ENOSPC out all the remaining tickets.
Fix this in two ways. First, change the calculations to be a fraction of
the total delalloc bytes on the system. Prior to this change we were
calculating based on dirty inodes so our math made more sense, now it's
just completely unrelated to what we're actually doing.
Second add a FLUSH_DELALLOC_FULL state, that we hold off until we've
gone through the flush states at least once. This will empty the system
of all delalloc so we're sure to be truly out of space when we start
failing tickets.
I'm tagging stable 5.10 and forward, because this is where we started
using the page stuff heavily again. This affects earlier kernel
versions as well, but would be a pain to backport to them as the
flushing mechanisms aren't the same.
CC: stable(a)vger.kernel.org # 5.10+
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
---
fs/btrfs/ctree.h | 9 +++++----
fs/btrfs/space-info.c | 35 ++++++++++++++++++++++++++---------
include/trace/events/btrfs.h | 1 +
3 files changed, 32 insertions(+), 13 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d7ef4d7d2c1a..232ff1a49ca6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2783,10 +2783,11 @@ enum btrfs_flush_state {
FLUSH_DELAYED_REFS = 4,
FLUSH_DELALLOC = 5,
FLUSH_DELALLOC_WAIT = 6,
- ALLOC_CHUNK = 7,
- ALLOC_CHUNK_FORCE = 8,
- RUN_DELAYED_IPUTS = 9,
- COMMIT_TRANS = 10,
+ FLUSH_DELALLOC_FULL = 7,
+ ALLOC_CHUNK = 8,
+ ALLOC_CHUNK_FORCE = 9,
+ RUN_DELAYED_IPUTS = 10,
+ COMMIT_TRANS = 11,
};
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index af161eb808a2..0c539a94c6d9 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -494,6 +494,9 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
long time_left;
int loops;
+ delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+ ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+
/* Calc the number of the pages we need flush for space reservation */
if (to_reclaim == U64_MAX) {
items = U64_MAX;
@@ -501,19 +504,21 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
/*
* to_reclaim is set to however much metadata we need to
* reclaim, but reclaiming that much data doesn't really track
- * exactly, so increase the amount to reclaim by 2x in order to
- * make sure we're flushing enough delalloc to hopefully reclaim
- * some metadata reservations.
+ * exactly. What we really want to do is reclaim full inode's
+ * worth of reservations, however that's not available to us
+ * here. We will take a fraction of the delalloc bytes for our
+ * flushing loops and hope for the best. Delalloc will expand
+ * the amount we write to cover an entire dirty extent, which
+ * will reclaim the metadata reservation for that range. If
+ * it's not enough subsequent flush stages will be more
+ * aggressive.
*/
+ to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
}
trans = (struct btrfs_trans_handle *)current->journal_info;
- delalloc_bytes = percpu_counter_sum_positive(
- &fs_info->delalloc_bytes);
- ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
if (delalloc_bytes == 0 && ordered_bytes == 0)
return;
@@ -596,8 +601,11 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
+ case FLUSH_DELALLOC_FULL:
+ if (state == FLUSH_DELALLOC_FULL)
+ num_bytes = U64_MAX;
shrink_delalloc(fs_info, space_info, num_bytes,
- state == FLUSH_DELALLOC_WAIT, for_preempt);
+ state != FLUSH_DELALLOC, for_preempt);
break;
case FLUSH_DELAYED_REFS_NR:
case FLUSH_DELAYED_REFS:
@@ -907,6 +915,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
commit_cycles--;
}
+ /*
+ * We do not want to empty the system of delalloc unless we're
+ * under heavy pressure, so allow one trip through the flushing
+ * logic before we start doing a FLUSH_DELALLOC_FULL.
+ */
+ if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
+ flush_state++;
+
/*
* We don't want to force a chunk allocation until we've tried
* pretty hard to reclaim space. Think of the case where we
@@ -1070,7 +1086,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* so if we now have space to allocate do the force chunk allocation.
*/
static const enum btrfs_flush_state data_flush_states[] = {
- FLUSH_DELALLOC_WAIT,
+ FLUSH_DELALLOC_FULL,
RUN_DELAYED_IPUTS,
COMMIT_TRANS,
ALLOC_CHUNK_FORCE,
@@ -1159,6 +1175,7 @@ static const enum btrfs_flush_state evict_flush_states[] = {
FLUSH_DELAYED_REFS,
FLUSH_DELALLOC,
FLUSH_DELALLOC_WAIT,
+ FLUSH_DELALLOC_FULL,
ALLOC_CHUNK,
COMMIT_TRANS,
};
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 3d81ba8c37b9..ddf5c250726c 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -94,6 +94,7 @@ struct btrfs_space_info;
EM( FLUSH_DELAYED_ITEMS, "FLUSH_DELAYED_ITEMS") \
EM( FLUSH_DELALLOC, "FLUSH_DELALLOC") \
EM( FLUSH_DELALLOC_WAIT, "FLUSH_DELALLOC_WAIT") \
+ EM( FLUSH_DELALLOC_FULL, "FLUSH_DELALLOC_FULL") \
EM( FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR") \
EM( FLUSH_DELAYED_REFS, "FLUSH_ELAYED_REFS") \
EM( ALLOC_CHUNK, "ALLOC_CHUNK") \
--
2.26.3
This is the start of the stable review cycle for the 5.4.130 release.
There are 6 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Wed 07 Jul 2021 11:00:14 AM UTC.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git/…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.4.y
and the diffstat can be found below.
Thanks,
Sasha
-------------
Pseudo-Shortlog of commits:
Christian König (1):
drm/nouveau: fix dma_address check for CPU/GPU sync
Johannes Berg (1):
gpio: AMD8111 and TQMX86 require HAS_IOPORT_MAP
ManYi Li (1):
scsi: sr: Return appropriate error code when disk is ejected
Mark Bloch (1):
RDMA/mlx5: Block FDB rules when not in switchdev mode
Sasha Levin (1):
Linux 5.4.130-rc1
YueHaibing (1):
x86/efi: remove unused variables
Makefile | 4 ++--
drivers/gpio/Kconfig | 2 ++
drivers/gpu/drm/nouveau/nouveau_bo.c | 4 ++--
drivers/infiniband/hw/mlx5/flow.c | 8 ++++++++
drivers/scsi/sr.c | 2 ++
security/integrity/platform_certs/load_uefi.c | 5 -----
6 files changed, 16 insertions(+), 9 deletions(-)
--
2.30.2
The patch titled
Subject: ntfs: fix validity check for file name attribute
has been removed from the -mm tree. Its filename was
ntfs-fix-validity-check-for-file-name-attribute.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Desmond Cheong Zhi Xi <desmondcheongzx(a)gmail.com>
Subject: ntfs: fix validity check for file name attribute
When checking the file name attribute, we want to ensure that it fits
within the bounds of ATTR_RECORD. To do this, we should check that (attr
record + file name offset + file name length) < (attr record + attr record
length).
However, the original check did not include the file name offset in the
calculation. This means that corrupted on-disk metadata might not be caught
by the incorrect file name check, and could lead to an invalid memory access.
An example can be seen in the crash report of a memory corruption error
found by Syzbot:
https://syzkaller.appspot.com/bug?id=a1a1e379b225812688566745c3e2f7242bffc2…
Adding the file name offset to the validity check fixes this error and
passes the Syzbot reproducer test.
Link: https://lkml.kernel.org/r/20210614050540.289494-1-desmondcheongzx@gmail.com
Signed-off-by: Desmond Cheong Zhi Xi <desmondcheongzx(a)gmail.com>
Reported-by: syzbot+213ac8bb98f7f4420840(a)syzkaller.appspotmail.com
Tested-by: syzbot+213ac8bb98f7f4420840(a)syzkaller.appspotmail.com
Acked-by: Anton Altaparmakov <anton(a)tuxera.com>
Cc: Shuah Khan <skhan(a)linuxfoundation.org>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ntfs/inode.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/fs/ntfs/inode.c~ntfs-fix-validity-check-for-file-name-attribute
+++ a/fs/ntfs/inode.c
@@ -477,7 +477,7 @@ err_corrupt_attr:
}
file_name_attr = (FILE_NAME_ATTR*)((u8*)attr +
le16_to_cpu(attr->data.resident.value_offset));
- p2 = (u8*)attr + le32_to_cpu(attr->data.resident.value_length);
+ p2 = (u8 *)file_name_attr + le32_to_cpu(attr->data.resident.value_length);
if (p2 < (u8*)attr || p2 > p)
goto err_corrupt_attr;
/* This attribute is ok, but is it in the $Extend directory? */
_
Patches currently in -mm which might be from desmondcheongzx(a)gmail.com are
The patch titled
Subject: mm/page_alloc: correct return value of populated elements if bulk array is populated
has been removed from the -mm tree. Its filename was
mm-page_alloc-correct-return-value-of-populated-elements-if-bulk-array-is-populated.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Mel Gorman <mgorman(a)techsingularity.net>
Subject: mm/page_alloc: correct return value of populated elements if bulk array is populated
Dave Jones reported the following
This made it into 5.13 final, and completely breaks NFSD for me
(Serving tcp v3 mounts). Existing mounts on clients hang, as do
new mounts from new clients. Rebooting the server back to rc7
everything recovers.
The commit b3b64ebd3822 ("mm/page_alloc: do bulk array bounds check after
checking populated elements") returns the wrong value if the array is
already populated which is interpreted as an allocation failure. Dave
reported this fixes his problem and it also passed a test running dbench
over NFS.
Link: https://lkml.kernel.org/r/20210628150219.GC3840@techsingularity.net
Fixes: b3b64ebd3822 ("mm/page_alloc: do bulk array bounds check after checking populated elements")
Signed-off-by: Mel Gorman <mgorman(a)techsingularity.net>
Reported-by: Dave Jones <davej(a)codemonkey.org.uk>
Tested-by: Dave Jones <davej(a)codemonkey.org.uk>
Cc: Dan Carpenter <dan.carpenter(a)oracle.com>
Cc: Jesper Dangaard Brouer <brouer(a)redhat.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org> [5.13+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/page_alloc.c~mm-page_alloc-correct-return-value-of-populated-elements-if-bulk-array-is-populated
+++ a/mm/page_alloc.c
@@ -5058,7 +5058,7 @@ unsigned long __alloc_pages_bulk(gfp_t g
/* Already populated array? */
if (unlikely(page_array && nr_pages - nr_populated == 0))
- return 0;
+ return nr_populated;
/* Use the single page allocator for one page. */
if (nr_pages - nr_populated == 1)
_
Patches currently in -mm which might be from mgorman(a)techsingularity.net are
The patch titled
Subject: mm/page_alloc: fix memory map initialization for descending nodes
has been removed from the -mm tree. Its filename was
mm-page_alloc-fix-memory-map-initialization-for-descending-nodes.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Mike Rapoport <rppt(a)linux.ibm.com>
Subject: mm/page_alloc: fix memory map initialization for descending nodes
On systems with memory nodes sorted in descending order, for instance Dell
Precision WorkStation T5500, the struct pages for higher PFNs and
respectively lower nodes, could be overwritten by the initialization of
struct pages corresponding to the holes in the memory sections.
For example for the below memory layout
[ 0.245624] Early memory node ranges
[ 0.248496] node 1: [mem 0x0000000000001000-0x0000000000090fff]
[ 0.251376] node 1: [mem 0x0000000000100000-0x00000000dbdf8fff]
[ 0.254256] node 1: [mem 0x0000000100000000-0x0000001423ffffff]
[ 0.257144] node 0: [mem 0x0000001424000000-0x0000002023ffffff]
the range 0x1424000000 - 0x1428000000 in the beginning of node 0 starts in
the middle of a section and will be considered as a hole during the
initialization of the last section in node 1.
The wrong initialization of the memory map causes panic on boot when
CONFIG_DEBUG_VM is enabled.
Reorder the loops of the memory map initialization so that the outer loop
will always iterate over populated memory regions in ascending order
and the inner loop will select the zone corresponding to the PFN range.
This way initialization of the struct pages for the memory holes will be
always done for the ranges that are actually not populated.
[akpm(a)linux-foundation.org: coding style fixes]
Link: https://lkml.kernel.org/r/YNXlMqBbL+tBG7yq@kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=213073
Link: https://lkml.kernel.org/r/20210624062305.10940-1-rppt@kernel.org
Fixes: 0740a50b9baa ("mm/page_alloc.c: refactor initialization of struct page for holes in memory layout")
Signed-off-by: Mike Rapoport <rppt(a)linux.ibm.com>
Cc: Boris Petkov <bp(a)alien8.de>
Cc: Robert Shteynfeld <robert.shteynfeld(a)gmail.com>
Cc: Baoquan He <bhe(a)redhat.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mm.h | 1
mm/page_alloc.c | 96 ++++++++++++++++++++++++++-----------------
2 files changed, 59 insertions(+), 38 deletions(-)
--- a/include/linux/mm.h~mm-page_alloc-fix-memory-map-initialization-for-descending-nodes
+++ a/include/linux/mm.h
@@ -2474,7 +2474,6 @@ extern void set_dma_reserve(unsigned lon
extern void memmap_init_range(unsigned long, int, unsigned long,
unsigned long, unsigned long, enum meminit_context,
struct vmem_altmap *, int migratetype);
-extern void memmap_init_zone(struct zone *zone);
extern void setup_per_zone_wmarks(void);
extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
--- a/mm/page_alloc.c~mm-page_alloc-fix-memory-map-initialization-for-descending-nodes
+++ a/mm/page_alloc.c
@@ -6400,7 +6400,7 @@ void __ref memmap_init_zone_device(struc
return;
/*
- * The call to memmap_init_zone should have already taken care
+ * The call to memmap_init should have already taken care
* of the pages reserved for the memmap, so we can just jump to
* the end of that region and start processing the device pages.
*/
@@ -6465,7 +6465,7 @@ static void __meminit zone_init_free_lis
/*
* Only struct pages that correspond to ranges defined by memblock.memory
* are zeroed and initialized by going through __init_single_page() during
- * memmap_init_zone().
+ * memmap_init_zone_range().
*
* But, there could be struct pages that correspond to holes in
* memblock.memory. This can happen because of the following reasons:
@@ -6484,9 +6484,9 @@ static void __meminit zone_init_free_lis
* zone/node above the hole except for the trailing pages in the last
* section that will be appended to the zone/node below.
*/
-static u64 __meminit init_unavailable_range(unsigned long spfn,
- unsigned long epfn,
- int zone, int node)
+static void __init init_unavailable_range(unsigned long spfn,
+ unsigned long epfn,
+ int zone, int node)
{
unsigned long pfn;
u64 pgcnt = 0;
@@ -6502,56 +6502,77 @@ static u64 __meminit init_unavailable_ra
pgcnt++;
}
- return pgcnt;
+ if (pgcnt)
+ pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
+ node, zone_names[zone], pgcnt);
}
#else
-static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn,
- int zone, int node)
+static inline void init_unavailable_range(unsigned long spfn,
+ unsigned long epfn,
+ int zone, int node)
{
- return 0;
}
#endif
-void __meminit __weak memmap_init_zone(struct zone *zone)
+static void __init memmap_init_zone_range(struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long end_pfn,
+ unsigned long *hole_pfn)
{
unsigned long zone_start_pfn = zone->zone_start_pfn;
unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
- int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone);
- static unsigned long hole_pfn;
+ int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
+
+ start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
+
+ if (start_pfn >= end_pfn)
+ return;
+
+ memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
+ zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+
+ if (*hole_pfn < start_pfn)
+ init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
+
+ *hole_pfn = end_pfn;
+}
+
+static void __init memmap_init(void)
+{
unsigned long start_pfn, end_pfn;
- u64 pgcnt = 0;
+ unsigned long hole_pfn = 0;
+ int i, j, zone_id, nid;
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
- start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
- end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ struct pglist_data *node = NODE_DATA(nid);
+
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ struct zone *zone = node->node_zones + j;
+
+ if (!populated_zone(zone))
+ continue;
- if (end_pfn > start_pfn)
- memmap_init_range(end_pfn - start_pfn, nid,
- zone_id, start_pfn, zone_end_pfn,
- MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
-
- if (hole_pfn < start_pfn)
- pgcnt += init_unavailable_range(hole_pfn, start_pfn,
- zone_id, nid);
- hole_pfn = end_pfn;
+ memmap_init_zone_range(zone, start_pfn, end_pfn,
+ &hole_pfn);
+ zone_id = j;
+ }
}
#ifdef CONFIG_SPARSEMEM
/*
- * Initialize the hole in the range [zone_end_pfn, section_end].
- * If zone boundary falls in the middle of a section, this hole
- * will be re-initialized during the call to this function for the
- * higher zone.
+ * Initialize the memory map for hole in the range [memory_end,
+ * section_end].
+ * Append the pages in this hole to the highest zone in the last
+ * node.
+ * The call to init_unavailable_range() is outside the ifdef to
+ * silence the compiler warning about zone_id set but not used;
+ * for FLATMEM it is a nop anyway
*/
- end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION);
+ end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
if (hole_pfn < end_pfn)
- pgcnt += init_unavailable_range(hole_pfn, end_pfn,
- zone_id, nid);
#endif
-
- if (pgcnt)
- pr_info(" %s zone: %llu pages in unavailable ranges\n",
- zone->name, pgcnt);
+ init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}
static int zone_batchsize(struct zone *zone)
@@ -7254,7 +7275,6 @@ static void __init free_area_init_core(s
set_pageblock_order();
setup_usemap(zone);
init_currently_empty_zone(zone, zone->zone_start_pfn, size);
- memmap_init_zone(zone);
}
}
@@ -7780,6 +7800,8 @@ void __init free_area_init(unsigned long
node_set_state(nid, N_MEMORY);
check_for_memory(pgdat, nid);
}
+
+ memmap_init();
}
static int __init cmdline_parse_core(char *p, unsigned long *core,
_
Patches currently in -mm which might be from rppt(a)linux.ibm.com are
mmap-make-mlock_future_check-global.patch
riscv-kconfig-make-direct-map-manipulation-options-depend-on-mmu.patch
set_memory-allow-querying-whether-set_direct_map_-is-actually-enabled.patch
mm-introduce-memfd_secret-system-call-to-create-secret-memory-areas.patch
pm-hibernate-disable-when-there-are-active-secretmem-users.patch
arch-mm-wire-up-memfd_secret-system-call-where-relevant.patch
secretmem-test-add-basic-selftest-for-memfd_secret2.patch
The patch titled
Subject: mm/gup: fix try_grab_compound_head() race with split_huge_page()
has been removed from the -mm tree. Its filename was
mm-gup-fix-try_grab_compound_head-race-with-split_huge_page.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Jann Horn <jannh(a)google.com>
Subject: mm/gup: fix try_grab_compound_head() race with split_huge_page()
try_grab_compound_head() is used to grab a reference to a page from
get_user_pages_fast(), which is only protected against concurrent freeing
of page tables (via local_irq_save()), but not against concurrent TLB
flushes, freeing of data pages, or splitting of compound pages.
Because no reference is held to the page when try_grab_compound_head() is
called, the page may have been freed and reallocated by the time its
refcount has been elevated; therefore, once we're holding a stable
reference to the page, the caller re-checks whether the PTE still points
to the same page (with the same access rights).
The problem is that try_grab_compound_head() has to grab a reference on
the head page; but between the time we look up what the head page is and
the time we actually grab a reference on the head page, the compound page
may have been split up (either explicitly through split_huge_page() or by
freeing the compound page to the buddy allocator and then allocating its
individual order-0 pages). If that happens, get_user_pages_fast() may end
up returning the right page but lifting the refcount on a now-unrelated
page, leading to use-after-free of pages.
To fix it: Re-check whether the pages still belong together after lifting
the refcount on the head page. Move anything else that checks
compound_head(page) below the refcount increment.
This can't actually happen on bare-metal x86 (because there, disabling
IRQs locks out remote TLB flushes), but it can happen on virtualized x86
(e.g. under KVM) and probably also on arm64. The race window is pretty
narrow, and constantly allocating and shattering hugepages isn't exactly
fast; for now I've only managed to reproduce this in an x86 KVM guest with
an artificially widened timing window (by adding a loop that repeatedly
calls `inl(0x3f8 + 5)` in `try_get_compound_head()` to force VM exits, so
that PV TLB flushes are used instead of IPIs).
As requested on the list, also replace the existing VM_BUG_ON_PAGE() with
a warning and bailout. Since the existing code only performed the BUG_ON
check on DEBUG_VM kernels, ensure that the new code also only performs the
check under that configuration - I don't want to mix two logically
separate changes together too much. The macro VM_WARN_ON_ONCE_PAGE()
doesn't return a value on !DEBUG_VM, so wrap the whole check in an #ifdef
block. An alternative would be to change the VM_WARN_ON_ONCE_PAGE()
definition for !DEBUG_VM such that it always returns false, but since that
would differ from the behavior of the normal WARN macros, it might be too
confusing for readers.
Link: https://lkml.kernel.org/r/20210615012014.1100672-1-jannh@google.com
Fixes: 7aef4172c795 ("mm: handle PTE-mapped tail pages in gerneric fast gup implementaiton")
Signed-off-by: Jann Horn <jannh(a)google.com>
Reviewed-by: John Hubbard <jhubbard(a)nvidia.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Kirill A. Shutemov <kirill(a)shutemov.name>
Cc: Jan Kara <jack(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/gup.c | 58 +++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 43 insertions(+), 15 deletions(-)
--- a/mm/gup.c~mm-gup-fix-try_grab_compound_head-race-with-split_huge_page
+++ a/mm/gup.c
@@ -44,6 +44,23 @@ static void hpage_pincount_sub(struct pa
atomic_sub(refs, compound_pincount_ptr(page));
}
+/* Equivalent to calling put_page() @refs times. */
+static void put_page_refs(struct page *page, int refs)
+{
+#ifdef CONFIG_DEBUG_VM
+ if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page))
+ return;
+#endif
+
+ /*
+ * Calling put_page() for each ref is unnecessarily slow. Only the last
+ * ref needs a put_page().
+ */
+ if (refs > 1)
+ page_ref_sub(page, refs - 1);
+ put_page(page);
+}
+
/*
* Return the compound head page with ref appropriately incremented,
* or NULL if that failed.
@@ -56,6 +73,21 @@ static inline struct page *try_get_compo
return NULL;
if (unlikely(!page_cache_add_speculative(head, refs)))
return NULL;
+
+ /*
+ * At this point we have a stable reference to the head page; but it
+ * could be that between the compound_head() lookup and the refcount
+ * increment, the compound page was split, in which case we'd end up
+ * holding a reference on a page that has nothing to do with the page
+ * we were given anymore.
+ * So now that the head page is stable, recheck that the pages still
+ * belong together.
+ */
+ if (unlikely(compound_head(page) != head)) {
+ put_page_refs(head, refs);
+ return NULL;
+ }
+
return head;
}
@@ -96,6 +128,14 @@ __maybe_unused struct page *try_grab_com
return NULL;
/*
+ * CAUTION: Don't use compound_head() on the page before this
+ * point, the result won't be stable.
+ */
+ page = try_get_compound_head(page, refs);
+ if (!page)
+ return NULL;
+
+ /*
* When pinning a compound page of order > 1 (which is what
* hpage_pincount_available() checks for), use an exact count to
* track it, via hpage_pincount_add/_sub().
@@ -103,15 +143,10 @@ __maybe_unused struct page *try_grab_com
* However, be sure to *also* increment the normal page refcount
* field at least once, so that the page really is pinned.
*/
- if (!hpage_pincount_available(page))
- refs *= GUP_PIN_COUNTING_BIAS;
-
- page = try_get_compound_head(page, refs);
- if (!page)
- return NULL;
-
if (hpage_pincount_available(page))
hpage_pincount_add(page, refs);
+ else
+ page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1));
mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
orig_refs);
@@ -135,14 +170,7 @@ static void put_compound_head(struct pag
refs *= GUP_PIN_COUNTING_BIAS;
}
- VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
- /*
- * Calling put_page() for each ref is unnecessarily slow. Only the last
- * ref needs a put_page().
- */
- if (refs > 1)
- page_ref_sub(page, refs - 1);
- put_page(page);
+ put_page_refs(page, refs);
}
/**
_
Patches currently in -mm which might be from jannh(a)google.com are
Function ceph_check_delayed_caps() is called from the mdsc->delayed_work
workqueue and it can be kept looping for quite some time if caps keep
being added back to the mdsc->cap_delay_list. This may result in the
watchdog tainting the kernel with the softlockup flag.
This patch breaks this loop if the caps have been added recently (i.e. during
the loop execution). Any new caps added to the list will be handled in
the next run.
Cc: stable(a)vger.kernel.org
Link: https://tracker.ceph.com/issues/46284
Signed-off-by: Luis Henriques <lhenriques(a)suse.de>
---
fs/ceph/caps.c | 17 ++++++++++++++++-
fs/ceph/mds_client.c | 7 ++++---
fs/ceph/super.h | 2 +-
3 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a5e93b185515..c79b8dff25d7 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4224,11 +4224,19 @@ void ceph_handle_caps(struct ceph_mds_session *session,
/*
* Delayed work handler to process end of delayed cap release LRU list.
+ *
+ * If new caps are added to the list while processing it, these won't get
+ * processed in this run. In this case, the ci->i_hold_caps_max will be
+ * returned so that the work can be scheduled accordingly.
*/
-void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
+unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
struct inode *inode;
struct ceph_inode_info *ci;
+ struct ceph_mount_options *opt = mdsc->fsc->mount_options;
+ unsigned long delay_max = opt->caps_wanted_delay_max * HZ;
+ unsigned long loop_start = jiffies;
+ unsigned long delay = 0;
dout("check_delayed_caps\n");
spin_lock(&mdsc->cap_delay_lock);
@@ -4236,6 +4244,11 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
ci = list_first_entry(&mdsc->cap_delay_list,
struct ceph_inode_info,
i_cap_delay_list);
+ if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) {
+ dout("%s caps added recently. Exiting loop", __func__);
+ delay = ci->i_hold_caps_max;
+ break;
+ }
if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
time_before(jiffies, ci->i_hold_caps_max))
break;
@@ -4252,6 +4265,8 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
}
}
spin_unlock(&mdsc->cap_delay_lock);
+
+ return delay;
}
/*
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f5dc58a05f9f..a6f985786d68 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4519,11 +4519,12 @@ static void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay)
static void delayed_work(struct work_struct *work)
{
- int i;
struct ceph_mds_client *mdsc =
container_of(work, struct ceph_mds_client, delayed_work.work);
+ unsigned long delay;
int renew_interval;
int renew_caps;
+ int i;
dout("mdsc delayed_work\n");
@@ -4563,7 +4564,7 @@ static void delayed_work(struct work_struct *work)
}
mutex_unlock(&mdsc->mutex);
- ceph_check_delayed_caps(mdsc);
+ delay = ceph_check_delayed_caps(mdsc);
ceph_queue_cap_reclaim_work(mdsc);
@@ -4571,7 +4572,7 @@ static void delayed_work(struct work_struct *work)
maybe_recover_session(mdsc);
- schedule_delayed(mdsc, 0);
+ schedule_delayed(mdsc, delay);
}
int ceph_mdsc_init(struct ceph_fs_client *fsc)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 839e6b0239ee..3b5207c82767 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1170,7 +1170,7 @@ extern void ceph_flush_snaps(struct ceph_inode_info *ci,
extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session);
-extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
+extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
extern int ceph_drop_caps_for_unlink(struct inode *inode);
extern int ceph_encode_inode_release(void **p, struct inode *inode,
From: Tian Tao <tiantao6(a)hisilicon.com>
[ Upstream commit 7d614ab2f20503ed8766363d41f8607337571adf ]
fixed the below warning:
drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c:84:2-8: WARNING: NULL check
before some freeing functions is not needed.
Signed-off-by: Tian Tao <tiantao6(a)hisilicon.com>
Acked-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: Lucas Stach <l.stach(a)pengutronix.de>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c
index 880b95511b98..1faa3da8c517 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c
@@ -86,8 +86,7 @@ static void etnaviv_gem_prime_release(struct etnaviv_gem_object *etnaviv_obj)
/* Don't drop the pages for imported dmabuf, as they are not
* ours, just free the array we allocated:
*/
- if (etnaviv_obj->pages)
- kvfree(etnaviv_obj->pages);
+ kvfree(etnaviv_obj->pages);
drm_prime_gem_destroy(&etnaviv_obj->base, etnaviv_obj->sgt);
}
--
2.30.2
From: Tian Tao <tiantao6(a)hisilicon.com>
[ Upstream commit 7d614ab2f20503ed8766363d41f8607337571adf ]
fixed the below warning:
drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c:84:2-8: WARNING: NULL check
before some freeing functions is not needed.
Signed-off-by: Tian Tao <tiantao6(a)hisilicon.com>
Acked-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: Lucas Stach <l.stach(a)pengutronix.de>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c
index f24dd21c2363..9e657a096f45 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c
@@ -77,8 +77,7 @@ static void etnaviv_gem_prime_release(struct etnaviv_gem_object *etnaviv_obj)
/* Don't drop the pages for imported dmabuf, as they are not
* ours, just free the array we allocated:
*/
- if (etnaviv_obj->pages)
- kvfree(etnaviv_obj->pages);
+ kvfree(etnaviv_obj->pages);
drm_prime_gem_destroy(&etnaviv_obj->base, etnaviv_obj->sgt);
}
--
2.30.2
From: Tian Tao <tiantao6(a)hisilicon.com>
[ Upstream commit 7d614ab2f20503ed8766363d41f8607337571adf ]
fixed the below warning:
drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c:84:2-8: WARNING: NULL check
before some freeing functions is not needed.
Signed-off-by: Tian Tao <tiantao6(a)hisilicon.com>
Acked-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: Lucas Stach <l.stach(a)pengutronix.de>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c
index b390dd4d60b7..d741b1d735f7 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c
@@ -80,8 +80,7 @@ static void etnaviv_gem_prime_release(struct etnaviv_gem_object *etnaviv_obj)
/* Don't drop the pages for imported dmabuf, as they are not
* ours, just free the array we allocated:
*/
- if (etnaviv_obj->pages)
- kvfree(etnaviv_obj->pages);
+ kvfree(etnaviv_obj->pages);
drm_prime_gem_destroy(&etnaviv_obj->base, etnaviv_obj->sgt);
}
--
2.30.2
xfrm_bydst_resize() calls synchronize_rcu() while holding
hash_resize_mutex. But then on PREEMPT_RT configurations,
xfrm_policy_lookup_bytype() may acquire that mutex while running in an
RCU read side critical section. This results in a deadlock.
In fact the scope of hash_resize_mutex is way beyond the purpose of
xfrm_policy_lookup_bytype() to just fetch a coherent and stable policy
for a given destination/direction, along with other details.
The lower level net->xfrm.xfrm_policy_lock, which among other things
protects per destination/direction references to policy entries, is
enough to serialize and benefit from priority inheritance against the
write side. As a bonus, it makes it officially a per network namespace
synchronization business where a policy table resize on namespace A
shouldn't block a policy lookup on namespace B.
Fixes: 77cc278f7b20 (xfrm: policy: Use sequence counters with associated lock)
Cc: stable(a)vger.kernel.org
Cc: Ahmed S. Darwish <a.darwish(a)linutronix.de>
Cc: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: Varad Gautam <varad.gautam(a)suse.com>
Cc: Steffen Klassert <steffen.klassert(a)secunet.com>
Cc: Herbert Xu <herbert(a)gondor.apana.org.au>
Cc: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Frederic Weisbecker <frederic(a)kernel.org>
---
include/net/netns/xfrm.h | 1 +
net/xfrm/xfrm_policy.c | 17 ++++++++---------
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index e816b6a3ef2b..9b376b87bd54 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -74,6 +74,7 @@ struct netns_xfrm {
#endif
spinlock_t xfrm_state_lock;
seqcount_spinlock_t xfrm_state_hash_generation;
+ seqcount_spinlock_t xfrm_policy_hash_generation;
spinlock_t xfrm_policy_lock;
struct mutex xfrm_cfg_mutex;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index ce500f847b99..46a6d15b66d6 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -155,7 +155,6 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
__read_mostly;
static struct kmem_cache *xfrm_dst_cache __ro_after_init;
-static __read_mostly seqcount_mutex_t xfrm_policy_hash_generation;
static struct rhashtable xfrm_policy_inexact_table;
static const struct rhashtable_params xfrm_pol_inexact_params;
@@ -585,7 +584,7 @@ static void xfrm_bydst_resize(struct net *net, int dir)
return;
spin_lock_bh(&net->xfrm.xfrm_policy_lock);
- write_seqcount_begin(&xfrm_policy_hash_generation);
+ write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);
odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
lockdep_is_held(&net->xfrm.xfrm_policy_lock));
@@ -596,7 +595,7 @@ static void xfrm_bydst_resize(struct net *net, int dir)
rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst);
net->xfrm.policy_bydst[dir].hmask = nhashmask;
- write_seqcount_end(&xfrm_policy_hash_generation);
+ write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
synchronize_rcu();
@@ -1245,7 +1244,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
} while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));
spin_lock_bh(&net->xfrm.xfrm_policy_lock);
- write_seqcount_begin(&xfrm_policy_hash_generation);
+ write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);
/* make sure that we can insert the indirect policies again before
* we start with destructive action.
@@ -1354,7 +1353,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
out_unlock:
__xfrm_policy_inexact_flush(net);
- write_seqcount_end(&xfrm_policy_hash_generation);
+ write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
mutex_unlock(&hash_resize_mutex);
@@ -2095,9 +2094,9 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
rcu_read_lock();
retry:
do {
- sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
+ sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);
chain = policy_hash_direct(net, daddr, saddr, family, dir);
- } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
+ } while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence));
ret = NULL;
hlist_for_each_entry_rcu(pol, chain, bydst) {
@@ -2128,7 +2127,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
}
skip_inexact:
- if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence))
+ if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence))
goto retry;
if (ret && !xfrm_pol_hold_rcu(ret))
@@ -4084,6 +4083,7 @@ static int __net_init xfrm_net_init(struct net *net)
/* Initialize the per-net locks here */
spin_lock_init(&net->xfrm.xfrm_state_lock);
spin_lock_init(&net->xfrm.xfrm_policy_lock);
+ seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock);
mutex_init(&net->xfrm.xfrm_cfg_mutex);
rv = xfrm_statistics_init(net);
@@ -4128,7 +4128,6 @@ void __init xfrm_init(void)
{
register_pernet_subsys(&xfrm_net_ops);
xfrm_dev_init();
- seqcount_mutex_init(&xfrm_policy_hash_generation, &hash_resize_mutex);
xfrm_input_init();
#ifdef CONFIG_XFRM_ESPINTCP
--
2.25.1
From: Jérôme Glisse <jglisse(a)redhat.com>
We need to append the device id even if the eeprom has a label property set, as
some platforms can have multiple eeproms with the same label and we cannot
register each of those with the same label. Failing to register those eeproms
triggers cascading failures on such platforms (the system no longer works).
This fixes a regression on such platforms introduced with 4e302c3b568e
Signed-off-by: Jérôme Glisse <jglisse(a)redhat.com>
Cc: Diego Santa Cruz <Diego.SantaCruz(a)spinetix.com>
Cc: Bartosz Golaszewski <bgolaszewski(a)baylibre.com>
Cc: Jon Hunter <jonathanh(a)nvidia.com>
Cc: stable(a)vger.kernel.org
Cc: linux-i2c(a)vger.kernel.org
---
drivers/misc/eeprom/at24.c | 17 +++++++----------
1 file changed, 7 insertions(+), 10 deletions(-)
diff --git a/drivers/misc/eeprom/at24.c b/drivers/misc/eeprom/at24.c
index 7a6f01ace78a..305ffad131a2 100644
--- a/drivers/misc/eeprom/at24.c
+++ b/drivers/misc/eeprom/at24.c
@@ -714,23 +714,20 @@ static int at24_probe(struct i2c_client *client)
}
/*
- * If the 'label' property is not present for the AT24 EEPROM,
- * then nvmem_config.id is initialised to NVMEM_DEVID_AUTO,
- * and this will append the 'devid' to the name of the NVMEM
- * device. This is purely legacy and the AT24 driver has always
- * defaulted to this. However, if the 'label' property is
- * present then this means that the name is specified by the
- * firmware and this name should be used verbatim and so it is
- * not necessary to append the 'devid'.
+ * We initialize nvmem_config.id to NVMEM_DEVID_AUTO even if the
+ * label property is set as some platform can have multiple eeproms
+ * with same label and we can not register each of those with same
+ * label. Failing to register those eeproms trigger cascade failure
+ * on such platform.
*/
+ nvmem_config.id = NVMEM_DEVID_AUTO;
+
if (device_property_present(dev, "label")) {
- nvmem_config.id = NVMEM_DEVID_NONE;
err = device_property_read_string(dev, "label",
&nvmem_config.name);
if (err)
return err;
} else {
- nvmem_config.id = NVMEM_DEVID_AUTO;
nvmem_config.name = dev_name(dev);
}
--
2.31.1
Recently we encountered multiple #MCs on the same task before its
task_work_run() had been called. current->mce_kill_me was
added to the task_works list more than once, which made the
task_works list circular, so task_work_run() loops endlessly.
More seriously, the SIGBUS signal cannot be delivered to the
userspace task which triggered the #MC, and I hit an #MC flood.
I borrowed mce_kill_me.func to check whether current->mce_kill_me
has already been added to task_works, to prevent duplicate addition.
When the work function is called, the task_works list must already have
been taken, so it is safe to clear the field in the callback.
Fixed: commit 5567d11c21a1 ("x86/mce: Send #MC singal from task work")
Cc: <stable(a)vger.kernel.org> #v5.8+
Signed-off-by: Ding Hui <dinghui(a)sangfor.com.cn>
---
arch/x86/kernel/cpu/mce/core.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 22791aadc085..32fb9ded6b85 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1250,6 +1250,7 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
static void kill_me_now(struct callback_head *ch)
{
+ WRITE_ONCE(ch->func, NULL);
force_sig(SIGBUS);
}
@@ -1259,6 +1260,8 @@ static void kill_me_maybe(struct callback_head *cb)
int flags = MF_ACTION_REQUIRED;
int ret;
+ WRITE_ONCE(cb->func, NULL);
+
pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
if (!p->mce_ripv)
@@ -1289,17 +1292,20 @@ static void kill_me_maybe(struct callback_head *cb)
static void queue_task_work(struct mce *m, int kill_current_task)
{
+ struct callback_head ch;
+
current->mce_addr = m->addr;
current->mce_kflags = m->kflags;
current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
current->mce_whole_page = whole_page(m);
if (kill_current_task)
- current->mce_kill_me.func = kill_me_now;
+ ch.func = kill_me_now;
else
- current->mce_kill_me.func = kill_me_maybe;
+ ch.func = kill_me_maybe;
- task_work_add(current, &current->mce_kill_me, TWA_RESUME);
+ if (!cmpxchg(&current->mce_kill_me.func, NULL, ch.func))
+ task_work_add(current, &current->mce_kill_me, TWA_RESUME);
}
/*
--
2.17.1
From: Charles Keepax <ckeepax(a)opensource.cirrus.com>
[ Upstream commit 0e793ba77c18382f08e440260fe72bc6fce2a3cb ]
Currently, the SPI core doesn't set the struct device fwnode pointer
when it creates a new SPI device. This means when the device is
registered the fwnode is NULL and the check in device_add which sets
the fwnode->dev pointer is skipped. This wasn't previously an issue,
however these two patches:
commit 4731210c09f5 ("gpiolib: Bind gpio_device to a driver to enable
fw_devlink=on by default")
commit ced2af419528 ("gpiolib: Don't probe gpio_device if it's not the
primary device")
Added some code to the GPIO core which relies on using that
fwnode->dev pointer to determine if a driver is bound to the fwnode
and if not bind a stub GPIO driver. This means the GPIO providers
behind SPI will get both the expected driver and this stub driver
causing the stub driver to fail if it attempts to request any pin
configuration. For example on my system:
madera-pinctrl madera-pinctrl: pin gpio5 already requested by madera-pinctrl; cannot claim for gpiochip3
madera-pinctrl madera-pinctrl: pin-4 (gpiochip3) status -22
madera-pinctrl madera-pinctrl: could not request pin 4 (gpio5) from group aif1 on device madera-pinctrl
gpio_stub_drv gpiochip3: Error applying setting, reverse things back
gpio_stub_drv: probe of gpiochip3 failed with error -22
The firmware node on the device created by the GPIO framework is set
through the of_node pointer hence things generally actually work,
however that fwnode->dev is never set, as the check was skipped at
device_add time. This fix appears to match how the I2C subsystem
handles the same situation.
Signed-off-by: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Link: https://lore.kernel.org/r/20210421101402.8468-1-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/spi/spi.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index e067c54e87dd..789354ee6a11 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -2066,6 +2066,7 @@ of_register_spi_device(struct spi_controller *ctlr, struct device_node *nc)
/* Store a pointer to the node in the device structure */
of_node_get(nc);
spi->dev.of_node = nc;
+ spi->dev.fwnode = of_fwnode_handle(nc);
/* Register the new device */
rc = spi_add_device(spi);
--
2.30.2
From: Joerg Roedel <jroedel(a)suse.de>
For now, kexec is not supported when running as an SEV-ES guest. Doing
so requires additional hypervisor support and special code to hand
over the CPUs to the new kernel in a safe way.
Until this is implemented, do not support kexec in SEV-ES guests.
Cc: stable(a)vger.kernel.org # v5.10+
Signed-off-by: Joerg Roedel <jroedel(a)suse.de>
---
arch/x86/kernel/machine_kexec_64.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index c078b0d3ab0e..f902cc9cc634 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -620,3 +620,11 @@ void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
*/
set_memory_encrypted((unsigned long)vaddr, pages);
}
+
+/*
+ * Kexec is not supported in SEV-ES guests yet
+ */
+bool arch_kexec_supported(void)
+{
+ return !sev_es_active();
+}
--
2.31.1
From: Joerg Roedel <jroedel(a)suse.de>
Allow a runtime opt-out of kexec support for architecture code in case
the kernel is running in an environment where kexec is not properly
supported yet.
This will be used on x86 when the kernel is running as an SEV-ES
guest. SEV-ES guests need special handling for kexec to hand over all
CPUs to the new kernel. This requires special hypervisor support and
handling code in the guest which is not yet implemented.
Cc: stable(a)vger.kernel.org # v5.10+
Signed-off-by: Joerg Roedel <jroedel(a)suse.de>
---
include/linux/kexec.h | 2 ++
kernel/kexec.c | 14 ++++++++++++++
kernel/kexec_file.c | 9 +++++++++
3 files changed, 25 insertions(+)
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 0c994ae37729..400aae677435 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -422,6 +422,8 @@ static inline int kexec_crash_loaded(void) { return 0; }
#define kexec_in_progress false
#endif /* CONFIG_KEXEC_CORE */
+bool arch_kexec_supported(void);
+
#endif /* !defined(__ASSEBMLY__) */
#endif /* LINUX_KEXEC_H */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c82c6c06f051..d03134160458 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -195,11 +195,25 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
* that to happen you need to do that yourself.
*/
+bool __weak arch_kexec_supported(void)
+{
+ return true;
+}
+
static inline int kexec_load_check(unsigned long nr_segments,
unsigned long flags)
{
int result;
+ /*
+ * The architecture may support kexec in general, but the kernel could
+ * run in an environment where it is not (yet) possible to execute a new
+ * kernel. Allow the architecture code to opt-out of kexec support when
+ * it is running in such an environment.
+ */
+ if (!arch_kexec_supported())
+ return -ENOSYS;
+
/* We only trust the superuser with rebooting the system. */
if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
return -EPERM;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 33400ff051a8..96d08a512e9c 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -358,6 +358,15 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
int ret = 0, i;
struct kimage **dest_image, *image;
+ /*
+ * The architecture may support kexec in general, but the kernel could
+ * run in an environment where it is not (yet) possible to execute a new
+ * kernel. Allow the architecture code to opt-out of kexec support when
+ * it is running in such an environment.
+ */
+ if (!arch_kexec_supported())
+ return -ENOSYS;
+
/* We only trust the superuser with rebooting the system. */
if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
return -EPERM;
--
2.31.1
In the unlikely event that setting the software flow-control characters
fails, the other flow-control settings should still be updated (just like
all other terminal settings).
Move out the error message printed by the set_chars() helper to make it
more obvious that this is intentional.
Fixes: 7748feffcd80 ("USB: serial: cp210x: add support for software flow control")
Cc: stable(a)vger.kernel.org # 5.11
Signed-off-by: Johan Hovold <johan(a)kernel.org>
---
drivers/usb/serial/cp210x.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index 09b845d0da41..fd198031de71 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -1163,10 +1163,8 @@ static int cp210x_set_chars(struct usb_serial_port *port,
kfree(dmabuf);
- if (result < 0) {
- dev_err(&port->dev, "failed to set special chars: %d\n", result);
+ if (result < 0)
return result;
- }
return 0;
}
@@ -1218,8 +1216,10 @@ static void cp210x_set_flow_control(struct tty_struct *tty,
chars.bXoffChar = STOP_CHAR(tty);
ret = cp210x_set_chars(port, &chars);
- if (ret)
- return;
+ if (ret) {
+ dev_err(&port->dev, "failed to set special chars: %d\n",
+ ret);
+ }
}
mutex_lock(&port_priv->mutex);
--
2.31.1
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From edc0b0bccc9c80d9a44d3002dcca94984b25e7cf Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch(a)nvidia.com>
Date: Mon, 7 Jun 2021 11:03:12 +0300
Subject: [PATCH] RDMA/mlx5: Block FDB rules when not in switchdev mode
Allow creating FDB steering rules only when in switchdev mode.
The only software model where a userspace application can manipulate
FDB entries is when it manages the eswitch. This is only possible in
switchdev mode where we expose a single RDMA device with representors
for all the vports that are connected to the eswitch.
Fixes: 52438be44112 ("RDMA/mlx5: Allow inserting a steering rule to the FDB")
Link: https://lore.kernel.org/r/e928ae7c58d07f104716a2a8d730963d1bd01204.16230529…
Reviewed-by: Maor Gottlieb <maorg(a)nvidia.com>
Signed-off-by: Mark Bloch <mbloch(a)nvidia.com>
Signed-off-by: Leon Romanovsky <leonro(a)nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c
index 2fc6a60c4e77..f84441ff0c81 100644
--- a/drivers/infiniband/hw/mlx5/fs.c
+++ b/drivers/infiniband/hw/mlx5/fs.c
@@ -2134,6 +2134,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)(
if (err)
goto end;
+ if (obj->ns_type == MLX5_FLOW_NAMESPACE_FDB &&
+ mlx5_eswitch_mode(dev->mdev) != MLX5_ESWITCH_OFFLOADS) {
+ err = -EINVAL;
+ goto end;
+ }
+
uobj->object = obj;
obj->mdev = dev->mdev;
atomic_set(&obj->usecnt, 0);
Hello,
I request the following patch from v4.10-rc1 to get cherry-picked into
"stable/linux-4.9.y":
> commit f114dca2533ca770aebebffb5ed56e5e7d1fb3fb
> Author: Alexander Duyck <alexander.h.duyck(a)intel.com>
> Date: Tue Oct 25 16:08:46 2016 -0700
>
> i40e: Be much more verbose about what we can and cannot offload
>
> This change makes it so that we are much more robust about defining what we
> can and cannot offload. Previously we were just checking for the L4 tunnel
> header length, however there are other fields we should be verifying as
> there are multiple scenarios in which we cannot perform hardware offloads.
>
> In addition the device only supports GSO as long as the MSS is 64 or
> greater. We were not checking this so an MSS less than that was resulting
> in Tx hangs.
>
> Change-ID: I5e2fd5f3075c73601b4b36327b771c64fcb6c31b
> Signed-off-by: Alexander Duyck <alexander.h.duyck(a)intel.com>
> Tested-by: Andrew Bowers <andrewx.bowers(a)intel.com>
Debian had this old Bug
<https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=892105> reported
against 4.9.82, which still exists in Debian's old-stable 9 "Stretch"
current kernel 4.9.258, but also with latest stable 4.9.273.
Our environment
===============
- KVM server
- dual port i40e
- classic bridge with enp96s0f0
- VM attached to bridge via veth
- no VLANs
- no MacVLan
> # ethtool -i enp96s0f0
> driver: i40e
> version: 1.6.16-k
> firmware-version: 3.33 0x80000e48 1.1876.0
> expansion-rom-version:
> bus-info: 0000:60:00.0
> supports-statistics: yes
> supports-test: yes
> supports-eeprom-access: yes
> supports-register-dump: yes
> supports-priv-flags: yes
> # lspci -s 0000:60:00.0
> 60:00.0 Ethernet controller: Intel Corporation Ethernet Connection X722 for 10GBASE-T (rev 09)
Analysis
========
As soon as we start one of our "Ubuntu" images the bridge stops
receiving unicast packets for *all* VMs on that bridge.
- we still see outgoing traffic leaving the host, e.g. ARP requests
- "tcpdump -i enp96s0f0" shows no incoming unicast traffic, e.g. no ARP
response
- broadcast traffic passes the bridge
- VMs on the same bridge can communicate with each other
Most often I see the following error message after doing `dmesg -n 8`:
> [ +9,376367] i40e 0000:60:00.0: cleared PE_CRITERR
> [ +0,000252] i40e 0000:60:00.0: TX driver issue detected, PF reset issued
> [ +0,859912] i40e 0000:60:00.0: Error I40E_AQ_RC_EINVAL adding RX filters on PF, promiscuous mode forced on
In one case I've seen this also (don't know if it is relevant):
> [ 218.921466] i40e 0000:60:00.0 enp96s0f0: VSI_seid 390, Hung TX queue 43, tx_pending_hw: 1, NTC:0xa6, HWB: 0xa6, NTU: 0xa7, TAIL: 0xa7
> [ 218.921470] i40e 0000:60:00.0 enp96s0f0: VSI_seid 390, Issuing force_wb for TX queue 43, Interrupt Reg: 0x0
After that error the only way to reset this broken state is to reboot
the host. I've been unable to tear down the bridge and/or remove the
`i40e` driver, which most often crashes the Linux kernel (some other bug
on `ip link set enp96s0f0 nomaster`).
If you need more data I have a PCAP file, but I still don't know which
packet exactly triggers the bug.
The bug seems to be fixed with 4.10.0 and I bisected the fix down to
> git bisect start '--' 'drivers/net/ethernet/intel/i40e'
> # new: [c470abd4fde40ea6a0846a2beab642a578c0b8cd] Linux 4.10
> git bisect new c470abd4fde40ea6a0846a2beab642a578c0b8cd
> # old: [69973b830859bc6529a7a0468ba0d80ee5117826] Linux 4.9
> git bisect old 69973b830859bc6529a7a0468ba0d80ee5117826
> # old: [13fd3f9cc3def8b276c7913ae4acbfa2653cb198] i40e: clear mac filter count on reset
> git bisect old 13fd3f9cc3def8b276c7913ae4acbfa2653cb198
> # new: [7ec9ba11b046b4b7fd768c366870ada60d409295] i40e: Driver prints log message on link speed change
> git bisect new 7ec9ba11b046b4b7fd768c366870ada60d409295
> # new: [0b7c8b5d5436317a5f4509e2a150c6cec017f348] i40e: fix trivial typo in naming of i40e_sync_filters_subtask
> git bisect new 0b7c8b5d5436317a5f4509e2a150c6cec017f348
> # new: [f114dca2533ca770aebebffb5ed56e5e7d1fb3fb] i40e: Be much more verbose about what we can and cannot offload
> git bisect new f114dca2533ca770aebebffb5ed56e5e7d1fb3fb
> # old: [81fa7c97bebd6e1a52c4e059eeffe18df5b3f11f] i40e: Implementation of ERROR state for NVM update state machine
> git bisect old 81fa7c97bebd6e1a52c4e059eeffe18df5b3f11f
> # old: [3aa7b74dbeedfb32406fec70cfd76d797209e8c9] i40e: removed unreachable code
> git bisect old 3aa7b74dbeedfb32406fec70cfd76d797209e8c9
> # first new commit: [f114dca2533ca770aebebffb5ed56e5e7d1fb3fb] i40e: Be much more verbose about what we can and cannot offload
I used v4.10 as the basis and only bisected everything in
drivers/net/ethernet/intel/i40e/ as vanilla v4.9 and several other
versions between that and v4.10 crashed my host, so basically
git checkout v4.10
git checkout $hash -- drivers/net/ethernet/intel/i40e/
make all modules_install install
git checkout v4.10 -- drivers/net/ethernet/intel/i40e/
git bisect (old|new) $hash
I verified that cherry-picking f114dca2533ca770aebebffb5ed56e5e7d1fb3fb
on top of v4.9.273 fixes the problem and reverting it again shows the
problem again.
Philipp
--
Philipp Hahn
Open Source Software Engineer
Univention GmbH
be open.
Mary-Somerville-Str. 1
D-28359 Bremen
📞 +49-421-22232-57
🖶 +49-421-22232-99
✉️ hahn(a)univention.de
🌐 https://www.univention.de/
Geschäftsführer: Peter H. Ganten
HRB 20755 Amtsgericht Bremen
Steuer-Nr.: 71-597-02876
Mainline commit ce9f24cccdc0 ("ext4: check journal inode extents more carefully")
enabled validity checks for journal inode's data blocks. This change got
ported to stable branches, but the backport for 4.19 has a bug where it will
flag an error even when the system block entry's inode number matches the
journal inode.
The way the error is reported is also problematic because it updates the superblock
without following journaling rules. This may result in superblock checksum
errors if the superblock is in the process of being committed but has a
previously calculated checksum that doesn't include the bogus error update.
This patch eliminates the bogus error by trying to match how other backports
were implemented, which is to flag an error only when inode numbers mismatch.
Fixes: a75a5d163857 ("ext4: check journal inode extents more carefully")
Signed-off-by: Tahsin Erdogan <trdgn(a)amazon.com>
Cc: stable(a)vger.kernel.org
Cc: Jan Kara <jack(a)suse.cz>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
fs/ext4/block_validity.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 1ea8fc9ff048..1bc65ecd4bd6 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -171,8 +171,10 @@ static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi,
else if (start_blk >= (entry->start_blk + entry->count))
n = n->rb_right;
else {
+ if (entry->ino == ino)
+ return 1;
sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
- return entry->ino == ino;
+ return 0;
}
}
return 1;
--
2.17.1
The standard printk() tries to flush the message to the console
immediately. It tries to take the console lock. If the lock is
already taken then the current owner is responsible for flushing
even the new message.
There is a small race window between checking whether a new message is
available and releasing the console lock. It is solved by re-checking
the state after releasing the console lock. If the check is positive
then console_unlock() tries to take the lock again and process the new
message as well.
Commit 996e966640ddea7b535c ("printk: remove logbuf_lock") has the effect that
console_seq is no longer read atomically. As a result, the re-check might
be done with an inconsistent 64-bit index.
Solve it by using the last sequence number that has been checked under
the console lock. In the worst case, it will take the lock again only
to realize that the new message has already been processed. But this
was possible even before.
Fixes: 996e966640ddea7b535c ("printk: remove logbuf_lock")
Cc: stable(a)vger.kernel.org # 5.13
Signed-off-by: Petr Mladek <pmladek(a)suse.com>
---
kernel/printk/printk.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 142a58d124d9..87411084075e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2545,6 +2545,7 @@ void console_unlock(void)
bool do_cond_resched, retry;
struct printk_info info;
struct printk_record r;
+ u64 next_seq;
if (console_suspended) {
up_console_sem();
@@ -2654,8 +2655,10 @@ void console_unlock(void)
cond_resched();
}
- console_locked = 0;
+ /* Get consistent value of the next-to-be-used sequence number. */
+ next_seq = console_seq;
+ console_locked = 0;
up_console_sem();
/*
@@ -2664,7 +2667,7 @@ void console_unlock(void)
* there's a new owner and the console_unlock() from them will do the
* flush, no worries.
*/
- retry = prb_read_valid(prb, console_seq, NULL);
+ retry = prb_read_valid(prb, next_seq, NULL);
printk_safe_exit_irqrestore(flags);
if (retry && console_trylock())
--
2.26.2
Email: abeltaka54(a)hotmail.com
Telephone: +27837279295
Hello,
My Name is Mr Abel Taka the Chief Operating Officer of Standard Bank of South Africa and i am in need of a reliable foreigner to carry out a $12.2 million deal.
If interested, reply to me for more detailed information.
Thanks.
Abel Taka.
________________________________
*********************ATTENTION*********************
This email and any files transmitted with it are confidential and intended solely for the use of the individual or entity to whom they are addressed. If you are not the named addressee you should not disseminate, distribute or copy this e-mail. Please notify the sender immediately via e-mail if you have received this e-mail by mistake and delete this e-mail from your system. If you are not the intended recipient you are notified that disclosing, copying, distributing or taking any action in reliance on the contents of this information is strictly prohibited.
From: Charles Keepax <ckeepax(a)opensource.cirrus.com>
[ Upstream commit 0e793ba77c18382f08e440260fe72bc6fce2a3cb ]
Currently, the SPI core doesn't set the struct device fwnode pointer
when it creates a new SPI device. This means when the device is
registered the fwnode is NULL and the check in device_add which sets
the fwnode->dev pointer is skipped. This wasn't previously an issue,
however these two patches:
commit 4731210c09f5 ("gpiolib: Bind gpio_device to a driver to enable
fw_devlink=on by default")
commit ced2af419528 ("gpiolib: Don't probe gpio_device if it's not the
primary device")
Added some code to the GPIO core which relies on using that
fwnode->dev pointer to determine if a driver is bound to the fwnode
and if not bind a stub GPIO driver. This means the GPIO providers
behind SPI will get both the expected driver and this stub driver
causing the stub driver to fail if it attempts to request any pin
configuration. For example on my system:
madera-pinctrl madera-pinctrl: pin gpio5 already requested by madera-pinctrl; cannot claim for gpiochip3
madera-pinctrl madera-pinctrl: pin-4 (gpiochip3) status -22
madera-pinctrl madera-pinctrl: could not request pin 4 (gpio5) from group aif1 on device madera-pinctrl
gpio_stub_drv gpiochip3: Error applying setting, reverse things back
gpio_stub_drv: probe of gpiochip3 failed with error -22
The firmware node on the device created by the GPIO framework is set
through the of_node pointer hence things generally actually work,
however that fwnode->dev is never set, as the check was skipped at
device_add time. This fix appears to match how the I2C subsystem
handles the same situation.
Signed-off-by: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Link: https://lore.kernel.org/r/20210421101402.8468-1-ckeepax@opensource.cirrus.c…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/spi/spi.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index da71a53b0df7..71f74015efb9 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1670,6 +1670,7 @@ of_register_spi_device(struct spi_controller *ctlr, struct device_node *nc)
/* Store a pointer to the node in the device structure */
of_node_get(nc);
spi->dev.of_node = nc;
+ spi->dev.fwnode = of_fwnode_handle(nc);
/* Register the new device */
rc = spi_add_device(spi);
--
2.30.2
From: Charles Keepax <ckeepax(a)opensource.cirrus.com>
[ Upstream commit 0e793ba77c18382f08e440260fe72bc6fce2a3cb ]
Currently, the SPI core doesn't set the struct device fwnode pointer
when it creates a new SPI device. This means when the device is
registered the fwnode is NULL and the check in device_add which sets
the fwnode->dev pointer is skipped. This wasn't previously an issue,
however these two patches:
commit 4731210c09f5 ("gpiolib: Bind gpio_device to a driver to enable
fw_devlink=on by default")
commit ced2af419528 ("gpiolib: Don't probe gpio_device if it's not the
primary device")
Added some code to the GPIO core which relies on using that
fwnode->dev pointer to determine if a driver is bound to the fwnode
and if not bind a stub GPIO driver. This means the GPIO providers
behind SPI will get both the expected driver and this stub driver
causing the stub driver to fail if it attempts to request any pin
configuration. For example on my system:
madera-pinctrl madera-pinctrl: pin gpio5 already requested by madera-pinctrl; cannot claim for gpiochip3
madera-pinctrl madera-pinctrl: pin-4 (gpiochip3) status -22
madera-pinctrl madera-pinctrl: could not request pin 4 (gpio5) from group aif1 on device madera-pinctrl
gpio_stub_drv gpiochip3: Error applying setting, reverse things back
gpio_stub_drv: probe of gpiochip3 failed with error -22
The firmware node on the device created by the GPIO framework is set
through the of_node pointer hence things generally actually work,
however that fwnode->dev is never set, as the check was skipped at
device_add time. This fix appears to match how the I2C subsystem
handles the same situation.
Signed-off-by: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Link: https://lore.kernel.org/r/20210421101402.8468-1-ckeepax@opensource.cirrus.c…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/spi/spi.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index bbe33016d371..49f592e433a8 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1678,6 +1678,7 @@ of_register_spi_device(struct spi_controller *ctlr, struct device_node *nc)
/* Store a pointer to the node in the device structure */
of_node_get(nc);
spi->dev.of_node = nc;
+ spi->dev.fwnode = of_fwnode_handle(nc);
/* Register the new device */
rc = spi_add_device(spi);
--
2.30.2
From: Charles Keepax <ckeepax(a)opensource.cirrus.com>
[ Upstream commit 0e793ba77c18382f08e440260fe72bc6fce2a3cb ]
Currently, the SPI core doesn't set the struct device fwnode pointer
when it creates a new SPI device. This means when the device is
registered the fwnode is NULL and the check in device_add which sets
the fwnode->dev pointer is skipped. This wasn't previously an issue,
however these two patches:
commit 4731210c09f5 ("gpiolib: Bind gpio_device to a driver to enable
fw_devlink=on by default")
commit ced2af419528 ("gpiolib: Don't probe gpio_device if it's not the
primary device")
Added some code to the GPIO core which relies on using that
fwnode->dev pointer to determine if a driver is bound to the fwnode
and if not bind a stub GPIO driver. This means the GPIO providers
behind SPI will get both the expected driver and this stub driver
causing the stub driver to fail if it attempts to request any pin
configuration. For example on my system:
madera-pinctrl madera-pinctrl: pin gpio5 already requested by madera-pinctrl; cannot claim for gpiochip3
madera-pinctrl madera-pinctrl: pin-4 (gpiochip3) status -22
madera-pinctrl madera-pinctrl: could not request pin 4 (gpio5) from group aif1 on device madera-pinctrl
gpio_stub_drv gpiochip3: Error applying setting, reverse things back
gpio_stub_drv: probe of gpiochip3 failed with error -22
The firmware node on the device created by the GPIO framework is set
through the of_node pointer hence things generally actually work,
however that fwnode->dev is never set, as the check was skipped at
device_add time. This fix appears to match how the I2C subsystem
handles the same situation.
Signed-off-by: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Link: https://lore.kernel.org/r/20210421101402.8468-1-ckeepax@opensource.cirrus.c…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/spi/spi.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index f8f3434d5ab1..ac05c9c86488 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1849,6 +1849,7 @@ of_register_spi_device(struct spi_controller *ctlr, struct device_node *nc)
/* Store a pointer to the node in the device structure */
of_node_get(nc);
spi->dev.of_node = nc;
+ spi->dev.fwnode = of_fwnode_handle(nc);
/* Register the new device */
rc = spi_add_device(spi);
--
2.30.2
From: Charles Keepax <ckeepax(a)opensource.cirrus.com>
[ Upstream commit 0e793ba77c18382f08e440260fe72bc6fce2a3cb ]
Currently, the SPI core doesn't set the struct device fwnode pointer
when it creates a new SPI device. This means when the device is
registered the fwnode is NULL and the check in device_add which sets
the fwnode->dev pointer is skipped. This wasn't previously an issue,
however these two patches:
commit 4731210c09f5 ("gpiolib: Bind gpio_device to a driver to enable
fw_devlink=on by default")
commit ced2af419528 ("gpiolib: Don't probe gpio_device if it's not the
primary device")
Added some code to the GPIO core which relies on using that
fwnode->dev pointer to determine if a driver is bound to the fwnode
and if not bind a stub GPIO driver. This means the GPIO providers
behind SPI will get both the expected driver and this stub driver
causing the stub driver to fail if it attempts to request any pin
configuration. For example on my system:
madera-pinctrl madera-pinctrl: pin gpio5 already requested by madera-pinctrl; cannot claim for gpiochip3
madera-pinctrl madera-pinctrl: pin-4 (gpiochip3) status -22
madera-pinctrl madera-pinctrl: could not request pin 4 (gpio5) from group aif1 on device madera-pinctrl
gpio_stub_drv gpiochip3: Error applying setting, reverse things back
gpio_stub_drv: probe of gpiochip3 failed with error -22
The firmware node on the device created by the GPIO framework is set
through the of_node pointer hence things generally actually work,
however that fwnode->dev is never set, as the check was skipped at
device_add time. This fix appears to match how the I2C subsystem
handles the same situation.
Signed-off-by: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Link: https://lore.kernel.org/r/20210421101402.8468-1-ckeepax@opensource.cirrus.c…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/spi/spi.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 0cf67de741e7..bd8b1f79dce2 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -2050,6 +2050,7 @@ of_register_spi_device(struct spi_controller *ctlr, struct device_node *nc)
/* Store a pointer to the node in the device structure */
of_node_get(nc);
spi->dev.of_node = nc;
+ spi->dev.fwnode = of_fwnode_handle(nc);
/* Register the new device */
rc = spi_add_device(spi);
--
2.30.2
The mt7915_dpd_freq_idx() function can return a negative value but this
value is assigned to an unsigned variable named idx. Then, the code
tests if this variable is less than zero. This can never happen with an
unsigned type.
So, change the idx type to a signed one.
Addresses-Coverity-ID: 1484753 ("Unsigned compared against 0")
Fixes: 495184ac91bb8 ("mt76: mt7915: add support for applying pre-calibration data")
Signed-off-by: John Wood <john.wood(a)gmx.com>
---
Changelog v1 -> v2
- Add Cc to stable(a)vger.kernel.org
drivers/net/wireless/mediatek/mt76/mt7915/mcu.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
index b3f14ff67c5a..764f25a828fa 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
@@ -3440,8 +3440,9 @@ int mt7915_mcu_apply_tx_dpd(struct mt7915_phy *phy)
{
struct mt7915_dev *dev = phy->dev;
struct cfg80211_chan_def *chandef = &phy->mt76->chandef;
- u16 total = 2, idx, center_freq = chandef->center_freq1;
+ u16 total = 2, center_freq = chandef->center_freq1;
u8 *cal = dev->cal, *eep = dev->mt76.eeprom.data;
+ int idx;
if (!(eep[MT_EE_DO_PRE_CAL] & MT_EE_WIFI_CAL_DPD))
return 0;
--
2.25.1
If a user program uses userfaultfd on ranges of heap memory, it may
end up passing a tagged pointer to the kernel in the range.start
field of the UFFDIO_REGISTER ioctl. This can happen when using an
MTE-capable allocator, or on Android if using the Tagged Pointers
feature for MTE readiness [1].
When a fault subsequently occurs, the tag is stripped from the fault
address returned to the application in the fault.address field
of struct uffd_msg. However, from the application's perspective,
the tagged address *is* the memory address, so if the application
is unaware of memory tags, it may get confused by receiving an
address that is, from its point of view, outside of the bounds of the
allocation. We observed this behavior in the kselftest for userfaultfd
[2] but other applications could have the same problem.
Address this by not untagging pointers passed to the userfaultfd
ioctls. Instead, let the system call fail. This will provide an
early indication of problems with tag-unaware userspace code instead
of letting the code get confused later, and is consistent with how
we decided to handle brk/mmap/mremap in commit dcde237319e6 ("mm:
Avoid creating virtual address aliases in brk()/mmap()/mremap()"),
as well as being consistent with the existing tagged address ABI
documentation relating to how ioctl arguments are handled.
The code change is a revert of commit 7d0325749a6c ("userfaultfd:
untag user pointers").
[1] https://source.android.com/devices/tech/debug/tagged-pointers
[2] tools/testing/selftests/vm/userfaultfd.c
Signed-off-by: Peter Collingbourne <pcc(a)google.com>
Link: https://linux-review.googlesource.com/id/I761aa9f0344454c482b83fcfcce547db0…
Fixes: 63f0c6037965 ("arm64: Introduce prctl() options to control the tagged user addresses ABI")
Cc: <stable(a)vger.kernel.org> # 5.4
---
Documentation/arm64/tagged-address-abi.rst | 25 +++++++++++++++-------
fs/userfaultfd.c | 22 +++++++++----------
2 files changed, 27 insertions(+), 20 deletions(-)
diff --git a/Documentation/arm64/tagged-address-abi.rst b/Documentation/arm64/tagged-address-abi.rst
index 459e6b66ff68..737f9d8565a2 100644
--- a/Documentation/arm64/tagged-address-abi.rst
+++ b/Documentation/arm64/tagged-address-abi.rst
@@ -45,14 +45,23 @@ how the user addresses are used by the kernel:
1. User addresses not accessed by the kernel but used for address space
management (e.g. ``mprotect()``, ``madvise()``). The use of valid
- tagged pointers in this context is allowed with the exception of
- ``brk()``, ``mmap()`` and the ``new_address`` argument to
- ``mremap()`` as these have the potential to alias with existing
- user addresses.
-
- NOTE: This behaviour changed in v5.6 and so some earlier kernels may
- incorrectly accept valid tagged pointers for the ``brk()``,
- ``mmap()`` and ``mremap()`` system calls.
+ tagged pointers in this context is allowed with these exceptions:
+
+ - ``brk()``, ``mmap()`` and the ``new_address`` argument to
+ ``mremap()`` as these have the potential to alias with existing
+ user addresses.
+
+ NOTE: This behaviour changed in v5.6 and so some earlier kernels may
+ incorrectly accept valid tagged pointers for the ``brk()``,
+ ``mmap()`` and ``mremap()`` system calls.
+
+ - The ``range.start`` argument to the ``UFFDIO_REGISTER`` ``ioctl()``
+ used on a file descriptor obtained from ``userfaultfd()``, as
+ fault addresses subsequently obtained by reading the file descriptor
+ will be untagged, which may otherwise confuse tag-unaware programs.
+
+ NOTE: This behaviour changed in v5.14 and so some earlier kernels may
+ incorrectly accept valid tagged pointers for this system call.
2. User addresses accessed by the kernel (e.g. ``write()``). This ABI
relaxation is disabled by default and the application thread needs to
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index dd7a6c62b56f..7613efe002c1 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1236,23 +1236,21 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
}
static __always_inline int validate_range(struct mm_struct *mm,
- __u64 *start, __u64 len)
+ __u64 start, __u64 len)
{
__u64 task_size = mm->task_size;
- *start = untagged_addr(*start);
-
- if (*start & ~PAGE_MASK)
+ if (start & ~PAGE_MASK)
return -EINVAL;
if (len & ~PAGE_MASK)
return -EINVAL;
if (!len)
return -EINVAL;
- if (*start < mmap_min_addr)
+ if (start < mmap_min_addr)
return -EINVAL;
- if (*start >= task_size)
+ if (start >= task_size)
return -EINVAL;
- if (len > task_size - *start)
+ if (len > task_size - start)
return -EINVAL;
return 0;
}
@@ -1313,7 +1311,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vm_flags |= VM_UFFD_MINOR;
}
- ret = validate_range(mm, &uffdio_register.range.start,
+ ret = validate_range(mm, uffdio_register.range.start,
uffdio_register.range.len);
if (ret)
goto out;
@@ -1519,7 +1517,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
goto out;
- ret = validate_range(mm, &uffdio_unregister.start,
+ ret = validate_range(mm, uffdio_unregister.start,
uffdio_unregister.len);
if (ret)
goto out;
@@ -1668,7 +1666,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len);
+ ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
if (ret)
goto out;
@@ -1708,7 +1706,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
sizeof(uffdio_copy)-sizeof(__s64)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len);
+ ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
if (ret)
goto out;
/*
@@ -1765,7 +1763,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
sizeof(uffdio_zeropage)-sizeof(__s64)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_zeropage.range.start,
+ ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
uffdio_zeropage.range.len);
if (ret)
goto out;
--
2.32.0.93.g670b81a890-goog
This test passes pointers obtained from anon_allocate_area to the
userfaultfd and mremap APIs. This causes a problem if the system
allocator returns tagged pointers because with the tagged address ABI
the kernel rejects tagged addresses passed to these APIs, which would
end up causing the test to fail. To make this test compatible with
such system allocators, stop using the system allocator to allocate
memory in anon_allocate_area, and instead just use mmap.
Co-developed-by: Lokesh Gidra <lokeshgidra(a)google.com>
Signed-off-by: Lokesh Gidra <lokeshgidra(a)google.com>
Signed-off-by: Peter Collingbourne <pcc(a)google.com>
Fixes: c47174fc362a ("userfaultfd: selftest")
Cc: <stable(a)vger.kernel.org> # 5.4
Link: https://linux-review.googlesource.com/id/Icac91064fcd923f77a83e8e133f8631c5…
---
tools/testing/selftests/vm/userfaultfd.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index f5ab5e0312e7..d0f802053dfd 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -197,8 +197,10 @@ static int anon_release_pages(char *rel_area)
static void anon_allocate_area(void **alloc_area)
{
- if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
- fprintf(stderr, "out of memory\n");
+ *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (*alloc_area == MAP_FAILED) {
+ fprintf(stderr, "anon memory mmap failed\n");
*alloc_area = NULL;
}
}
--
2.32.0.93.g670b81a890-goog
Hello,
we recently upgraded the Linux kernel from 5.11.21 to 5.12.12 in our
video stream receiver appliance and noticed compression artifacts on
video streams that were previously looking fine. We are receiving UDP
multicast MPEG TS streams through an FFMpeg / libav layer which does
the socket and lower level protocol handling. On affected kernels it
floods the log with messages like
> [mpegts @ 0x7fa130000900] Packet corrupt (stream = 0, dts = 6870802195).
> [mpegts @ 0x7fa11c000900] Packet corrupt (stream = 0, dts = 6870821068).
Bisecting identified commit 18f25dc399901426dff61e676ba603ff52c666f7
as the one introducing the problem in the mainline kernel. It was
backported to the 5.12 series in
450687386cd16d081b58cd7a342acff370a96078. Some random observations
which may help to understand what's going on:
* the problem exists in Linux 5.13
* reverting that commit on top of 5.13 makes the problem go away
* Linux 5.10.45 is fine
* no relevant output in dmesg
* can be reproduced on different hardware (Intel, AMD, different NICs, ...)
* we do use the bonding driver on the systems (but I did not yet
verify that this is related)
* we do not use vxlan (mentioned in the commit message)
* the relevant code in FFMpeg identifying packet corruption is here:
https://github.com/FFmpeg/FFmpeg/blob/master/libavformat/mpegts.c#L2758
And the bonding configuration:
# cat /proc/net/bonding/bond0
Ethernet Channel Bonding Driver: v5.10.45
Bonding Mode: fault-tolerance (active-backup)
Primary Slave: None
Currently Active Slave: enp2s0
MII Status: up
MII Polling Interval (ms): 100
Up Delay (ms): 0
Down Delay (ms): 0
Peer Notification Delay (ms): 0
Slave Interface: enp2s0
MII Status: up
Speed: 1000 Mbps
Duplex: full
Link Failure Count: 0
Permanent HW addr: 80:ee:73:XX:XX:XX
Slave queue ID: 0
Slave Interface: enp3s0
MII Status: down
Speed: Unknown
Duplex: Unknown
Link Failure Count: 0
Permanent HW addr: 80:ee:73:XX:XX:XX
Slave queue ID: 0
If there is anything else I can do to help tracking this down please
let me know.
Regards,
-Matthias Treydte
From: Jonathan Bell <jonathan(a)raspberrypi.org>
Seen on a VLI VL805 PCIe to USB controller. For non-stream endpoints
at least, if the xHC halts on a particular TRB due to an error then
the DCS field in the Out Endpoint Context maintained by the hardware
is not updated with the current cycle state.
Add the quirk XHCI_EP_CTX_BROKEN_DCS and, when it is set, fetch the DCS
bit from the TRB that the xHC stopped on instead.
Cc: stable(a)vger.kernel.org
Link: https://github.com/raspberrypi/linux/issues/3060
Signed-off-by: Jonathan Bell <jonathan(a)raspberrypi.org>
Signed-off-by: Bjørn Mork <bjorn(a)mork.no>
---
Ran into this issue on an RPi4 running Debian bullseye with a mostly
plain v5.10.40 kernel. Using an RTL2838 (0bda:2838) with rtl-sdr
just did not work, showing all the issues described on the above link.
This quirk found in https://github.com/raspberrypi/linux.git solves
the problem for me. I don't see why it shouldn't be in mainline. And
I propose adding it to stable as well, since it solves a real problem.
Mostly for my own convenience as I'd prefer just using a Debian built
kernel ;-)
Did not check this submission with Jonathan - hoping it is OK...
Bjørn
drivers/usb/host/xhci-pci.c | 4 +++-
drivers/usb/host/xhci-ring.c | 26 ++++++++++++++++++++++++++
drivers/usb/host/xhci.h | 1 +
3 files changed, 30 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index 18c2bbddf080..6f3bed09028c 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -279,8 +279,10 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
pdev->device == 0x3432)
xhci->quirks |= XHCI_BROKEN_STREAMS;
- if (pdev->vendor == PCI_VENDOR_ID_VIA && pdev->device == 0x3483)
+ if (pdev->vendor == PCI_VENDOR_ID_VIA && pdev->device == 0x3483) {
xhci->quirks |= XHCI_LPM_SUPPORT;
+ xhci->quirks |= XHCI_EP_CTX_BROKEN_DCS;
+ }
if (pdev->vendor == PCI_VENDOR_ID_ASMEDIA &&
pdev->device == PCI_DEVICE_ID_ASMEDIA_1042_XHCI)
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 8fea44bbc266..a9c860ff5177 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -559,8 +559,11 @@ static int xhci_move_dequeue_past_td(struct xhci_hcd *xhci,
struct xhci_ring *ep_ring;
struct xhci_command *cmd;
struct xhci_segment *new_seg;
+ struct xhci_segment *halted_seg = NULL;
union xhci_trb *new_deq;
int new_cycle;
+ union xhci_trb *halted_trb;
+ int index = 0;
dma_addr_t addr;
u64 hw_dequeue;
bool cycle_found = false;
@@ -600,6 +603,29 @@ static int xhci_move_dequeue_past_td(struct xhci_hcd *xhci,
new_deq = ep_ring->dequeue;
new_cycle = hw_dequeue & 0x1;
+ /*
+ * Quirk: xHC write-back of the DCS field in the hardware dequeue
+ * pointer is wrong - use the cycle state of the TRB pointed to by
+ * the dequeue pointer.
+ */
+ if (xhci->quirks & XHCI_EP_CTX_BROKEN_DCS &&
+ !(ep->ep_state & EP_HAS_STREAMS))
+ halted_seg = trb_in_td(xhci, cur_td->start_seg,
+ cur_td->first_trb, cur_td->last_trb,
+ hw_dequeue & ~0xf, false);
+ if (halted_seg) {
+ index = ((dma_addr_t)(hw_dequeue & ~0xf) - halted_seg->dma) /
+ sizeof(*halted_trb);
+ halted_trb = &halted_seg->trbs[index];
+ state->new_cycle_state = halted_trb->generic.field[3] & 0x1;
+ xhci_dbg(xhci, "Endpoint DCS = %d TRB index = %d cycle = %d\n",
+ (u8)(hw_dequeue & 0x1), index,
+ state->new_cycle_state);
+ } else {
+ state->new_cycle_state = hw_dequeue & 0x1;
+ }
+ state->stream_id = stream_id;
+
/*
* We want to find the pointer, segment and cycle state of the new trb
* (the one after current TD's last_trb). We know the cycle state at
diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index 3c7d281672ae..911aeb7d8a19 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h
@@ -1896,6 +1896,7 @@ struct xhci_hcd {
#define XHCI_SG_TRB_CACHE_SIZE_QUIRK BIT_ULL(39)
#define XHCI_NO_SOFT_RETRY BIT_ULL(40)
#define XHCI_BROKEN_D3COLD BIT_ULL(41)
+#define XHCI_EP_CTX_BROKEN_DCS BIT_ULL(42)
unsigned int num_active_eps;
unsigned int limit_active_eps;
--
2.30.2
Hi Greg,
Linus has taken in a group of mm/thp commits Cc stable today:
504e070dc08f mm: thp: replace DEBUG_VM BUG with VM_WARN when unmap fails for split
22061a1ffabd mm/thp: unmap_mapping_page() to fix THP truncate_cleanup_page()
31657170deaf mm/thp: fix page_address_in_vma() on file THP tails
494334e43c16 mm/thp: fix vma_address() if virtual address below file offset
732ed55823fc mm/thp: try_to_unmap() use TTU_SYNC for safe splitting
3b77e8c8cde5 mm/thp: make is_huge_zero_pmd() safe and quicker
99fa8a48203d mm/thp: fix __split_huge_pmd_locked() on shmem migration entry
ffc90cbb2970 mm, thp: use head page in __migration_entry_wait()
and I expect some more to follow in a few days time (thanks Andrew).
No problem with the commits themselves, but I'm aware that some of them
have dependencies on other commits not yet in stable, which I have to
sort out for you now.
I'd prefer to avoid a deluge of "does not apply" messages, so ask you
please to hold off trying to merge these into stable trees for a few days:
I'll get back to you with what's needed for them to apply.
Thanks,
Hugh
If a user program uses userfaultfd on ranges of heap memory, it may
end up passing a tagged pointer to the kernel in the range.start
field of the UFFDIO_REGISTER ioctl. This can happen when using an
MTE-capable allocator, or on Android if using the Tagged Pointers
feature for MTE readiness [1].
When a fault subsequently occurs, the tag is stripped from the fault
address returned to the application in the fault.address field
of struct uffd_msg. However, from the application's perspective,
the tagged address *is* the memory address, so if the application
is unaware of memory tags, it may get confused by receiving an
address that is, from its point of view, outside of the bounds of the
allocation. We observed this behavior in the kselftest for userfaultfd
[2] but other applications could have the same problem.
Fix this by remembering which tag was used to originally register the
userfaultfd and passing that tag back in fault.address. In a future
enhancement, we may want to pass back the original fault address,
but like SA_EXPOSE_TAGBITS, this should be guarded by a flag.
[1] https://source.android.com/devices/tech/debug/tagged-pointers
[2] tools/testing/selftests/vm/userfaultfd.c
Signed-off-by: Peter Collingbourne <pcc(a)google.com>
Link: https://linux-review.googlesource.com/id/I761aa9f0344454c482b83fcfcce547db0…
Fixes: 63f0c6037965 ("arm64: Introduce prctl() options to control the tagged user addresses ABI")
Cc: <stable(a)vger.kernel.org> # 5.4
---
Documentation/arm64/tagged-pointers.rst | 5 +++++
fs/userfaultfd.c | 17 +++++++++++------
include/linux/mm_types.h | 3 ++-
3 files changed, 18 insertions(+), 7 deletions(-)
diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst
index 19d284b70384..ec8e1f90b744 100644
--- a/Documentation/arm64/tagged-pointers.rst
+++ b/Documentation/arm64/tagged-pointers.rst
@@ -73,6 +73,11 @@ flag setting.
Non-zero tags are never preserved in sigcontext.fault_address
regardless of the SA_EXPOSE_TAGBITS flag setting.
+When using userfaultfd the address tag supplied in the range.start
+field of the UFFDIO_REGISTER ioctl is preserved and returned to
+userspace via the fault.address field of struct uffd_msg, and the
+tag of the original fault address is discarded.
+
The architecture prevents the use of a tagged PC, so the upper byte will
be set to a sign-extension of bit 55 on exception return.
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index dd7a6c62b56f..adb0f7d0638a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -110,15 +110,15 @@ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
struct userfaultfd_wake_range *range = key;
int ret;
struct userfaultfd_wait_queue *uwq;
- unsigned long start, len;
+ unsigned long start, len, addr;
uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
ret = 0;
/* len == 0 means wake all */
start = range->start;
len = range->len;
- if (len && (start > uwq->msg.arg.pagefault.address ||
- start + len <= uwq->msg.arg.pagefault.address))
+ addr = untagged_addr(uwq->msg.arg.pagefault.address);
+ if (len && (start > addr || start + len <= addr))
goto out;
WRITE_ONCE(uwq->waken, true);
/*
@@ -480,8 +480,9 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
- ctx->features);
+ uwq.msg = userfault_msg(
+ vmf->address + vmf->vma->vm_userfaultfd_ctx.address_tag,
+ vmf->flags, reason, ctx->features);
uwq.ctx = ctx;
uwq.waken = false;
@@ -1287,7 +1288,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
unsigned long vm_flags, new_flags;
bool found;
bool basic_ioctls;
- unsigned long start, end, vma_end;
+ unsigned long address_tag, start, end, vma_end;
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1313,6 +1314,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vm_flags |= VM_UFFD_MINOR;
}
+ address_tag = uffdio_register.range.start -
+ untagged_addr(uffdio_register.range.start);
+
ret = validate_range(mm, &uffdio_register.range.start,
uffdio_register.range.len);
if (ret)
@@ -1462,6 +1466,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
*/
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx.ctx = ctx;
+ vma->vm_userfaultfd_ctx.address_tag = address_tag;
if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
hugetlb_unshare_all_pmds(vma);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8f0fb62e8975..cb93e5b17896 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -286,9 +286,10 @@ struct vm_region {
};
#ifdef CONFIG_USERFAULTFD
-#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
+#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, 0, })
struct vm_userfaultfd_ctx {
struct userfaultfd_ctx *ctx;
+ unsigned long address_tag;
};
#else /* CONFIG_USERFAULTFD */
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
--
2.32.0.93.g670b81a890-goog
If perf_event_open() is called with another task as target and
perf_event_attr::sigtrap is set, and the target task's user does not
match the calling user, also require the CAP_KILL capability.
Otherwise, with the CAP_PERFMON capability alone it would be possible
for a user to send SIGTRAP signals via perf events to another user's
tasks. This could potentially result in those tasks being terminated if
they cannot handle SIGTRAP signals.
Note: The check complements the existing capability check, but is not
supposed to supersede the ptrace_may_access() check. At a high level we
now have:
capable of CAP_PERFMON and (CAP_KILL if sigtrap)
OR
ptrace_may_access() // also checks for same thread-group and uid
Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events")
Cc: <stable(a)vger.kernel.org> # 5.13+
Reported-by: Dmitry Vyukov <dvyukov(a)google.com>
Signed-off-by: Marco Elver <elver(a)google.com>
---
v2:
* Drop kill_capable() and just check CAP_KILL (reported by Ondrej Mosnacek).
* Use ns_capable(__task_cred(task)->user_ns, CAP_KILL) to check for
capability in target task's ns (reported by Ondrej Mosnacek).
---
kernel/events/core.c | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fe88d6eea3c2..43c99695dc3f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12152,10 +12152,23 @@ SYSCALL_DEFINE5(perf_event_open,
}
if (task) {
+ bool is_capable;
+
err = down_read_interruptible(&task->signal->exec_update_lock);
if (err)
goto err_file;
+ is_capable = perfmon_capable();
+ if (attr.sigtrap) {
+ /*
+ * perf_event_attr::sigtrap sends signals to the other
+ * task. Require the current task to have CAP_KILL.
+ */
+ rcu_read_lock();
+ is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
+ rcu_read_unlock();
+ }
+
/*
* Preserve ptrace permission check for backwards compatibility.
*
@@ -12165,7 +12178,7 @@ SYSCALL_DEFINE5(perf_event_open,
* perf_event_exit_task() that could imply).
*/
err = -EACCES;
- if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ if (!is_capable && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
goto err_cred;
}
--
2.32.0.93.g670b81a890-goog
From: Eric Biggers <ebiggers(a)google.com>
Add a helper function fscrypt_symlink_getattr() which will be called
from the various filesystems' ->getattr() methods to read and decrypt
the target of encrypted symlinks in order to report the correct st_size.
Detailed explanation:
As required by POSIX and as documented in various man pages, st_size for
a symlink is supposed to be the length of the symlink target.
Unfortunately, st_size has always been wrong for encrypted symlinks
because st_size is populated from i_size from disk, which intentionally
contains the length of the encrypted symlink target. That's slightly
greater than the length of the decrypted symlink target (which is the
symlink target that userspace usually sees), and usually won't match the
length of the no-key encoded symlink target either.
This hadn't been fixed yet because reporting the correct st_size would
require reading the symlink target from disk and decrypting or encoding
it, which historically has been considered too heavyweight to do in
->getattr(). Also historically, the wrong st_size had only broken a
test (LTP lstat03) and there were no known complaints from real users.
(This is probably because the st_size of symlinks isn't used too often,
and when it is, typically it's for a hint for what buffer size to pass
to readlink() -- which a slightly-too-large size still works for.)
However, a couple things have changed now. First, there have recently
been complaints about the current behavior from real users:
- Breakage in rpmbuild:
https://github.com/rpm-software-management/rpm/issues/1682
https://github.com/google/fscrypt/issues/305
- Breakage in toybox cpio:
https://www.mail-archive.com/toybox@lists.landley.net/msg07193.html
- Breakage in libgit2: https://issuetracker.google.com/issues/189629152
(on Android public issue tracker, requires login)
Second, we now cache decrypted symlink targets in ->i_link. Therefore,
taking the performance hit of reading and decrypting the symlink target
in ->getattr() wouldn't be as big a deal as it used to be, since usually
it will just save having to do the same thing later.
Also note that eCryptfs ended up having to read and decrypt symlink
targets in ->getattr() as well, to fix this same issue; see
commit 3a60a1686f0d ("eCryptfs: Decrypt symlink target for stat size").
So, let's just bite the bullet, and read and decrypt the symlink target
in ->getattr() in order to report the correct st_size. Add a function
fscrypt_symlink_getattr() which the filesystems will call to do this.
(Alternatively, we could store the decrypted size of symlinks on-disk.
But there isn't a great place to do so, and encryption is meant to hide
the original size to some extent; that property would be lost.)
Cc: stable(a)vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
---
fs/crypto/hooks.c | 44 +++++++++++++++++++++++++++++++++++++++++
include/linux/fscrypt.h | 7 +++++++
2 files changed, 51 insertions(+)
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index a73b0376e6f3..af74599ae1cf 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -384,3 +384,47 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(fscrypt_get_symlink);
+
+/**
+ * fscrypt_symlink_getattr() - set the correct st_size for encrypted symlinks
+ * @path: the path for the encrypted symlink being queried
+ * @stat: the struct being filled with the symlink's attributes
+ *
+ * Override st_size of encrypted symlinks to be the length of the decrypted
+ * symlink target (or the no-key encoded symlink target, if the key is
+ * unavailable) rather than the length of the encrypted symlink target. This is
+ * necessary for st_size to match the symlink target that userspace actually
+ * sees. POSIX requires this, and some userspace programs depend on it.
+ *
+ * This requires reading the symlink target from disk if needed, setting up the
+ * inode's encryption key if possible, and then decrypting or encoding the
+ * symlink target. This makes lstat() more heavyweight than is normally the
+ * case. However, decrypted symlink targets will be cached in ->i_link, so
+ * usually the symlink won't have to be read and decrypted again later if/when
+ * it is actually followed, readlink() is called, or lstat() is called again.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat)
+{
+ struct dentry *dentry = path->dentry;
+ struct inode *inode = d_inode(dentry);
+ const char *link;
+ DEFINE_DELAYED_CALL(done);
+
+ /*
+ * To get the symlink target that userspace will see (whether it's the
+ * decrypted target or the no-key encoded target), we can just get it in
+ * the same way the VFS does during path resolution and readlink().
+ */
+ link = READ_ONCE(inode->i_link);
+ if (!link) {
+ link = inode->i_op->get_link(dentry, inode, &done);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+ }
+ stat->size = strlen(link);
+ do_delayed_call(&done);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fscrypt_symlink_getattr);
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 2ea1387bb497..b7bfd0cd4f3e 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -253,6 +253,7 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
unsigned int max_size,
struct delayed_call *done);
+int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat);
static inline void fscrypt_set_ops(struct super_block *sb,
const struct fscrypt_operations *s_cop)
{
@@ -583,6 +584,12 @@ static inline const char *fscrypt_get_symlink(struct inode *inode,
return ERR_PTR(-EOPNOTSUPP);
}
+static inline int fscrypt_symlink_getattr(const struct path *path,
+ struct kstat *stat)
+{
+ return -EOPNOTSUPP;
+}
+
static inline void fscrypt_set_ops(struct super_block *sb,
const struct fscrypt_operations *s_cop)
{
--
2.32.0
From: Paul Burton <paulburton(a)google.com>
Currently tgid_map is sized at PID_MAX_DEFAULT entries, which means that
on systems where pid_max is configured higher than PID_MAX_DEFAULT the
ftrace record-tgid option doesn't work so well. Any tasks with PIDs
higher than PID_MAX_DEFAULT are simply not recorded in tgid_map, and
don't show up in the saved_tgids file.
In particular, since systemd v243 & above configure pid_max to its
highest possible value of 1<<22 by default on 64 bit systems, this
renders the record-tgid option of little use.
Increase the size of tgid_map to the configured pid_max instead,
allowing it to cover the full range of PIDs up to the maximum value of
PID_MAX_LIMIT if the system is configured that way.
On 64 bit systems with pid_max == PID_MAX_LIMIT, this will increase the
size of tgid_map from 256KiB to 16MiB. Whilst this 64x increase in
memory overhead sounds significant, 64 bit systems are presumably best
placed to accommodate it, and since tgid_map is only allocated when the
record-tgid option is actually used, presumably the user would rather it
spend sufficient memory to actually record the tgids they expect.
The size of tgid_map could also increase for CONFIG_BASE_SMALL=y
configurations, but these seem unlikely to be systems upon which people
are both configuring a large pid_max and running ftrace with record-tgid
anyway.
Of note is that we only allocate tgid_map once, the first time that the
record-tgid option is enabled. Therefore its size is only set once, to
the value of pid_max at the time the record-tgid option is first
enabled. If a user increases pid_max after that point, the saved_tgids
file will not contain entries for any tasks with pids beyond the earlier
value of pid_max.
Link: https://lkml.kernel.org/r/20210701172407.889626-2-paulburton@google.com
Fixes: d914ba37d714 ("tracing: Add support for recording tgid of tasks")
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Joel Fernandes <joelaf(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Paul Burton <paulburton(a)google.com>
[ Fixed comment coding style ]
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
kernel/trace/trace.c | 63 +++++++++++++++++++++++++++++++++-----------
1 file changed, 47 insertions(+), 16 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4843076d67d3..14f56e9fa001 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2191,8 +2191,15 @@ void tracing_reset_all_online_cpus(void)
}
}
+/*
+ * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
+ * is the tgid last observed corresponding to pid=i.
+ */
static int *tgid_map;
+/* The maximum valid index into tgid_map. */
+static size_t tgid_map_max;
+
#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -2468,24 +2475,41 @@ void trace_find_cmdline(int pid, char comm[])
preempt_enable();
}
+static int *trace_find_tgid_ptr(int pid)
+{
+ /*
+ * Pairs with the smp_store_release in set_tracer_flag() to ensure that
+ * if we observe a non-NULL tgid_map then we also observe the correct
+ * tgid_map_max.
+ */
+ int *map = smp_load_acquire(&tgid_map);
+
+ if (unlikely(!map || pid > tgid_map_max))
+ return NULL;
+
+ return &map[pid];
+}
+
int trace_find_tgid(int pid)
{
- if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT))
- return 0;
+ int *ptr = trace_find_tgid_ptr(pid);
- return tgid_map[pid];
+ return ptr ? *ptr : 0;
}
static int trace_save_tgid(struct task_struct *tsk)
{
+ int *ptr;
+
/* treat recording of idle task as a success */
if (!tsk->pid)
return 1;
- if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT))
+ ptr = trace_find_tgid_ptr(tsk->pid);
+ if (!ptr)
return 0;
- tgid_map[tsk->pid] = tsk->tgid;
+ *ptr = tsk->tgid;
return 1;
}
@@ -5225,6 +5249,8 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
+ int *map;
+
if ((mask == TRACE_ITER_RECORD_TGID) ||
(mask == TRACE_ITER_RECORD_CMD))
lockdep_assert_held(&event_mutex);
@@ -5247,10 +5273,19 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
trace_event_enable_cmd_record(enabled);
if (mask == TRACE_ITER_RECORD_TGID) {
- if (!tgid_map)
- tgid_map = kvcalloc(PID_MAX_DEFAULT + 1,
- sizeof(*tgid_map),
- GFP_KERNEL);
+ if (!tgid_map) {
+ tgid_map_max = pid_max;
+ map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
+ GFP_KERNEL);
+
+ /*
+ * Pairs with smp_load_acquire() in
+ * trace_find_tgid_ptr() to ensure that if it observes
+ * the tgid_map we just allocated then it also observes
+ * the corresponding tgid_map_max value.
+ */
+ smp_store_release(&tgid_map, map);
+ }
if (!tgid_map) {
tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
return -ENOMEM;
@@ -5664,18 +5699,14 @@ static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
{
int pid = ++(*pos);
- if (pid > PID_MAX_DEFAULT)
- return NULL;
-
- return &tgid_map[pid];
+ return trace_find_tgid_ptr(pid);
}
static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
{
- if (!tgid_map || *pos > PID_MAX_DEFAULT)
- return NULL;
+ int pid = *pos;
- return &tgid_map[*pos];
+ return trace_find_tgid_ptr(pid);
}
static void saved_tgids_stop(struct seq_file *m, void *v)
--
2.30.2
From: Paul Burton <paulburton(a)google.com>
The tgid_map array records a mapping from pid to tgid, where the index
of an entry within the array is the pid & the value stored at that index
is the tgid.
The saved_tgids_next() function iterates over pointers into the tgid_map
array & dereferences the pointers which results in the tgid, but then it
passes that dereferenced value to trace_find_tgid() which treats it as a
pid & does a further lookup within the tgid_map array. It seems likely
that the intent here was to skip over entries in tgid_map for which the
recorded tgid is zero, but instead we end up skipping over entries for
which the thread group leader hasn't yet had its own tgid recorded in
tgid_map.
A minimal fix would be to remove the call to trace_find_tgid, turning:
if (trace_find_tgid(*ptr))
into:
if (*ptr)
..but it seems like this logic can be much simpler if we simply let
seq_read() iterate over the whole tgid_map array & filter out empty
entries by returning SEQ_SKIP from saved_tgids_show(). Here we take that
approach, removing the incorrect logic here entirely.
Link: https://lkml.kernel.org/r/20210630003406.4013668-1-paulburton@google.com
Fixes: d914ba37d714 ("tracing: Add support for recording tgid of tasks")
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Joel Fernandes <joelaf(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Paul Burton <paulburton(a)google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
kernel/trace/trace.c | 38 +++++++++++++-------------------------
1 file changed, 13 insertions(+), 25 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 60492464281e..4843076d67d3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5662,37 +5662,20 @@ static const struct file_operations tracing_readme_fops = {
static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
{
- int *ptr = v;
+ int pid = ++(*pos);
- if (*pos || m->count)
- ptr++;
-
- (*pos)++;
-
- for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) {
- if (trace_find_tgid(*ptr))
- return ptr;
- }
+ if (pid > PID_MAX_DEFAULT)
+ return NULL;
- return NULL;
+ return &tgid_map[pid];
}
static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
{
- void *v;
- loff_t l = 0;
-
- if (!tgid_map)
+ if (!tgid_map || *pos > PID_MAX_DEFAULT)
return NULL;
- v = &tgid_map[0];
- while (l <= *pos) {
- v = saved_tgids_next(m, v, &l);
- if (!v)
- return NULL;
- }
-
- return v;
+ return &tgid_map[*pos];
}
static void saved_tgids_stop(struct seq_file *m, void *v)
@@ -5701,9 +5684,14 @@ static void saved_tgids_stop(struct seq_file *m, void *v)
static int saved_tgids_show(struct seq_file *m, void *v)
{
- int pid = (int *)v - tgid_map;
+ int *entry = (int *)v;
+ int pid = entry - tgid_map;
+ int tgid = *entry;
+
+ if (tgid == 0)
+ return SEQ_SKIP;
- seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid));
+ seq_printf(m, "%d %d\n", pid, tgid);
return 0;
}
--
2.30.2
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA512
I'm announcing the release of the 4.14.238 kernel.
All users of the 4.14 kernel series must upgrade.
The updated 4.14.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-4.14.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
Thanks,
Sasha
-----BEGIN PGP SIGNATURE-----
iQIzBAEBCgAdFiEE4n5dijQDou9mhzu83qZv95d3LNwFAmDcdGkACgkQ3qZv95d3
LNycCg/+KmyrChXSyZIUeUT5UVNeEEaR1zjJLORWvuHehW9Q4hcnvRlXuEGO7q5g
U+8XHm8H+hIjkwfpLmim1Jn6hMTx9P8fZ0t45YXkkXmPBoMCSySEiPpAKMaDQPxs
EU5ULrkNtTXiengdR6w13ayuSMSIacPyXFmY20OdzAnhtiXwgv5s9HgRDkcDZomh
M/Fqux6b16fXDS12qUdI7RbNUyJnWkBOm9KpE/zAzyMQlj0r/NUs1T02JS8/gWww
SfwgECLfvoFPNuxXI2Y9WEKQ40xx6Hb/Fzatvs18WjwLC+SvUfwPKlOyP6sogq+N
2kn7eFygkZzyDCL8GYv3ZVd/O8Km4kMWWthehJ/SD6MEzbIVlZmjCISivYA9fZVf
rLqWAdymiRDhJqak1pwsW8fVOBxJJYLMUJy3tv5Zjcg1bBWPrE4VufsntZt9YVdr
evpLVKeOU8p6aCdy7FwN+b/dBPriZt6oesNkhO3OMfW73FBesp8bgdH4Rhs9ISkv
lXb0mjYmE2ZJ6S+vKnRuHVDoiOc8u9fZnQqrCwNzI0QFltCYU9AZGI3cbmVH5a/h
/1TFCC0uVpwWquFuszkfvyItjFRpZhhjYsMZOB7jzCR/EDJakx2S5qsCMNW3bdQt
W5emHwENNkmlLEuRQwuoAbwfOPqV8IK9nO5goh0Sg4G45OzXU20=
=KrR2
-----END PGP SIGNATURE-----
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA512
I'm announcing the release of the 4.19.196 kernel.
All users of the 4.19 kernel series must upgrade.
The updated 4.19.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-4.19.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
Thanks,
Sasha
-----BEGIN PGP SIGNATURE-----
iQIzBAEBCgAdFiEE4n5dijQDou9mhzu83qZv95d3LNwFAmDcdF8ACgkQ3qZv95d3
LNyFdxAAnB9VMQ9XCMIW0KFIuc84l34tHe7bexocQHEGxWFhsJEnyNzGGcZxlY7G
XKlHXzh7QWPWuf82jt7fNSrctyAO6Kun3VF5ucGsixgCStb+0byHHL6F4N9eEdPH
v2h8HA46OTGShHBLsMsFsLLVY335WNSz+YnI3tXmHuMcgckUPDYoS2Zh0GFJTrsF
jW4YCheGMtcJHU80D8pUyEucb4GdyasNHkwW2d5vn7PhKNtr4Tv2rRjF2/EHWdog
RZ18hHzJRI/DwGrVwLXy8hcaVG8qp7b1Se5lRIZIFaYxRjXs7vpqFngabXoq8K1l
Q2OaQNvlKVDNBJnnSumu1mFxZgWdixQ3i9qLVhSvS+yAilP45cHZkkGee+6uchJE
vJc4WVV08NTfUTbPBMdCkHCfLRai06qdkOxf9l4YX5Phs86gi6SCbaKZMA5mUpWr
bw9KJi5UgQLYfETzagJmncCm+BqGEnVzMGn98kUgAcxLWIMerEg7AaIL0Jul0Gig
GElwsTo1O4byu8Ee0sF9nppW6iB+FlBqOVepwy+d7RocaihKyssm87Gwvoy+Py8b
4yH1MGFGP2Un0SyKGUz7t3RY/hD8glmILHs/l+1UTeb8RkgZDy3iw+Cbrcu0CWQq
JM86nlanh0UyklQ2lM51DIZqSWs5J6BHyJyuGNs6CrbjltlGbHo=
=TGQ2
-----END PGP SIGNATURE-----
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA512
I'm announcing the release of the 5.4.129 kernel.
All users of the 5.4 kernel series must upgrade.
The updated 5.4.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-5.4.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
Thanks,
Sasha
-----BEGIN PGP SIGNATURE-----
iQIzBAEBCgAdFiEE4n5dijQDou9mhzu83qZv95d3LNwFAmDcdE8ACgkQ3qZv95d3
LNwMew/9HLxjRRYhe6jCrc66+H2ekkh7TqhkWrgr4zEzYg2k4Xb34MKsq6/jowkw
BuAQdY2R1Hrg3UgjmXMnYw6aCVOyOFbbsmK1A07CjW36a4IFrKaKLmdL0nKJl5Vr
V+y2v8f9uJfJ9ceN6VdUR7hH1bJLRuoTv0klKOBcGEvvInZiZ2qxzLujVCWYD8CF
a8xwvtzt/QWAsPrfMFPa6voLZkLPfNu5sYiqJsboXyt7a9XJYQZ1sJBDfb3HK+uo
c0rPcrbQTthmjYmmfFpU6EfBpRBqHui3aWPOCR4OFRtw/1KcB3NIUqkNyTLXxIWB
7OSfLESDXTKCd+RX+tctOCaIaMtRWT/o7RqQy/knI6VXpJk42o/w9tmMVxIU1zr3
v+kqJd7wQ/B03zei4XhZBTnPhT2tOFNtMFzYvagGyJXR9u0UjDK4Jf8i8ppFOJD5
USkl54p8xzHqnoSHV4SidmiQr6adUlnZhJFcr/0ODTd74+7+08KjhIiNbO+HY0lx
EKrW7lmcXCbr5zvHHJMODbYkgYf0iQ/RawRxU/VZ4GQzFT+92ep0qKENDcDTmE8F
9OBIgdOw6kt7p41LM/H6XxHQMLRnVne/2btUxmg7nWfESTyn+2iPp749X2qhuPeW
ufUQVGqajlQIklGVVcG0sHxfOFjvZEqNQfqEdcP5mK3g0RoIhPw=
=k7Vi
-----END PGP SIGNATURE-----
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA512
I'm announcing the release of the 5.10.47 kernel.
All users of the 5.10 kernel series must upgrade.
The updated 5.10.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-5.10.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
Thanks,
Sasha
-----BEGIN PGP SIGNATURE-----
iQIzBAEBCgAdFiEE4n5dijQDou9mhzu83qZv95d3LNwFAmDcdEIACgkQ3qZv95d3
LNyCpw/+K96+1ahen7kcs2rt3783nti4S33dpn7vKXeD6B2i5IhdqlFk3aCDqshh
aMoy8kNgtXb90GdgPobfWQGJt+1MMBfgxDd38VhqdBovOM1JNkCrVfrrv1lPmw/C
QUAiwxHeOd7why2UUMJaXb7xsAv/ircK+zi5sVImpf6NCgXeKaJOB74kFYH7VI+R
6LRPBWuOpDc3As1I1MOoC0tIXWI7YCictecr1LTDi75REu58x64ty0HN/b6Gj/Io
Y3EGlTe8GRIsChDAArYScCTYCixyN2pj/Loc7vlZUCHb3puQShO4bSNyPsykYxfR
HMB5F5Jf1HaKN0im5rel1sKd2hn0/tWwla8orRIWubXhBPrSxqsJJn642h2o7ZZN
8axd1K8Gd+zpqHZjjl4mYtcJo3A7Cj71r9XGVmfVMowTLs5wiX/30h98PfevlGGv
mj+ybjue3Gypk3ZTaHRifRLDh5KzTJNSMxm1YJcJ7IhsTBsgQXDMuInSgkSG32yz
Ggk/xj+GU4ob43EU92hSc4Cbh6zSUnQ2ac5vQAeKyYqKtAmEGL+hCf3Mnatk0ItS
UUnwXPflRB1eCbI3JFYZ5A0Igp+60WMARjRWb/MbWX1kwMOKlZKl9fUjfeJV6b3e
xURPJruLnZelsIXS8L9Nx1SWu51HnVhhf7D/58CDByqnBCq6oaQ=
=b4hI
-----END PGP SIGNATURE-----
The tgid_map array records a mapping from pid to tgid, where the index
of an entry within the array is the pid & the value stored at that index
is the tgid.
The saved_tgids_next() function iterates over pointers into the tgid_map
array & dereferences the pointers which results in the tgid, but then it
passes that dereferenced value to trace_find_tgid() which treats it as a
pid & does a further lookup within the tgid_map array. It seems likely
that the intent here was to skip over entries in tgid_map for which the
recorded tgid is zero, but instead we end up skipping over entries for
which the thread group leader hasn't yet had its own tgid recorded in
tgid_map.
A minimal fix would be to remove the call to trace_find_tgid, turning:
if (trace_find_tgid(*ptr))
into:
if (*ptr)
..but it seems like this logic can be much simpler if we simply let
seq_read() iterate over the whole tgid_map array & filter out empty
entries by returning SEQ_SKIP from saved_tgids_show(). Here we take that
approach, removing the incorrect logic here entirely.
Signed-off-by: Paul Burton <paulburton(a)google.com>
Fixes: d914ba37d714 ("tracing: Add support for recording tgid of tasks")
Cc: Steven Rostedt <rostedt(a)goodmis.org>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Joel Fernandes <joelaf(a)google.com>
Cc: <stable(a)vger.kernel.org>
---
kernel/trace/trace.c | 38 +++++++++++++-------------------------
1 file changed, 13 insertions(+), 25 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d23a09d3eb37b..9570667310bcc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5608,37 +5608,20 @@ static const struct file_operations tracing_readme_fops = {
static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
{
- int *ptr = v;
+ int pid = ++(*pos);
- if (*pos || m->count)
- ptr++;
-
- (*pos)++;
-
- for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) {
- if (trace_find_tgid(*ptr))
- return ptr;
- }
+ if (pid > PID_MAX_DEFAULT)
+ return NULL;
- return NULL;
+ return &tgid_map[pid];
}
static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
{
- void *v;
- loff_t l = 0;
-
- if (!tgid_map)
+ if (!tgid_map || *pos > PID_MAX_DEFAULT)
return NULL;
- v = &tgid_map[0];
- while (l <= *pos) {
- v = saved_tgids_next(m, v, &l);
- if (!v)
- return NULL;
- }
-
- return v;
+ return &tgid_map[*pos];
}
static void saved_tgids_stop(struct seq_file *m, void *v)
@@ -5647,9 +5630,14 @@ static void saved_tgids_stop(struct seq_file *m, void *v)
static int saved_tgids_show(struct seq_file *m, void *v)
{
- int pid = (int *)v - tgid_map;
+ int *entry = (int *)v;
+ int pid = entry - tgid_map;
+ int tgid = *entry;
+
+ if (tgid == 0)
+ return SEQ_SKIP;
- seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid));
+ seq_printf(m, "%d %d\n", pid, tgid);
return 0;
}
base-commit: 62fb9874f5da54fdb243003b386128037319b219
--
2.32.0.93.g670b81a890-goog
Hello,
I am encountering an issue where my system freezes entirely after a
random but short time after boot-up: the computer does not react to any
user input, be it mouse, keyboard or plugging-unplugging peripherals.
The image on screen remains still and does not go black. Any sound that
was playing at the moment of the freeze loops indefinitely with a period
of around 1s. I have the intuition that the issue is sound related
because of that.
Some additional information with this issue
- Only happens with the latest stable releases of the kernel:
5.12.9 and 5.12.10. 5.12.8 does not have this issue. I have not
tested other kernel versions, e.g. LTS 5.10, to see if it's a change
that got back-ported.
- Happens in Gentoo and archlinux with their respective official kernel
binary release. But also self built, with Archlinux's upstream .config
file and also stripped down versions (please find an example .config
attached as config-stripped). Happens on Gnome on Wayland and Xorg and
on LXQt (Openbox).
- The output of `journalctl` stops before the freeze happens, I suppose
it's because nothing can be saved in the disk when it happens. Please
find the output of journalctl for a boot where the freeze happens
attached to this mail as `journaloutput`. This should give all the
necessary information about my system. I just realized that I have extra
boot kernel parameters, maybe removing the extra ones works around the
issue and would help pinpoint the issue. I will report back if I have
any extra information
Given the above information I believe this issue is best reported to
you. I unfortunately do not have more information to report for proper
pinpointing and I am willing to work from your directed feedback. My
kernel knowledge is limited but I can probably deal with technical
requests from you given enough time and documentation.
Thank you for your help and I hope I am not wasting your time with
something
Kind regards,
Adel KARA SLIMANE
I noticed that the reverts below were made to the 4.9 and 5.4
branches, but I do not see them elsewhere (other stable branches,
latest, etc). The original commits which were subsequently reverted on
the 4.9 and 5.4 branches were causing our cable-modem drivers memremap
calls to fail so we need these reverted everywhere like they are on
the 4.9 and 5.4 branches. Is that the plan?
-Tim
commit 6b183fbf18b91bc3c1fd02d5a48f7bc447d900ce (drivers/of/fdt.c)
Author: Quentin Perret <qperret(a)google.com>
Date: Wed May 12 12:28:53 2021 +0000
Revert "fdt: Properly handle "no-map" field in the memory region"
This reverts commit fb326c6ce0dcbb6273202c6e012759754ec8538d.
It is not really a fix, and the backport misses dependencies, which
breaks existing platforms.
Reported-by: Alexandre TORGUE <alexandre.torgue(a)foss.st.com>
Signed-off-by: Quentin Perret <qperret(a)google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
commit 66b8853dfa3cfbbe6c3ab643b6989377ad16662a
Author: Quentin Perret <qperret(a)google.com>
Date: Wed May 12 12:28:52 2021 +0000
Revert "of/fdt: Make sure no-map does not remove already reserved regions"
This reverts commit 3cbd3038c9155038020560729cde50588311105d.
It is not really a fix, and the backport misses dependencies, which
breaks existing platforms.
Reported-by: Alexandre TORGUE <alexandre.torgue(a)foss.st.com>
Signed-off-by: Quentin Perret <qperret(a)google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
commit 3cbd3038c9155038020560729cde50588311105d
Author: Nicolas Boichat <drinkcat(a)chromium.org>
Date: Fri Jan 15 11:45:44 2021 +0000
of/fdt: Make sure no-map does not remove already reserved regions
[ Upstream commit 8a5a75e5e9e55de1cef5d83ca3589cb4899193ef ]
If the device tree is incorrectly configured, and attempts to
define a "no-map" reserved memory that overlaps with the kernel
data/code, the kernel would crash quickly after boot, with no
obvious clue about the nature of the issue.
For example, this would happen if we have the kernel mapped at
these addresses (from /proc/iomem):
40000000-41ffffff : System RAM
40080000-40dfffff : Kernel code
40e00000-411fffff : reserved
41200000-413e0fff : Kernel data
And we declare a no-map shared-dma-pool region at a fixed address
within that range:
mem_reserved: mem_region {
compatible = "shared-dma-pool";
reg = <0 0x40000000 0 0x01A00000>;
no-map;
};
To fix this, when removing memory regions at early boot (which is
what "no-map" regions do), we need to make sure that the memory
is not already reserved. If we do, __reserved_mem_reserve_reg
will throw an error:
[ 0.000000] OF: fdt: Reserved memory: failed to reserve memory
for node 'mem_region': base 0x0000000040000000, size 26 MiB
and the code that will try to use the region should also fail,
later on.
We do not do anything for non-"no-map" regions, as memblock
explicitly allows reserved regions to overlap, and the commit
that this fixes removed the check for that precise reason.
[ qperret: fixed conflicts caused by the usage of memblock_mark_nomap ]
Fixes: 094cb98179f19b7 ("of/fdt: memblock_reserve /memreserve/
regions in the case of partial overlap")
Signed-off-by: Nicolas Boichat <drinkcat(a)chromium.org>
Reviewed-by: Stephen Boyd <swboyd(a)chromium.org>
Signed-off-by: Quentin Perret <qperret(a)google.com>
Link: https://lore.kernel.org/r/20210115114544.1830068-3-qperret@google.com
Signed-off-by: Rob Herring <robh(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
commit fb326c6ce0dcbb6273202c6e012759754ec8538d
Author: KarimAllah Ahmed <karahmed(a)amazon.de>
Date: Fri Jan 15 11:45:43 2021 +0000
fdt: Properly handle "no-map" field in the memory region
[ Upstream commit 86588296acbfb1591e92ba60221e95677ecadb43 ]
Mark the memory region with NOMAP flag instead of completely removing it
from the memory blocks. That makes the FDT handling consistent with the EFI
memory map handling.
Cc: Rob Herring <robh+dt(a)kernel.org>
Cc: Frank Rowand <frowand.list(a)gmail.com>
Cc: devicetree(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Signed-off-by: KarimAllah Ahmed <karahmed(a)amazon.de>
Signed-off-by: Quentin Perret <qperret(a)google.com>
Link: https://lore.kernel.org/r/20210115114544.1830068-2-qperret@google.com
Signed-off-by: Rob Herring <robh(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
--
This electronic communication and the information and any files transmitted
with it, or attached to it, are confidential and are intended solely for
the use of the individual or entity to whom it is addressed and may contain
information that is confidential, legally privileged, protected by privacy
laws, or otherwise restricted from disclosure to anyone else. If you are
not the intended recipient or the person responsible for delivering the
e-mail to the intended recipient, you are hereby notified that any use,
copying, distributing, dissemination, forwarding, printing, or copying of
this e-mail is strictly prohibited. If you received this e-mail in error,
please return the e-mail to the sender, delete it from your computer, and
destroy any printed copy of it.
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
All internal use cases for tracepoint_probe_register() are set to not ever
be called with the same function and data. If it is, it is considered a
bug, as that means the accounting of handling tracepoints is corrupted.
If the function and data for a tracepoint are already registered when
tracepoint_probe_register() is called, it will call WARN_ON_ONCE() and
return with EEXIST.
The BPF system call can end up calling tracepoint_probe_register() with
the same data, which now means that this can trigger the warning because
of a user space process. As WARN_ON_ONCE() should not be called because
user space called a system call with bad data, there needs to be a way to
register a tracepoint without triggering a warning.
Enter tracepoint_probe_register_may_exist(), which can be called, but will
not cause a WARN_ON() if the probe already exists. It will still error out
with EEXIST, which will then be sent to the user space that performed the
BPF system call.
This keeps the previous testing for issues with other users of the
tracepoint code, while letting BPF call it with duplicated data and not
warn about it.
Link: https://lore.kernel.org/lkml/20210626135845.4080-1-penguin-kernel@I-love.SA…
Link: https://syzkaller.appspot.com/bug?id=41f4318cf01762389f4d1c1c459da4f542fe51…
Cc: stable(a)vger.kernel.org
Fixes: c4f6699dfcb85 ("bpf: introduce BPF_RAW_TRACEPOINT")
Reported-by: syzbot <syzbot+721aa903751db87aa244(a)syzkaller.appspotmail.com>
Reported-by: Tetsuo Handa <penguin-kernel(a)I-love.SAKURA.ne.jp>
Tested-by: syzbot+721aa903751db87aa244(a)syzkaller.appspotmail.com
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
include/linux/tracepoint.h | 10 ++++++++++
kernel/trace/bpf_trace.c | 3 ++-
kernel/tracepoint.c | 33 ++++++++++++++++++++++++++++++---
3 files changed, 42 insertions(+), 4 deletions(-)
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 13f65420f188..ab58696d0ddd 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -41,7 +41,17 @@ extern int
tracepoint_probe_register_prio(struct tracepoint *tp, void *probe, void *data,
int prio);
extern int
+tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe, void *data,
+ int prio);
+extern int
tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data);
+static inline int
+tracepoint_probe_register_may_exist(struct tracepoint *tp, void *probe,
+ void *data)
+{
+ return tracepoint_probe_register_prio_may_exist(tp, probe, data,
+ TRACEPOINT_DEFAULT_PRIO);
+}
extern void
for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv),
void *priv);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 7a52bc172841..f0568b3d6bd1 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1840,7 +1840,8 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
if (prog->aux->max_tp_access > btp->writable_size)
return -EINVAL;
- return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog);
+ return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
+ prog);
}
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 9f478d29b926..976bf8ce8039 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -273,7 +273,8 @@ static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func
* Add the probe function to a tracepoint.
*/
static int tracepoint_add_func(struct tracepoint *tp,
- struct tracepoint_func *func, int prio)
+ struct tracepoint_func *func, int prio,
+ bool warn)
{
struct tracepoint_func *old, *tp_funcs;
int ret;
@@ -288,7 +289,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
lockdep_is_held(&tracepoints_mutex));
old = func_add(&tp_funcs, func, prio);
if (IS_ERR(old)) {
- WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
+ WARN_ON_ONCE(warn && PTR_ERR(old) != -ENOMEM);
return PTR_ERR(old);
}
@@ -343,6 +344,32 @@ static int tracepoint_remove_func(struct tracepoint *tp,
return 0;
}
+/**
+ * tracepoint_probe_register_prio_may_exist - Connect a probe to a tracepoint with priority
+ * @tp: tracepoint
+ * @probe: probe handler
+ * @data: tracepoint data
+ * @prio: priority of this function over other registered functions
+ *
+ * Same as tracepoint_probe_register_prio() except that it will not warn
+ * if the tracepoint is already registered.
+ */
+int tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe,
+ void *data, int prio)
+{
+ struct tracepoint_func tp_func;
+ int ret;
+
+ mutex_lock(&tracepoints_mutex);
+ tp_func.func = probe;
+ tp_func.data = data;
+ tp_func.prio = prio;
+ ret = tracepoint_add_func(tp, &tp_func, prio, false);
+ mutex_unlock(&tracepoints_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio_may_exist);
+
/**
* tracepoint_probe_register_prio - Connect a probe to a tracepoint with priority
* @tp: tracepoint
@@ -366,7 +393,7 @@ int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe,
tp_func.func = probe;
tp_func.data = data;
tp_func.prio = prio;
- ret = tracepoint_add_func(tp, &tp_func, prio);
+ ret = tracepoint_add_func(tp, &tp_func, prio, true);
mutex_unlock(&tracepoints_mutex);
return ret;
}
--
2.30.2