On Sun, Nov 19, 2023 at 08:12:48PM -0500, Stefan Berger wrote:
> Documentation/filesystems/ramfs-rootfs-initramfs.rst states:
>
> If CONFIG_TMPFS is enabled, rootfs will use tmpfs instead of ramfs by
> default. To force ramfs, add "rootfstype=ramfs" to the kernel command
> line.
>
> This currently does not work when root= is provided since then
> saved_root_name contains a string and rootfstype= is ignored. Therefore,
> ramfs is currently always chosen when …
[View More]root= is provided.
Maybe it is a good idea to just fully remove ramfs? initramfs will
always be tmpfs. And tmpfs will always be enabled.
As well as I understand, ramfs was originally introduced, because
tmpfs seemed too big. So, it seemed to be a good idea to have small fs
(ramfs), which is always enabled.
I just did an experiment. I compiled the kernel with a very small
config. And without TMPFS and SHMEM. I got 1059440 bytes image. Then I
enabled TMPFS and SHMEM, and I got 1072976 bytes. So tmpfs adds 13536
bytes, i. e. 14k, which is a very small amount. It adds 1.3 % to the
kernel even with very small config.
So I propose to remove ramfs and always enable tmpfs. This will
decrease complexity.
Here are my configs (x86_64). Just enough to run busybox in "qemu -serial stdio"
make KCONFIG_ALLCONFIG="$FILE" allnoconfig
CONFIG_64BIT=y
CONFIG_PRINTK=y
CONFIG_SERIAL_8250=y
CONFIG_TTY=y
CONFIG_SERIAL_8250_CONSOLE=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_RD_GZIP=y
CONFIG_BINFMT_ELF=y
CONFIG_EMBEDDED=y
CONFIG_EXPERT=y
CONFIG_TMPFS=n # Try to change this to "y"
CONFIG_SHMEM=n # Try to change this to "y"
Here is full docker reproducer:
# Reproducible
# 20230227 = 20230227T000000Z = 20230226T090712Z
FROM debian:sid-20230227
ENV LC_ALL C.UTF-8
RUN sed -i 's~^URIs:.*$~URIs:
http://snapshot.debian.org/archive/debian/20230226T090712Z~'
/etc/apt/sources.list.d/debian.sources
RUN echo 'Acquire::Check-Valid-Until "false";' >
/etc/apt/apt.conf.d/02acquire-check-valid-until
RUN apt-get update && apt-get install -y apt-utils whiptail
RUN apt-get update && apt-get install -y busybox-static
qemu-system-x86 make gcc git flex bison bc libelf-dev less nano cpio
RUN git clone --depth=1 -b v6.2
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
RUN : > /.config
# See Linux f8f0d06438e5c810d1e13b5f8c2fed501fe36e9c
RUN echo 'CONFIG_64BIT=y' >> /.config
RUN echo 'CONFIG_PRINTK=y' >> /.config
RUN echo 'CONFIG_SERIAL_8250=y' >> /.config
RUN echo 'CONFIG_TTY=y' >> /.config
RUN echo 'CONFIG_SERIAL_8250_CONSOLE=y' >> /.config
RUN echo 'CONFIG_BLK_DEV_INITRD=y' >> /.config
RUN echo 'CONFIG_RD_GZIP=y' >> /.config
RUN echo 'CONFIG_BINFMT_ELF=y' >> /.config
RUN echo 'CONFIG_EMBEDDED=y' >> /.config
RUN echo 'CONFIG_EXPERT=y' >> /.config
RUN echo 'CONFIG_TMPFS=y' >> /.config # try "n"
RUN echo 'CONFIG_SHMEM=y' >> /.config # try "n"
RUN cd linux && make KCONFIG_ALLCONFIG=/.config allnoconfig
RUN cd linux && make -j4
RUN mkdir /initramfs && cp /bin/busybox /initramfs && cd /initramfs &&
ln -s busybox sh && find . | cpio --create --format=newc --quiet |
gzip > /initramfs.cpio.gz
RUN echo "qemu-system-x86_64 -M microvm -m 64M -serial stdio -display
none -kernel /linux/arch/x86/boot/bzImage -initrd /initramfs.cpio.gz
-append 'quiet console=ttyS0 earlyprintk=ttyS0 rdinit=/sh' -nodefaults
-no-user-config" > /root/.bash_history
--
Askar Safin
[View Less]
After updating bb_free in mb_free_blocks, it is possible to return without
updating bb_fragments because the block being freed is found to have
already been freed, which leads to inconsistency between bb_free and
bb_fragments.
Since the group may be unlocked in ext4_grp_locked_error(), this can lead
to problems such as dividing by zero when calculating the average fragment
length. Hence move the update of bb_free to after the block double-free
check guarantees that the corresponding statistics …
[View More]are updated only after
the core block bitmap is modified.
Fixes: eabe0444df90 ("ext4: speed-up releasing blocks on commit")
CC: stable(a)vger.kernel.org # 3.10
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
---
fs/ext4/mballoc.c | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f6131ba514c8..1f15774971d7 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1910,11 +1910,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
- this_cpu_inc(discard_pa_seq);
- e4b->bd_info->bb_free += count;
- if (first < e4b->bd_info->bb_first_free)
- e4b->bd_info->bb_first_free = first;
-
/* access memory sequentially: check left neighbour,
* clear range and then check right neighbour
*/
@@ -1941,10 +1936,16 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
} else {
mb_regenerate_buddy(e4b);
+ goto check;
}
- goto done;
+ return;
}
+ this_cpu_inc(discard_pa_seq);
+ e4b->bd_info->bb_free += count;
+ if (first < e4b->bd_info->bb_first_free)
+ e4b->bd_info->bb_first_free = first;
+
/* let's maintain fragments counter */
if (left_is_free && right_is_free)
e4b->bd_info->bb_fragments--;
@@ -1969,9 +1970,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
if (first <= last)
mb_buddy_mark_free(e4b, first >> 1, last >> 1);
-done:
mb_set_largest_free_order(sb, e4b->bd_info);
mb_update_avg_fragment_size(sb, e4b->bd_info);
+check:
mb_check_buddy(e4b);
}
--
2.31.1
[View Less]
In ext4_move_extents(), moved_len is only updated when all moves are
successfully executed, and only discards orig_inode and donor_inode
preallocations when moved_len is not zero. When the loop fails to exit
after successfully moving some extents, moved_len is not updated and
remains at 0, so it does not discard the preallocations.
If the moved extents overlap with the preallocated extents, the
overlapped extents are freed twice in ext4_mb_release_inode_pa() and
ext4_process_freed_data() (as …
[View More]described in commit 94d7c16cbbbd ("ext4:
Fix double-free of blocks with EXT4_IOC_MOVE_EXT")), and bb_free is
incremented twice. Hence when trim is executed, a zero-division bug is
triggered in mb_update_avg_fragment_size() because bb_free is not zero
and bb_fragments is zero.
Therefore, update move_len after each extent move to avoid the issue.
Reported-by: Wei Chen <harperchen1110(a)gmail.com>
Reported-by: xingwei lee <xrivendell7(a)gmail.com>
Closes: https://lore.kernel.org/r/CAO4mrferzqBUnCag8R3m2zf897ts9UEuhjFQGPtODT92rYyR…
Fixes: fcf6b1b729bc ("ext4: refactor ext4_move_extents code base")
CC: stable(a)vger.kernel.org # 3.18
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
---
fs/ext4/move_extent.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3aa57376d9c2..391efa6d4c56 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -618,6 +618,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
goto out;
o_end = o_start + len;
+ *moved_len = 0;
while (o_start < o_end) {
struct ext4_extent *ex;
ext4_lblk_t cur_blk, next_blk;
@@ -672,7 +673,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
*/
ext4_double_up_write_data_sem(orig_inode, donor_inode);
/* Swap original branches with new branches */
- move_extent_per_page(o_filp, donor_inode,
+ *moved_len += move_extent_per_page(o_filp, donor_inode,
orig_page_index, donor_page_index,
offset_in_page, cur_len,
unwritten, &ret);
@@ -682,9 +683,6 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
o_start += cur_len;
d_start += cur_len;
}
- *moved_len = o_start - orig_blk;
- if (*moved_len > len)
- *moved_len = len;
out:
if (*moved_len) {
--
2.31.1
[View Less]
Sending this again because Apple mail decided to default to HTML
mails since the last update apparently *sigh*
Hi,
> On Dec 27, 2023, at 11:30, Johan Hovold <johan(a)kernel.org> wrote:
>
> On Mon, Dec 25, 2023 at 09:26:05PM +0100, Paul Menzel wrote:
>
>> Thank you very much for the patch. I am adding Johan to Cc field.
>
> Thanks for the report. Guess I could use a break from the proverbial
> eggnog.
>
>> Am 25.12.23 um 21:01 schrieb Felix …
[View More]Zhang:
>>> Starting v6.5, Bluetooth does not work at all on my T2 MacBookAir9,1
>>> with the BCM4377 chip. When I boot up the computer, go into
>> Somehow a blank line snug in above.
>>> bluetoothctl, and then try to run commands like scan on, show, list,
>>> it returns "No default controller available." I have tried reloading
>>> the
>> It’d be great if you reflowed for 75 characters per line (also below).
>>> kernel module, in which the log outputs "{Added,Removed} hci0
>>> (unconfigured)." With this patch, I am able to use Bluetooth as
>>> normal
>>> without any errors regarding hci0 being unconfigured. However, an
>>> issue is still present where sometimes hci_bcm4377 will have to be
>>> reloaded in order to get bluetooth to work. I believe this was still
>>> present before the previously mentioned commit.
>>> Due to the bit HCI_QUIRK_USE_BDADDR_PROPERTY being always set in
>>> drivers/bluetooth/hci_bcm4377.c (line 2371), the chip would be left
>>> unconfigured on kernels compiled after commit 6945795bc81a
>>> ("Bluetooth:
>>> fix use-bdaddr-property quirk") due to a change in its logic. On the
>>> M1 Macs, the device would be configured in the devicetree. However,
>>> that is not the case on T2 Macs. Because the bluetooth adapter is
>>> left
>>> unconfigured, it is not usable in the operating system. In order to
>>> circumvent this issue, a flag is added to prevent the bit from being
>>> set on the BCM4377, while setting it on the other devices.
>
> The commit you tracked this down to restored the original semantics for
> HCI_QUIRK_USE_BDADDR_PROPERTY, which means that it should only be set
> for devices with an invalid address.
>
> The Broadcom BCM4377 driver has so far been setting this flag
> unconditionally which now potentially results in also valid addresses
> being marked as invalid.
>
> I've just sent a patch that makes sure to only mark invalid addresses as
> invalid:
>
> https://lore.kernel.org/lkml/20231227101003.10534-1-johan+linaro@kernel.org/
>
> Note however that the flag still needs to be set in case your device
> lacks storage for a unique device address so you cannot simply drop it
> for some device classes as you do below (unless you are certain that
> these devices will always have a valid address).
We do know that though.
BCM4377 is present on Apple’s x86 Macs and always has internal storage
for the address. If the board comes up without an address there’s nothing
much we can do because the address isn’t provided by ACPI or anything
else and setting the invalid address quirk for that situation seems appropriate.
BCM4378/4387 is present on Apple’s ARM Macs and never has internal storage.
The address is always provided by our bootloader in the device tree.
These should always unconditionally set HCI_QUIRK_USE_BDADDR_PROPERTY
just like this patch does.
Best,
Sven
[View Less]
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 926fe783c8a64b33997fec405cf1af3e61aed441
# <…
[View More]resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023102922-handwrite-unpopular-0e1d@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 926fe783c8a64b33997fec405cf1af3e61aed441 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii(a)kernel.org>
Date: Fri, 27 Oct 2023 16:31:26 -0700
Subject: [PATCH] tracing/kprobes: Fix symbol counting logic by looking at
modules as well
Recent changes to count number of matching symbols when creating
a kprobe event failed to take into account kernel modules. As such, it
breaks kprobes on kernel module symbols, by assuming there is no match.
Fix this my calling module_kallsyms_on_each_symbol() in addition to
kallsyms_on_each_match_symbol() to perform a proper counting.
Link: https://lore.kernel.org/all/20231027233126.2073148-1-andrii@kernel.org/
Cc: Francis Laniel <flaniel(a)linux.microsoft.com>
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Steven Rostedt <rostedt(a)goodmis.org>
Fixes: b022f0c7e404 ("tracing/kprobes: Return EADDRNOTAVAIL when func matches several symbols")
Signed-off-by: Andrii Nakryiko <andrii(a)kernel.org>
Acked-by: Song Liu <song(a)kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat(a)kernel.org>
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 95c5b0668cb7..e834f149695b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -714,14 +714,30 @@ static int count_symbols(void *data, unsigned long unused)
return 0;
}
+struct sym_count_ctx {
+ unsigned int count;
+ const char *name;
+};
+
+static int count_mod_symbols(void *data, const char *name, unsigned long unused)
+{
+ struct sym_count_ctx *ctx = data;
+
+ if (strcmp(name, ctx->name) == 0)
+ ctx->count++;
+
+ return 0;
+}
+
static unsigned int number_of_same_symbols(char *func_name)
{
- unsigned int count;
+ struct sym_count_ctx ctx = { .count = 0, .name = func_name };
+
+ kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count);
- count = 0;
- kallsyms_on_each_match_symbol(count_symbols, func_name, &count);
+ module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx);
- return count;
+ return ctx.count;
}
static int __trace_kprobe_create(int argc, const char *argv[])
[View Less]
Hacks to mask a race between HBA scan job and bridge re-configuration(s)
during hotplug.
I don't like it a bit but it something that could be done quickly
and solves problems that were reported.
Other options to discuss/possibly more invasive:
1: make sure pci_assign_unassigned_bridge_resources() doesn't reconfigure
bridge if it's not necessary.
2. make SCSI_SCAN_ASYNC job wait till hotplug is finished for all slots on
the bridge or somehow restart the job if it fails
3. any other …
[View More]ideas?
1st reported: https://lore.kernel.org/r/9eb669c0-d8f2-431d-a700-6da13053ae54@proxmox.com
CC: Dongli Zhang <dongli.zhang(a)oracle.com>
CC: linux-acpi(a)vger.kernel.org
CC: linux-pci(a)vger.kernel.org
CC: imammedo(a)redhat.com
CC: mst(a)redhat.com
CC: rafael(a)kernel.org
CC: lenb(a)kernel.org
CC: bhelgaas(a)google.com
CC: mika.westerberg(a)linux.intel.com
CC: boris.ostrovsky(a)oracle.com
CC: joe.jin(a)oracle.com
CC: stable(a)vger.kernel.org
CC: linux-kernel(a)vger.kernel.org
CC: Fiona Ebner <f.ebner(a)proxmox.com>
CC: Thomas Lamprecht <t.lamprecht(a)proxmox.com>
Igor Mammedov (2):
PCI: acpiphp: enable slot only if it hasn't been enabled already
PCI: acpiphp: slowdown hotplug if hotplugging multiple devices at a
time
drivers/pci/hotplug/acpiphp_glue.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
--
2.39.3
[View Less]
Hi!
The Tegra20 requires an enabled VDE power domain during startup. As the
VDE is currently not used, it is disabled during runtime.
Since 8f0c714ad9be, there is a workaround for the "normal restart path"
which enables the VDE before doing PMC's warm reboot. This workaround is
not executed in the "emergency restart path", leading to a hang-up
during start.
This series implements and registers a new pmic-based restart handler
for boards with tps6586x. This cold reboot ensures that the VDE …
[View More]power
domain is enabled during startup on tegra20-based boards.
Since bae1d3a05a8b, i2c transfers are non-atomic while preemption is
disabled (which is e.g. done during panic()). This could lead to
warnings ("Voluntary context switch within RCU") in i2c-based restart
handlers during emergency restart. The state of preemption should be
detected by i2c_in_atomic_xfer_mode() to use atomic i2c xfer when
required. Beside the new system_state check, the check is the same as
the one pre v5.2.
---
v7:
- 5/5: drop mode check (suggested by Dmitry)
- Link to v6: https://lore.kernel.org/r/20230327-tegra-pmic-reboot-v6-0-af44a4cd82e9@skid…
v6:
- drop 4/6 to abort restart on unexpected failure (suggested by Dmitry)
- 4,5: fix comments in handlers (suggested by Lee)
- 4,5: same delay for both handlers (suggested by Lee)
v5:
- introduce new 3 & 4, therefore 3 -> 5, 4 -> 6
- 3: provide dev to sys_off handler, if it is known
- 4: return NOTIFY_DONE from sys_off_notify, to avoid skipping
- 5: drop Reviewed-by from Dmitry, add poweroff timeout
- 5,6: return notifier value instead of direct errno from handler
- 5,6: use new dev field instead of passing dev as cb_data
- 5,6: increase timeout values based on error observations
- 6: skip unsupported reboot modes in restart handler
---
Benjamin Bara (5):
kernel/reboot: emergency_restart: set correct system_state
i2c: core: run atomic i2c xfer when !preemptible
kernel/reboot: add device to sys_off_handler
mfd: tps6586x: use devm-based power off handler
mfd: tps6586x: register restart handler
drivers/i2c/i2c-core.h | 2 +-
drivers/mfd/tps6586x.c | 50 ++++++++++++++++++++++++++++++++++++++++++--------
include/linux/reboot.h | 3 +++
kernel/reboot.c | 4 ++++
4 files changed, 50 insertions(+), 9 deletions(-)
---
base-commit: 197b6b60ae7bc51dd0814953c562833143b292aa
change-id: 20230327-tegra-pmic-reboot-4175ff814a4b
Best regards,
--
Benjamin Bara <benjamin.bara(a)skidata.com>
[View Less]
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x b86f4b790c998afdbc88fe1aa55cfe89c4068726
# <…
[View More]resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023123006-koala-satirical-e348@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
b86f4b790c99 ("dm-integrity: don't modify bio's immutable bio_vec in integrity_metadata()")
86a3238c7b9b ("dm: change "unsigned" to "unsigned int"")
7533afa1d27b ("dm: send just one event on resize, not two")
5cd6d1d53a1f ("dm integrity: Remove bi_sector that's only used by commented debug code")
22c40e134c4c ("dm cache: Add some documentation to dm-cache-background-tracker.h")
86e4d3e8d183 ("dm-crypt: provide dma_alignment limit in io_hints")
c3adefb5baf3 ("Merge tag 'for-6.0/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b86f4b790c998afdbc88fe1aa55cfe89c4068726 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka(a)redhat.com>
Date: Tue, 5 Dec 2023 16:39:16 +0100
Subject: [PATCH] dm-integrity: don't modify bio's immutable bio_vec in
integrity_metadata()
__bio_for_each_segment assumes that the first struct bio_vec argument
doesn't change - it calls "bio_advance_iter_single((bio), &(iter),
(bvl).bv_len)" to advance the iterator. Unfortunately, the dm-integrity
code changes the bio_vec with "bv.bv_len -= pos". When this code path
is taken, the iterator would be out of sync and dm-integrity would
report errors. This happens if the machine is out of memory and
"kmalloc" fails.
Fix this bug by making a copy of "bv" and changing the copy instead.
Fixes: 7eada909bfd7 ("dm: add integrity target")
Cc: stable(a)vger.kernel.org # v4.12+
Signed-off-by: Mikulas Patocka <mpatocka(a)redhat.com>
Signed-off-by: Mike Snitzer <snitzer(a)kernel.org>
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index e85c688fd91e..c5f03aab4552 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -1755,11 +1755,12 @@ static void integrity_metadata(struct work_struct *w)
sectors_to_process = dio->range.n_sectors;
__bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) {
+ struct bio_vec bv_copy = bv;
unsigned int pos;
char *mem, *checksums_ptr;
again:
- mem = bvec_kmap_local(&bv);
+ mem = bvec_kmap_local(&bv_copy);
pos = 0;
checksums_ptr = checksums;
do {
@@ -1768,7 +1769,7 @@ static void integrity_metadata(struct work_struct *w)
sectors_to_process -= ic->sectors_per_block;
pos += ic->sectors_per_block << SECTOR_SHIFT;
sector += ic->sectors_per_block;
- } while (pos < bv.bv_len && sectors_to_process && checksums != checksums_onstack);
+ } while (pos < bv_copy.bv_len && sectors_to_process && checksums != checksums_onstack);
kunmap_local(mem);
r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
@@ -1793,9 +1794,9 @@ static void integrity_metadata(struct work_struct *w)
if (!sectors_to_process)
break;
- if (unlikely(pos < bv.bv_len)) {
- bv.bv_offset += pos;
- bv.bv_len -= pos;
+ if (unlikely(pos < bv_copy.bv_len)) {
+ bv_copy.bv_offset += pos;
+ bv_copy.bv_len -= pos;
goto again;
}
}
[View Less]
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x b803d7c664d55705831729d2f2e29c874bcd62ea
# <…
[View More]resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023123047-tuesday-whooping-6ae3@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
b803d7c664d5 ("ring-buffer: Fix slowpath of interrupted event")
0aa0e5289cfe ("ring-buffer: Have rb_time_cmpxchg() set the msb counter too")
fff88fa0fbc7 ("ring-buffer: Fix a race in rb_time_cmpxchg() for 32 bit archs")
00a8478f8f5c ("ring_buffer: Use try_cmpxchg instead of cmpxchg")
bc92b9562abc ("ring_buffer: Change some static functions to bool")
88ca6a71dcab ("ring-buffer: Handle resize in early boot up")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b803d7c664d55705831729d2f2e29c874bcd62ea Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
Date: Mon, 18 Dec 2023 23:07:12 -0500
Subject: [PATCH] ring-buffer: Fix slowpath of interrupted event
To synchronize the timestamps with the ring buffer reservation, there are
two timestamps that are saved in the buffer meta data.
1. before_stamp
2. write_stamp
When the two are equal, the write_stamp is considered valid, as in, it may
be used to calculate the delta of the next event as the write_stamp is the
timestamp of the previous reserved event on the buffer.
This is done by the following:
/*A*/ w = current position on the ring buffer
before = before_stamp
after = write_stamp
ts = read current timestamp
if (before != after) {
write_stamp is not valid, force adding an absolute
timestamp.
}
/*B*/ before_stamp = ts
/*C*/ write = local_add_return(event length, position on ring buffer)
if (w == write - event length) {
/* Nothing interrupted between A and C */
/*E*/ write_stamp = ts;
delta = ts - after
/*
* If nothing interrupted again,
* before_stamp == write_stamp and write_stamp
* can be used to calculate the delta for
* events that come in after this one.
*/
} else {
/*
* The slow path!
* Was interrupted between A and C.
*/
This is the place that there's a bug. We currently have:
after = write_stamp
ts = read current timestamp
/*F*/ if (write == current position on the ring buffer &&
after < ts && cmpxchg(write_stamp, after, ts)) {
delta = ts - after;
} else {
delta = 0;
}
The assumption is that if the current position on the ring buffer hasn't
moved between C and F, then it also was not interrupted, and that the last
event written has a timestamp that matches the write_stamp. That is the
write_stamp is valid.
But this may not be the case:
If a task context event was interrupted by softirq between B and C.
And the softirq wrote an event that got interrupted by a hard irq between
C and E.
and the hard irq wrote an event (does not need to be interrupted)
We have:
/*B*/ before_stamp = ts of normal context
---> interrupted by softirq
/*B*/ before_stamp = ts of softirq context
---> interrupted by hardirq
/*B*/ before_stamp = ts of hard irq context
/*E*/ write_stamp = ts of hard irq context
/* matches and write_stamp valid */
<----
/*E*/ write_stamp = ts of softirq context
/* No longer matches before_stamp, write_stamp is not valid! */
<---
w != write - length, go to slow path
// Right now the order of events in the ring buffer is:
//
// |-- softirq event --|-- hard irq event --|-- normal context event --|
//
after = write_stamp (this is the ts of softirq)
ts = read current timestamp
if (write == current position on the ring buffer [true] &&
after < ts [true] && cmpxchg(write_stamp, after, ts) [true]) {
delta = ts - after [Wrong!]
The delta is to be between the hard irq event and the normal context
event, but the above logic made the delta between the softirq event and
the normal context event, where the hard irq event is between the two. This
will shift all the remaining event timestamps on the sub-buffer
incorrectly.
The write_stamp is only valid if it matches the before_stamp. The cmpxchg
does nothing to help this.
Instead, the following logic can be done to fix this:
before = before_stamp
ts = read current timestamp
before_stamp = ts
after = write_stamp
if (write == current position on the ring buffer &&
after == before && after < ts) {
delta = ts - after
} else {
delta = 0;
}
The above will only use the write_stamp if it still matches before_stamp
and was tested to not have changed since C.
As a bonus, with this logic we do not need any 64-bit cmpxchg() at all!
This means the 32-bit rb_time_t workaround can finally be removed. But
that's for a later time.
Link: https://lore.kernel.org/linux-trace-kernel/20231218175229.58ec3daf@gandalf.…
Link: https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf.…
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Fixes: dd93942570789 ("ring-buffer: Do not try to put back write_stamp")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5a114e752f11..83eab547f1d1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -700,48 +700,6 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
return local_try_cmpxchg(l, &expect, set);
}
-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
-{
- unsigned long cnt, top, bottom, msb;
- unsigned long cnt2, top2, bottom2, msb2;
- u64 val;
-
- /* Any interruptions in this function should cause a failure */
- cnt = local_read(&t->cnt);
-
- /* The cmpxchg always fails if it interrupted an update */
- if (!__rb_time_read(t, &val, &cnt2))
- return false;
-
- if (val != expect)
- return false;
-
- if ((cnt & 3) != cnt2)
- return false;
-
- cnt2 = cnt + 1;
-
- rb_time_split(val, &top, &bottom, &msb);
- msb = rb_time_val_cnt(msb, cnt);
- top = rb_time_val_cnt(top, cnt);
- bottom = rb_time_val_cnt(bottom, cnt);
-
- rb_time_split(set, &top2, &bottom2, &msb2);
- msb2 = rb_time_val_cnt(msb2, cnt);
- top2 = rb_time_val_cnt(top2, cnt2);
- bottom2 = rb_time_val_cnt(bottom2, cnt2);
-
- if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
- return false;
- if (!rb_time_read_cmpxchg(&t->msb, msb, msb2))
- return false;
- if (!rb_time_read_cmpxchg(&t->top, top, top2))
- return false;
- if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
- return false;
- return true;
-}
-
#else /* 64 bits */
/* local64_t always succeeds */
@@ -755,11 +713,6 @@ static void rb_time_set(rb_time_t *t, u64 val)
{
local64_set(&t->time, val);
}
-
-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
-{
- return local64_try_cmpxchg(&t->time, &expect, set);
-}
#endif
/*
@@ -3610,20 +3563,36 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
} else {
u64 ts;
/* SLOW PATH - Interrupted between A and C */
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
- /* Was interrupted before here, write_stamp must be valid */
+
+ /* Save the old before_stamp */
+ a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
RB_WARN_ON(cpu_buffer, !a_ok);
+
+ /*
+ * Read a new timestamp and update the before_stamp to make
+ * the next event after this one force using an absolute
+ * timestamp. This is in case an interrupt were to come in
+ * between E and F.
+ */
ts = rb_time_stamp(cpu_buffer->buffer);
+ rb_time_set(&cpu_buffer->before_stamp, ts);
+
+ barrier();
+ /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ /* Was interrupted before here, write_stamp must be valid */
+ RB_WARN_ON(cpu_buffer, !a_ok);
barrier();
- /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
- info->after < ts &&
- rb_time_cmpxchg(&cpu_buffer->write_stamp,
- info->after, ts)) {
- /* Nothing came after this event between C and E */
+ /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+ info->after == info->before && info->after < ts) {
+ /*
+ * Nothing came after this event between C and F, it is
+ * safe to use info->after for the delta as it
+ * matched info->before and is still valid.
+ */
info->delta = ts - info->after;
} else {
/*
- * Interrupted between C and E:
+ * Interrupted between C and F:
* Lost the previous events time stamp. Just set the
* delta to zero, and this will be the same time as
* the event this event interrupted. And the events that
[View Less]