Just like cow_file_range(), from day 1 btrfs doesn't really clean the
dirty flags, if it has an ordered extent created successfully.
Per error handling protocol (according to the iomap, and the btrfs
handling if it failed at the beginning of the range), we should clear
all dirty flags for the involved folios.
Or the range of that folio will still be marked dirty, but has no
EXTENT_DEALLLOC set inside the io tree.
Since the folio range is still dirty, it will still be the target for
the next writeback, but since there is no EXTENT_DEALLLOC, no new
ordered extent will be created for it.
This means the writeback of that folio range will fall back to COW
fixup, which is being marked deprecated and will trigger a crash.
Unlike the fix in cow_file_range(), which holds the folio and extent
lock until error or a fully successfully run, here we have no such luxury
as we can fallback to COW, and in that case the extent/folio range will
be unlocked by cow_file_range().
So here we introduce a new helper, cleanup_dirty_folios(), to clear the
dirty flags for the involved folios.
Cc: stable(a)vger.kernel.org
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
---
fs/btrfs/inode.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 56 insertions(+)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e8232ac7917f..19e1b78508bd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1969,6 +1969,46 @@ static int can_nocow_file_extent(struct btrfs_path *path,
return ret < 0 ? ret : can_nocow;
}
+static void cleanup_dirty_folios(struct btrfs_inode *inode,
+ struct folio *locked_folio,
+ u64 start, u64 end, int error)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t end_index = end >> PAGE_SHIFT;
+ u32 len;
+
+ ASSERT(end + 1 - start < U32_MAX);
+ len = end + 1 - start;
+
+ /*
+ * Handle the locked folio first.
+ * btrfs_folio_clamp_*() helpers can handle range out of the folio case.
+ */
+ btrfs_folio_clamp_clear_dirty(fs_info, locked_folio, start, len);
+ btrfs_folio_clamp_set_writeback(fs_info, locked_folio, start, len);
+ btrfs_folio_clamp_clear_writeback(fs_info, locked_folio, start, len);
+
+ for (pgoff_t index = start_index; index <= end_index; index++) {
+ struct folio *folio;
+
+ /* Already handled at the beginning. */
+ if (index == locked_folio->index)
+ continue;
+ folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
+ /* Cache already dropped, no need to do any cleanup. */
+ if (IS_ERR(folio))
+ continue;
+ btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);
+ btrfs_folio_clamp_set_writeback(fs_info, folio, start, len);
+ btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ mapping_set_error(mapping, error);
+}
+
/*
* when nowcow writeback call back. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
@@ -2228,6 +2268,22 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
return 0;
error:
+ /*
+ * We have some range with ordered extent created.
+ *
+ * Ordered extents and extent maps will be cleaned up by
+ * btrfs_mark_ordered_io_finished() later, but we also need to cleanup
+ * the dirty flags of folios.
+ *
+ * Or they can be written back again, but without any EXTENT_DELALLOC flag
+ * in io tree.
+ * This will force the writeback to go COW fixup, which is being deprecated.
+ *
+ * Also such left-over dirty flags do no follow the error handling protocol.
+ */
+ if (cur_offset > start)
+ cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
+
/*
* If an error happened while a COW region is outstanding, cur_offset
* needs to be reset to cow_start to ensure the COW region is unlocked
--
2.47.1
Just like cow_file_range(), from day 1 btrfs doesn't really clean the
dirty flags, if it has an ordered extent created successfully.
Per error handling protocol (according to the iomap, and the btrfs
handling if it failed at the beginning of the range), we should clear
all dirty flags for the involved folios.
Or the range of that folio will still be marked dirty, but has no
EXTENT_DEALLLOC set inside the io tree.
Since the folio range is still dirty, it will still be the target for
the next writeback, but since there is no EXTENT_DEALLLOC, no new
ordered extent will be created for it.
This means the writeback of that folio range will fall back to COW
fixup, which is being marked deprecated and will trigger a crash.
Unlike the fix in cow_file_range(), which holds the folio and extent
lock until error or a fully successfully run, here we have no such luxury
as we can fallback to COW, and in that case the extent/folio range will
be unlocked by cow_file_range().
So here we introduce a new helper, cleanup_dirty_folios(), to clear the
dirty flags for the involved folios.
Cc: stable(a)vger.kernel.org
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
---
fs/btrfs/inode.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 56 insertions(+)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e8232ac7917f..19e1b78508bd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1969,6 +1969,46 @@ static int can_nocow_file_extent(struct btrfs_path *path,
return ret < 0 ? ret : can_nocow;
}
+static void cleanup_dirty_folios(struct btrfs_inode *inode,
+ struct folio *locked_folio,
+ u64 start, u64 end, int error)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ pgoff_t start_index = start >> PAGE_SHIFT;
+ pgoff_t end_index = end >> PAGE_SHIFT;
+ u32 len;
+
+ ASSERT(end + 1 - start < U32_MAX);
+ len = end + 1 - start;
+
+ /*
+ * Handle the locked folio first.
+ * btrfs_folio_clamp_*() helpers can handle range out of the folio case.
+ */
+ btrfs_folio_clamp_clear_dirty(fs_info, locked_folio, start, len);
+ btrfs_folio_clamp_set_writeback(fs_info, locked_folio, start, len);
+ btrfs_folio_clamp_clear_writeback(fs_info, locked_folio, start, len);
+
+ for (pgoff_t index = start_index; index <= end_index; index++) {
+ struct folio *folio;
+
+ /* Already handled at the beginning. */
+ if (index == locked_folio->index)
+ continue;
+ folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
+ /* Cache already dropped, no need to do any cleanup. */
+ if (IS_ERR(folio))
+ continue;
+ btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);
+ btrfs_folio_clamp_set_writeback(fs_info, folio, start, len);
+ btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ mapping_set_error(mapping, error);
+}
+
/*
* when nowcow writeback call back. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
@@ -2228,6 +2268,22 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
return 0;
error:
+ /*
+ * We have some range with ordered extent created.
+ *
+ * Ordered extents and extent maps will be cleaned up by
+ * btrfs_mark_ordered_io_finished() later, but we also need to cleanup
+ * the dirty flags of folios.
+ *
+ * Or they can be written back again, but without any EXTENT_DELALLOC flag
+ * in io tree.
+ * This will force the writeback to go COW fixup, which is being deprecated.
+ *
+ * Also such left-over dirty flags do no follow the error handling protocol.
+ */
+ if (cur_offset > start)
+ cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
+
/*
* If an error happened while a COW region is outstanding, cur_offset
* needs to be reset to cow_start to ensure the COW region is unlocked
--
2.47.1
The function atomctrl_get_smc_sclk_range_table() does not check the return
value of smu_atom_get_data_table(). If smu_atom_get_data_table() fails to
retrieve SMU_Info table, it returns NULL which is later dereferenced.
Found by Linux Verification Center (linuxtesting.org) with SVACE.
Fixes: a23eefa2f461 ("drm/amd/powerplay: enable dpm for baffin.")
Cc: stable(a)vger.kernel.org
Signed-off-by: Ivan Stepchenko <sid(a)itb.spb.ru>
---
drivers/gpu/drm/amd/pm/powerplay/hwmgr/ppatomctrl.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/ppatomctrl.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/ppatomctrl.c
index fe24219c3bf4..4bd92fd782be 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/ppatomctrl.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/ppatomctrl.c
@@ -992,6 +992,8 @@ int atomctrl_get_smc_sclk_range_table(struct pp_hwmgr *hwmgr, struct pp_atom_ctr
GetIndexIntoMasterTable(DATA, SMU_Info),
&size, &frev, &crev);
+ if (!psmu_info)
+ return -EINVAL;
for (i = 0; i < psmu_info->ucSclkEntryNum; i++) {
table->entry[i].ucVco_setting = psmu_info->asSclkFcwRangeEntry[i].ucVco_setting;
--
2.34.1
Hello,
I am experiencing a significant performance degradation after
upgrading my kernel from version 6.6 to 6.8 and would appreciate any
insights or suggestions.
I am running a high-load simulation system that spawns more than 1000
threads and the overall CPU usage is 30%+ . Most of the threads are
using real-time
scheduling (SCHED_RR), and the threads of a model are using
SCHED_DEADLINE. After upgrading the kernel, I noticed that the
execution time of my model has increased from 4.5ms to 6ms.
What I Have Done So Far:
1. I found this [bug
report](https://bugzilla.kernel.org/show_bug.cgi?id=219366#c7) and
reverted the commit efa7df3e3bb5da8e6abbe37727417f32a37fba47 mentioned
in the post. Unfortunately, this did not resolve the issue.
2. I performed a git bisect and found that after these two commits
related to scheduling (RT and deadline) were merged, the problem
happened. They are 612f769edd06a6e42f7cd72425488e68ddaeef0a,
5fe7765997b139e2d922b58359dea181efe618f9
After reverting these two commits, the model execution time improved
to around 5 ms.
3. I revert two more commits, and the execution time is back to 4.7ms:
63ba8422f876e32ee564ea95da9a7313b13ff0a1,
efa7df3e3bb5da8e6abbe37727417f32a37fba47
My questions are:
1.Has anyone else experienced similar performance degradation after
upgrading to kernel 6.8?
2.Can anyone explain why these two commits are causing the problem? I
am not very familiar with the kernel code and would appreciate any
insights.
3.Are there any additional settings or configurations I need to apply
when using kernel 6.8 to avoid this issue?
My CPU is AMD Ryzen Threadripper 3970X, and my desktop uses Ubuntu 24.04.
Thanks again for any insights or suggestions. Please let me know if
any other information is needed.
Best,
Chenbo
--
This email and any relevant attachments may include confidential and/or
proprietary information. Any distribution or use by anyone other than the
intended recipient(s) or other than for the intended purpose(s) is
prohibited and may be unlawful. If you are not the intended recipient of
this message, please notify the sender by replying to this message and then
delete it from your system.
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x f06e108a3dc53c0f5234d18de0bd224753db5019
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024120221-gizzard-thermos-ba19@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f06e108a3dc53c0f5234d18de0bd224753db5019 Mon Sep 17 00:00:00 2001
From: Jan Hendrik Farr <kernel(a)jfarr.cc>
Date: Tue, 29 Oct 2024 15:00:36 +0100
Subject: [PATCH] Compiler Attributes: disable __counted_by for clang < 19.1.3
This patch disables __counted_by for clang versions < 19.1.3 because
of the two issues listed below. It does this by introducing
CONFIG_CC_HAS_COUNTED_BY.
1. clang < 19.1.2 has a bug that can lead to __bdos returning 0:
https://github.com/llvm/llvm-project/pull/110497
2. clang < 19.1.3 has a bug that can lead to __bdos being off by 4:
https://github.com/llvm/llvm-project/pull/112636
Fixes: c8248faf3ca2 ("Compiler Attributes: counted_by: Adjust name and identifier expansion")
Cc: stable(a)vger.kernel.org # 6.6.x: 16c31dd7fdf6: Compiler Attributes: counted_by: bump min gcc version
Cc: stable(a)vger.kernel.org # 6.6.x: 2993eb7a8d34: Compiler Attributes: counted_by: fixup clang URL
Cc: stable(a)vger.kernel.org # 6.6.x: 231dc3f0c936: lkdtm/bugs: Improve warning message for compilers without counted_by support
Cc: stable(a)vger.kernel.org # 6.6.x
Reported-by: Nathan Chancellor <nathan(a)kernel.org>
Closes: https://lore.kernel.org/all/20240913164630.GA4091534@thelio-3990X/
Reported-by: kernel test robot <oliver.sang(a)intel.com>
Closes: https://lore.kernel.org/oe-lkp/202409260949.a1254989-oliver.sang@intel.com
Link: https://lore.kernel.org/all/Zw8iawAF5W2uzGuh@archlinux/T/#m204c09f63c076586…
Suggested-by: Nathan Chancellor <nathan(a)kernel.org>
Signed-off-by: Jan Hendrik Farr <kernel(a)jfarr.cc>
Reviewed-by: Nathan Chancellor <nathan(a)kernel.org>
Tested-by: Nathan Chancellor <nathan(a)kernel.org>
Reviewed-by: Miguel Ojeda <ojeda(a)kernel.org>
Reviewed-by: Thorsten Blum <thorsten.blum(a)linux.dev>
Link: https://lore.kernel.org/r/20241029140036.577804-2-kernel@jfarr.cc
Signed-off-by: Kees Cook <kees(a)kernel.org>
diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c
index 62ba01525479..376047beea3d 100644
--- a/drivers/misc/lkdtm/bugs.c
+++ b/drivers/misc/lkdtm/bugs.c
@@ -445,7 +445,7 @@ static void lkdtm_FAM_BOUNDS(void)
pr_err("FAIL: survived access of invalid flexible array member index!\n");
- if (!__has_attribute(__counted_by__))
+ if (!IS_ENABLED(CONFIG_CC_HAS_COUNTED_BY))
pr_warn("This is expected since this %s was built with a compiler that does not support __counted_by\n",
lkdtm_kernel_info);
else if (IS_ENABLED(CONFIG_UBSAN_BOUNDS))
diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index 32284cd26d52..c16d4199bf92 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -94,19 +94,6 @@
# define __copy(symbol)
#endif
-/*
- * Optional: only supported since gcc >= 15
- * Optional: only supported since clang >= 18
- *
- * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896
- * clang: https://github.com/llvm/llvm-project/pull/76348
- */
-#if __has_attribute(__counted_by__)
-# define __counted_by(member) __attribute__((__counted_by__(member)))
-#else
-# define __counted_by(member)
-#endif
-
/*
* Optional: not supported by gcc
* Optional: only supported since clang >= 14.0
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 1a957ea2f4fe..639be0f30b45 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -323,6 +323,25 @@ struct ftrace_likely_data {
#define __no_sanitize_or_inline __always_inline
#endif
+/*
+ * Optional: only supported since gcc >= 15
+ * Optional: only supported since clang >= 18
+ *
+ * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896
+ * clang: https://github.com/llvm/llvm-project/pull/76348
+ *
+ * __bdos on clang < 19.1.2 can erroneously return 0:
+ * https://github.com/llvm/llvm-project/pull/110497
+ *
+ * __bdos on clang < 19.1.3 can be off by 4:
+ * https://github.com/llvm/llvm-project/pull/112636
+ */
+#ifdef CONFIG_CC_HAS_COUNTED_BY
+# define __counted_by(member) __attribute__((__counted_by__(member)))
+#else
+# define __counted_by(member)
+#endif
+
/*
* Apply __counted_by() when the Endianness matches to increase test coverage.
*/
diff --git a/init/Kconfig b/init/Kconfig
index 530a382ee0fe..92f106cf5572 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -116,6 +116,15 @@ config CC_HAS_ASM_INLINE
config CC_HAS_NO_PROFILE_FN_ATTR
def_bool $(success,echo '__attribute__((no_profile_instrument_function)) int x();' | $(CC) -x c - -c -o /dev/null -Werror)
+config CC_HAS_COUNTED_BY
+ # TODO: when gcc 15 is released remove the build test and add
+ # a gcc version check
+ def_bool $(success,echo 'struct flex { int count; int array[] __attribute__((__counted_by__(count))); };' | $(CC) $(CLANG_FLAGS) -x c - -c -o /dev/null -Werror)
+ # clang needs to be at least 19.1.3 to avoid __bdos miscalculations
+ # https://github.com/llvm/llvm-project/pull/110497
+ # https://github.com/llvm/llvm-project/pull/112636
+ depends on !(CC_IS_CLANG && CLANG_VERSION < 190103)
+
config PAHOLE_VERSION
int
default $(shell,$(srctree)/scripts/pahole-version.sh $(PAHOLE))
diff --git a/lib/overflow_kunit.c b/lib/overflow_kunit.c
index 2abc78367dd1..5222c6393f11 100644
--- a/lib/overflow_kunit.c
+++ b/lib/overflow_kunit.c
@@ -1187,7 +1187,7 @@ static void DEFINE_FLEX_test(struct kunit *test)
{
/* Using _RAW_ on a __counted_by struct will initialize "counter" to zero */
DEFINE_RAW_FLEX(struct foo, two_but_zero, array, 2);
-#if __has_attribute(__counted_by__)
+#ifdef CONFIG_CC_HAS_COUNTED_BY
int expected_raw_size = sizeof(struct foo);
#else
int expected_raw_size = sizeof(struct foo) + 2 * sizeof(s16);
USB5744 SMBus initialization is done once in probe() and doing it in resume
is not supported so avoid going into suspend and reset the HUB.
There is a sysfs property 'always_powered_in_suspend' to implement this
feature but since default state should be set to a working configuration
so override this property value.
It fixes the suspend/resume testcase on Kria KR260 Robotics Starter Kit.
Fixes: 6782311d04df ("usb: misc: onboard_usb_dev: add Microchip usb5744 SMBus programming support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Radhey Shyam Pandey <radhey.shyam.pandey(a)amd.com>
---
drivers/usb/misc/onboard_usb_dev.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/misc/onboard_usb_dev.c b/drivers/usb/misc/onboard_usb_dev.c
index 36b11127280f..75ac3c6aa92d 100644
--- a/drivers/usb/misc/onboard_usb_dev.c
+++ b/drivers/usb/misc/onboard_usb_dev.c
@@ -407,8 +407,10 @@ static int onboard_dev_probe(struct platform_device *pdev)
}
if (of_device_is_compatible(pdev->dev.of_node, "usb424,2744") ||
- of_device_is_compatible(pdev->dev.of_node, "usb424,5744"))
+ of_device_is_compatible(pdev->dev.of_node, "usb424,5744")) {
err = onboard_dev_5744_i2c_init(client);
+ onboard_dev->always_powered_in_suspend = true;
+ }
put_device(&client->dev);
if (err < 0)
--
2.34.1
From: Neal Frager <neal.frager(a)amd.com>
When the USB3 PHY is not defined in the Linux device tree, there could
still be a case where there is a USB3 PHY active on the board and enabled
by the first stage bootloader. If serdes clock is being used then the USB
will fail to enumerate devices in 2.0 only mode.
To solve this, make sure that the PIPE clock is deselected whenever the
USB3 PHY is not defined and guarantees that the USB2 only mode will work
in all cases.
Fixes: 9678f3361afc ("usb: dwc3: xilinx: Skip resets and USB3 register settings for USB2.0 mode")
Cc: stable(a)vger.kernel.org
Signed-off-by: Neal Frager <neal.frager(a)amd.com>
Signed-off-by: Radhey Shyam Pandey <radhey.shyam.pandey(a)amd.com>
Acked-by: Peter Korsgaard <peter(a)korsgaard.com>
---
Changes for v3:
- Modify commit description to drop second "is" and add Peter ack.
Changes for v2:
- Add stable(a)vger.kernel.org in CC.
---
drivers/usb/dwc3/dwc3-xilinx.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/dwc3/dwc3-xilinx.c b/drivers/usb/dwc3/dwc3-xilinx.c
index e3738e1610db..a33a42ba0249 100644
--- a/drivers/usb/dwc3/dwc3-xilinx.c
+++ b/drivers/usb/dwc3/dwc3-xilinx.c
@@ -121,8 +121,11 @@ static int dwc3_xlnx_init_zynqmp(struct dwc3_xlnx *priv_data)
* in use but the usb3-phy entry is missing from the device tree.
* Therefore, skip these operations in this case.
*/
- if (!priv_data->usb3_phy)
+ if (!priv_data->usb3_phy) {
+ /* Deselect the PIPE Clock Select bit in FPD PIPE Clock register */
+ writel(PIPE_CLK_DESELECT, priv_data->regs + XLNX_USB_FPD_PIPE_CLK);
goto skip_usb3_phy;
+ }
crst = devm_reset_control_get_exclusive(dev, "usb_crst");
if (IS_ERR(crst)) {
--
2.34.1
From: Florian Westphal <fw(a)strlen.de>
commit 18685451fc4e546fc0e718580d32df3c0e5c8272 upstream.
ip_local_out() and other functions can pass skb->sk as function argument.
If the skb is a fragment and reassembly happens before such function call
returns, the sk must not be released.
This affects skb fragments reassembled via netfilter or similar
modules, e.g. openvswitch or ct_act.c, when run as part of tx pipeline.
Eric Dumazet made an initial analysis of this bug. Quoting Eric:
Calling ip_defrag() in output path is also implying skb_orphan(),
which is buggy because output path relies on sk not disappearing.
A relevant old patch about the issue was :
8282f27449bf ("inet: frag: Always orphan skbs inside ip_defrag()")
[..]
net/ipv4/ip_output.c depends on skb->sk being set, and probably to an
inet socket, not an arbitrary one.
If we orphan the packet in ipvlan, then downstream things like FQ
packet scheduler will not work properly.
We need to change ip_defrag() to only use skb_orphan() when really
needed, ie whenever frag_list is going to be used.
Eric suggested to stash sk in fragment queue and made an initial patch.
However there is a problem with this:
If skb is refragmented again right after, ip_do_fragment() will copy
head->sk to the new fragments, and sets up destructor to sock_wfree.
IOW, we have no choice but to fix up sk_wmem accouting to reflect the
fully reassembled skb, else wmem will underflow.
This change moves the orphan down into the core, to last possible moment.
As ip_defrag_offset is aliased with sk_buff->sk member, we must move the
offset into the FRAG_CB, else skb->sk gets clobbered.
This allows to delay the orphaning long enough to learn if the skb has
to be queued or if the skb is completing the reasm queue.
In the former case, things work as before, skb is orphaned. This is
safe because skb gets queued/stolen and won't continue past reasm engine.
In the latter case, we will steal the skb->sk reference, reattach it to
the head skb, and fix up wmem accouting when inet_frag inflates truesize.
Fixes: 7026b1ddb6b8 ("netfilter: Pass socket pointer down through okfn().")
Diagnosed-by: Eric Dumazet <edumazet(a)google.com>
Reported-by: xingwei lee <xrivendell7(a)gmail.com>
Reported-by: yue sun <samsun1006219(a)gmail.com>
Reported-by: syzbot+e5167d7144a62715044c(a)syzkaller.appspotmail.com
Signed-off-by: Florian Westphal <fw(a)strlen.de>
Reviewed-by: Eric Dumazet <edumazet(a)google.com>
Link: https://lore.kernel.org/r/20240326101845.30836-1-fw@strlen.de
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
Signed-off-by: Saeed Mirzamohammadi <saeed.mirzamohammadi(a)oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
(cherry picked from commit 1b6de5e6575b56502665c65cf93b0ae6aa0f51ab)
Cc: <stable(a)vger.kernel.org> # 4.19+
Signed-off-by: Saeed Mirzamohammadi <saeed.mirzamohammadi(a)oracle.com>
---
include/linux/skbuff.h | 5 +-
net/core/sock_destructor.h | 12 +++++
net/ipv4/inet_fragment.c | 70 ++++++++++++++++++++-----
net/ipv4/ip_fragment.c | 2 +-
net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +-
5 files changed, 72 insertions(+), 19 deletions(-)
create mode 100644 net/core/sock_destructor.h
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f97734f34746..f5f76a04fdac 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -682,10 +682,7 @@ struct sk_buff {
struct list_head list;
};
- union {
- struct sock *sk;
- int ip_defrag_offset;
- };
+ struct sock *sk;
union {
ktime_t tstamp;
diff --git a/net/core/sock_destructor.h b/net/core/sock_destructor.h
new file mode 100644
index 000000000000..2f396e6bfba5
--- /dev/null
+++ b/net/core/sock_destructor.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_CORE_SOCK_DESTRUCTOR_H
+#define _NET_CORE_SOCK_DESTRUCTOR_H
+#include <net/tcp.h>
+
+static inline bool is_skb_wmem(const struct sk_buff *skb)
+{
+ return skb->destructor == sock_wfree ||
+ skb->destructor == __sock_wfree ||
+ (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree);
+}
+#endif
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 9f69411251d0..9144c3cf984c 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -28,6 +28,8 @@
#include <net/ip.h>
#include <net/ipv6.h>
+#include "../core/sock_destructor.h"
+
/* Use skb->cb to track consecutive/adjacent fragments coming at
* the end of the queue. Nodes in the rb-tree queue will
* contain "runs" of one or more adjacent fragments.
@@ -43,6 +45,7 @@ struct ipfrag_skb_cb {
};
struct sk_buff *next_frag;
int frag_run_len;
+ int ip_defrag_offset;
};
#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
@@ -319,12 +322,12 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
*/
if (!last)
fragrun_create(q, skb); /* First fragment. */
- else if (last->ip_defrag_offset + last->len < end) {
+ else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) {
/* This is the common case: skb goes to the end. */
/* Detect and discard overlaps. */
- if (offset < last->ip_defrag_offset + last->len)
+ if (offset < FRAG_CB(last)->ip_defrag_offset + last->len)
return IPFRAG_OVERLAP;
- if (offset == last->ip_defrag_offset + last->len)
+ if (offset == FRAG_CB(last)->ip_defrag_offset + last->len)
fragrun_append_to_last(q, skb);
else
fragrun_create(q, skb);
@@ -341,13 +344,13 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
parent = *rbn;
curr = rb_to_skb(parent);
- curr_run_end = curr->ip_defrag_offset +
+ curr_run_end = FRAG_CB(curr)->ip_defrag_offset +
FRAG_CB(curr)->frag_run_len;
- if (end <= curr->ip_defrag_offset)
+ if (end <= FRAG_CB(curr)->ip_defrag_offset)
rbn = &parent->rb_left;
else if (offset >= curr_run_end)
rbn = &parent->rb_right;
- else if (offset >= curr->ip_defrag_offset &&
+ else if (offset >= FRAG_CB(curr)->ip_defrag_offset &&
end <= curr_run_end)
return IPFRAG_DUP;
else
@@ -361,7 +364,7 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
rb_insert_color(&skb->rbnode, &q->rb_fragments);
}
- skb->ip_defrag_offset = offset;
+ FRAG_CB(skb)->ip_defrag_offset = offset;
return IPFRAG_OK;
}
@@ -371,13 +374,28 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
struct sk_buff *parent)
{
struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
- struct sk_buff **nextp;
+ void (*destructor)(struct sk_buff *);
+ unsigned int orig_truesize = 0;
+ struct sk_buff **nextp = NULL;
+ struct sock *sk = skb->sk;
int delta;
+ if (sk && is_skb_wmem(skb)) {
+ /* TX: skb->sk might have been passed as argument to
+ * dst->output and must remain valid until tx completes.
+ *
+ * Move sk to reassembled skb and fix up wmem accounting.
+ */
+ orig_truesize = skb->truesize;
+ destructor = skb->destructor;
+ }
+
if (head != skb) {
fp = skb_clone(skb, GFP_ATOMIC);
- if (!fp)
- return NULL;
+ if (!fp) {
+ head = skb;
+ goto out_restore_sk;
+ }
FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
if (RB_EMPTY_NODE(&skb->rbnode))
FRAG_CB(parent)->next_frag = fp;
@@ -386,6 +404,12 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
&q->rb_fragments);
if (q->fragments_tail == skb)
q->fragments_tail = fp;
+
+ if (orig_truesize) {
+ /* prevent skb_morph from releasing sk */
+ skb->sk = NULL;
+ skb->destructor = NULL;
+ }
skb_morph(skb, head);
FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
rb_replace_node(&head->rbnode, &skb->rbnode,
@@ -393,13 +417,13 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
consume_skb(head);
head = skb;
}
- WARN_ON(head->ip_defrag_offset != 0);
+ WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0);
delta = -head->truesize;
/* Head of list must not be cloned. */
if (skb_unclone(head, GFP_ATOMIC))
- return NULL;
+ goto out_restore_sk;
delta += head->truesize;
if (delta)
@@ -415,7 +439,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
clone = alloc_skb(0, GFP_ATOMIC);
if (!clone)
- return NULL;
+ goto out_restore_sk;
skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
skb_frag_list_init(head);
for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
@@ -432,6 +456,21 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
nextp = &skb_shinfo(head)->frag_list;
}
+out_restore_sk:
+ if (orig_truesize) {
+ int ts_delta = head->truesize - orig_truesize;
+
+ /* if this reassembled skb is fragmented later,
+ * fraglist skbs will get skb->sk assigned from head->sk,
+ * and each frag skb will be released via sock_wfree.
+ *
+ * Update sk_wmem_alloc.
+ */
+ head->sk = sk;
+ head->destructor = destructor;
+ refcount_add(ts_delta, &sk->sk_wmem_alloc);
+ }
+
return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);
@@ -439,6 +478,8 @@ EXPORT_SYMBOL(inet_frag_reasm_prepare);
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
void *reasm_data)
{
+ struct sock *sk = is_skb_wmem(head) ? head->sk : NULL;
+ const unsigned int head_truesize = head->truesize;
struct sk_buff **nextp = (struct sk_buff **)reasm_data;
struct rb_node *rbn;
struct sk_buff *fp;
@@ -484,6 +525,9 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
skb_mark_not_on_list(head);
head->prev = NULL;
head->tstamp = q->stamp;
+
+ if (sk)
+ refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(inet_frag_reasm_finish);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 5a1d39e32196..84544e5df7fc 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -380,6 +380,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
}
skb_dst_drop(skb);
+ skb_orphan(skb);
return -EINPROGRESS;
insert_error:
@@ -477,7 +478,6 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
struct ipq *qp;
__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
- skb_orphan(skb);
/* Lookup (or create) queue header */
qp = ip_find(net, ip_hdr(skb), user, vif);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 35d5a76867d0..8aab62c330ef 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -307,6 +307,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
}
skb_dst_drop(skb);
+ skb_orphan(skb);
return -EINPROGRESS;
insert_error:
@@ -473,7 +474,6 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
hdr = ipv6_hdr(skb);
fhdr = (struct frag_hdr *)skb_transport_header(skb);
- skb_orphan(skb);
fq = fq_find(net, fhdr->identification, user, hdr,
skb->dev ? skb->dev->ifindex : 0);
if (fq == NULL) {
--
2.46.0