Unaccepted memory is considered unusable free memory, which is not
counted as free on the zone watermark check. This causes
get_page_from_freelist() to accept more memory to hit the high
watermark, but it creates problems in the reclaim path.
The reclaim path encounters a failed zone watermark check and attempts
to reclaim memory. This is usually successful, but if there is little or
no reclaimable memory, it can result in endless reclaim with little to
no progress. This can occur early in the boot process, just after start
of the init process when the only reclaimable memory is the page cache
of the init executable and its libraries.
Make unaccepted memory free from watermark check point of view. This way
unaccepted memory will never be the trigger of memory reclaim.
Accept more memory in the get_page_from_freelist() if needed.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Reported-by: Jianxiong Gao <jxgao(a)google.com>
Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory")
Cc: stable(a)vger.kernel.org # v6.5+
---
mm/page_alloc.c | 42 ++++++++++++++++++++----------------------
1 file changed, 20 insertions(+), 22 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 28f80daf5c04..aa9b1eaa638c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -287,7 +287,7 @@ EXPORT_SYMBOL(nr_online_nodes);
static bool page_contains_unaccepted(struct page *page, unsigned int order);
static void accept_page(struct page *page, unsigned int order);
-static bool try_to_accept_memory(struct zone *zone, unsigned int order);
+static bool cond_accept_memory(struct zone *zone, unsigned int order);
static inline bool has_unaccepted_memory(void);
static bool __free_unaccepted(struct page *page);
@@ -3072,9 +3072,6 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
if (!(alloc_flags & ALLOC_CMA))
unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
-#ifdef CONFIG_UNACCEPTED_MEMORY
- unusable_free += zone_page_state(z, NR_UNACCEPTED);
-#endif
return unusable_free;
}
@@ -3368,6 +3365,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
}
}
+ cond_accept_memory(zone, order);
+
/*
* Detect whether the number of free pages is below high
* watermark. If so, we will decrease pcp->high and free
@@ -3393,10 +3392,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
gfp_mask)) {
int ret;
- if (has_unaccepted_memory()) {
- if (try_to_accept_memory(zone, order))
- goto try_this_zone;
- }
+ if (cond_accept_memory(zone, order))
+ goto try_this_zone;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
@@ -3450,10 +3447,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
return page;
} else {
- if (has_unaccepted_memory()) {
- if (try_to_accept_memory(zone, order))
- goto try_this_zone;
- }
+ if (cond_accept_memory(zone, order))
+ goto try_this_zone;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
@@ -6951,9 +6946,6 @@ static bool try_to_accept_memory_one(struct zone *zone)
struct page *page;
bool last;
- if (list_empty(&zone->unaccepted_pages))
- return false;
-
spin_lock_irqsave(&zone->lock, flags);
page = list_first_entry_or_null(&zone->unaccepted_pages,
struct page, lru);
@@ -6979,23 +6971,29 @@ static bool try_to_accept_memory_one(struct zone *zone)
return true;
}
-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
long to_accept;
- int ret = false;
+ bool ret = false;
+
+ if (!has_unaccepted_memory())
+ return false;
+
+ if (list_empty(&zone->unaccepted_pages))
+ return false;
/* How much to accept to get to high watermark? */
to_accept = high_wmark_pages(zone) -
(zone_page_state(zone, NR_FREE_PAGES) -
- __zone_watermark_unusable_free(zone, order, 0));
+ __zone_watermark_unusable_free(zone, order, 0) -
+ zone_page_state(zone, NR_UNACCEPTED));
- /* Accept at least one page */
- do {
+ while (to_accept > 0) {
if (!try_to_accept_memory_one(zone))
break;
ret = true;
to_accept -= MAX_ORDER_NR_PAGES;
- } while (to_accept > 0);
+ }
return ret;
}
@@ -7038,7 +7036,7 @@ static void accept_page(struct page *page, unsigned int order)
{
}
-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
return false;
}
--
2.43.0
uevent_show() wants to de-reference dev->driver->name. There is no clean
way for a device attribute to de-reference dev->driver unless that
attribute is defined via (struct device_driver).dev_groups. Instead, the
anti-pattern of taking the device_lock() in the attribute handler risks
deadlocks with code paths that remove device attributes while holding
the lock.
This deadlock is typically invisible to lockdep given the device_lock()
is marked lockdep_set_novalidate_class(), but some subsystems allocate a
local lockdep key for @dev->mutex to reveal reports of the form:
======================================================
WARNING: possible circular locking dependency detected
6.10.0-rc7+ #275 Tainted: G OE N
------------------------------------------------------
modprobe/2374 is trying to acquire lock:
ffff8c2270070de0 (kn->active#6){++++}-{0:0}, at: __kernfs_remove+0xde/0x220
but task is already holding lock:
ffff8c22016e88f8 (&cxl_root_key){+.+.}-{3:3}, at: device_release_driver_internal+0x39/0x210
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (&cxl_root_key){+.+.}-{3:3}:
__mutex_lock+0x99/0xc30
uevent_show+0xac/0x130
dev_attr_show+0x18/0x40
sysfs_kf_seq_show+0xac/0xf0
seq_read_iter+0x110/0x450
vfs_read+0x25b/0x340
ksys_read+0x67/0xf0
do_syscall_64+0x75/0x190
entry_SYSCALL_64_after_hwframe+0x76/0x7e
-> #0 (kn->active#6){++++}-{0:0}:
__lock_acquire+0x121a/0x1fa0
lock_acquire+0xd6/0x2e0
kernfs_drain+0x1e9/0x200
__kernfs_remove+0xde/0x220
kernfs_remove_by_name_ns+0x5e/0xa0
device_del+0x168/0x410
device_unregister+0x13/0x60
devres_release_all+0xb8/0x110
device_unbind_cleanup+0xe/0x70
device_release_driver_internal+0x1c7/0x210
driver_detach+0x47/0x90
bus_remove_driver+0x6c/0xf0
cxl_acpi_exit+0xc/0x11 [cxl_acpi]
__do_sys_delete_module.isra.0+0x181/0x260
do_syscall_64+0x75/0x190
entry_SYSCALL_64_after_hwframe+0x76/0x7e
The observation though is that driver objects are typically much longer
lived than device objects. It is reasonable to perform lockless
de-reference of a @driver pointer even if it is racing detach from a
device. Given the infrequency of driver unregistration, use
synchronize_rcu() in module_remove_driver() to close any potential
races. It is potentially overkill to suffer synchronize_rcu() just to
handle the rare module removal racing uevent_show() event.
Thanks to Tetsuo Handa for the debug analysis of the syzbot report [1].
Fixes: c0a40097f0bc ("drivers: core: synchronize really_probe() and dev_uevent()")
Reported-by: syzbot+4762dd74e32532cda5ff(a)syzkaller.appspotmail.com
Reported-by: Tetsuo Handa <penguin-kernel(a)I-love.SAKURA.ne.jp>
Closes: http://lore.kernel.org/5aa5558f-90a4-4864-b1b1-5d6784c5607d@I-love.SAKURA.n… [1]
Link: http://lore.kernel.org/669073b8ea479_5fffa294c1@dwillia2-xfh.jf.intel.com.n…
Cc: stable(a)vger.kernel.org
Cc: Ashish Sangwan <a.sangwan(a)samsung.com>
Cc: Namjae Jeon <namjae.jeon(a)samsung.com>
Cc: Dirk Behme <dirk.behme(a)de.bosch.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael(a)kernel.org>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
---
drivers/base/core.c | 13 ++++++++-----
drivers/base/module.c | 4 ++++
2 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 2b4c0624b704..b5399262198a 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -25,6 +25,7 @@
#include <linux/mutex.h>
#include <linux/pm_runtime.h>
#include <linux/netdevice.h>
+#include <linux/rcupdate.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/string_helpers.h>
@@ -2640,6 +2641,7 @@ static const char *dev_uevent_name(const struct kobject *kobj)
static int dev_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
{
const struct device *dev = kobj_to_dev(kobj);
+ struct device_driver *driver;
int retval = 0;
/* add device node properties if present */
@@ -2668,8 +2670,12 @@ static int dev_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
if (dev->type && dev->type->name)
add_uevent_var(env, "DEVTYPE=%s", dev->type->name);
- if (dev->driver)
- add_uevent_var(env, "DRIVER=%s", dev->driver->name);
+ /* Synchronize with module_remove_driver() */
+ rcu_read_lock();
+ driver = READ_ONCE(dev->driver);
+ if (driver)
+ add_uevent_var(env, "DRIVER=%s", driver->name);
+ rcu_read_unlock();
/* Add common DT information about the device */
of_device_uevent(dev, env);
@@ -2739,11 +2745,8 @@ static ssize_t uevent_show(struct device *dev, struct device_attribute *attr,
if (!env)
return -ENOMEM;
- /* Synchronize with really_probe() */
- device_lock(dev);
/* let the kset specific function add its keys */
retval = kset->uevent_ops->uevent(&dev->kobj, env);
- device_unlock(dev);
if (retval)
goto out;
diff --git a/drivers/base/module.c b/drivers/base/module.c
index a1b55da07127..b0b79b9c189d 100644
--- a/drivers/base/module.c
+++ b/drivers/base/module.c
@@ -7,6 +7,7 @@
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/rcupdate.h>
#include "base.h"
static char *make_driver_name(struct device_driver *drv)
@@ -97,6 +98,9 @@ void module_remove_driver(struct device_driver *drv)
if (!drv)
return;
+ /* Synchronize with dev_uevent() */
+ synchronize_rcu();
+
sysfs_remove_link(&drv->p->kobj, "module");
if (drv->owner)
In _emif_get_id(), of_get_address() may return NULL which is later
dereferenced. Fix this bug by adding NULL check. of_translate_address() is
the same.
Found by code review.
Cc: stable(a)vger.kernel.org
Fixes: 86a18ee21e5e ("EDAC, ti: Add support for TI keystone and DRA7xx EDAC")
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
---
Changes in v4:
- added the check of of_translate_address() as suggestions.
Changes in v3:
- added the patch operations omitted in PATCH v2 RESEND compared to PATCH
v2. Sorry for my oversight.
Changes in v2:
- added Cc stable line.
---
drivers/edac/ti_edac.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/drivers/edac/ti_edac.c b/drivers/edac/ti_edac.c
index 29723c9592f7..f466f12630d3 100644
--- a/drivers/edac/ti_edac.c
+++ b/drivers/edac/ti_edac.c
@@ -207,14 +207,24 @@ static int _emif_get_id(struct device_node *node)
int my_id = 0;
addrp = of_get_address(node, 0, NULL, NULL);
+ if (!addrp)
+ return -EINVAL;
+
my_addr = (u32)of_translate_address(node, addrp);
+ if (my_addr == OF_BAD_ADDR)
+ return -EINVAL;
for_each_matching_node(np, ti_edac_of_match) {
if (np == node)
continue;
addrp = of_get_address(np, 0, NULL, NULL);
+ if (!addrp)
+ return -EINVAL;
+
addr = (u32)of_translate_address(np, addrp);
+ if (addr == OF_BAD_ADDR)
+ return -EINVAL;
edac_printk(KERN_INFO, EDAC_MOD_NAME,
"addr=%x, my_addr=%x\n",
--
2.25.1
Le 03/08/2024 à 16:55, Sasha Levin a écrit :
> This is a note to let you know that I've just added the patch titled
>
> ipv4: fix source address selection with route leak
>
> to the 5.15-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> ipv4-fix-source-address-selection-with-route-leak.patch
> and it can be found in the queue-5.15 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
I'm not sure I fully understand the process, but Greg already sent a mail
because this patch doesn't compile on the 5.15 stable branch.
I sent a backport:
https://lore.kernel.org/stable/20240802085305.2749750-1-nicolas.dichtel@6wi…
Regards,
Nicolas
>
>
>
> commit dfd009372d960dc1ccf694e7369d58e63cd133c4
> Author: Nicolas Dichtel <nicolas.dichtel(a)6wind.com>
> Date: Wed Jul 10 10:14:27 2024 +0200
>
> ipv4: fix source address selection with route leak
>
> [ Upstream commit 6807352353561187a718e87204458999dbcbba1b ]
>
> By default, an address assigned to the output interface is selected when
> the source address is not specified. This is problematic when a route,
> configured in a vrf, uses an interface from another vrf (aka route leak).
> The original vrf does not own the selected source address.
>
> Let's add a check against the output interface and call the appropriate
> function to select the source address.
>
> CC: stable(a)vger.kernel.org
> Fixes: 8cbb512c923d ("net: Add source address lookup op for VRF")
> Signed-off-by: Nicolas Dichtel <nicolas.dichtel(a)6wind.com>
> Reviewed-by: David Ahern <dsahern(a)kernel.org>
> Link: https://patch.msgid.link/20240710081521.3809742-2-nicolas.dichtel@6wind.com
> Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> index 3d00253afbb8d..4f1236458c214 100644
> --- a/net/ipv4/fib_semantics.c
> +++ b/net/ipv4/fib_semantics.c
> @@ -2286,6 +2286,15 @@ void fib_select_path(struct net *net, struct fib_result *res,
> fib_select_default(fl4, res);
>
> check_saddr:
> - if (!fl4->saddr)
> - fl4->saddr = fib_result_prefsrc(net, res);
> + if (!fl4->saddr) {
> + struct net_device *l3mdev;
> +
> + l3mdev = dev_get_by_index_rcu(net, fl4->flowi4_l3mdev);
> +
> + if (!l3mdev ||
> + l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) == l3mdev)
> + fl4->saddr = fib_result_prefsrc(net, res);
> + else
> + fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK);
> + }
> }
Hi Greg,
> Andi Shyti (2):
> drm/i915/gem: Adjust vma offset for framebuffer mmap offset
> drm/i915/gem: Fix Virtual Memory mapping boundaries calculation
I have forgotten to Cc the stable mailing list here. These two
patches need to be merged together even if only the second patch
has the "Fixes:" tag.
Is there anything I should still do here?
I could have used the "Requires:" tag, but the commit id would
change in between merges and rebases.
Andi
On 05/08/2024 13:19, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> net: move ethtool-related netdev state into its own struct
>
> to the 6.10-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> net-move-ethtool-related-netdev-state-into-its-own-s.patch
> and it can be found in the queue-6.10 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
This, and the series it's from, are absolutely not -stable material.
The commits do not fix any existing bugs, they are in support of new
features (netlink dumping of RSS contexts), and are a fairly large
and complex set of changes, which have not even stabilised yet — we
have already found issues both within the set and exposed by it in
other code, which are being fixed for 6.11.
> commit e331e73ff4c5c89a7f51a465ae40a7ad9fcd7a28
> Author: Edward Cree <ecree.xilinx(a)gmail.com>
> Date: Thu Jun 27 16:33:46 2024 +0100
>
> net: move ethtool-related netdev state into its own struct
>
> [ Upstream commit 3ebbd9f6de7ec6d538639ebb657246f629ace81e ]
>
> net_dev->ethtool is a pointer to new struct ethtool_netdev_state, which
> currently contains only the wol_enabled field.
>
> Suggested-by: Jakub Kicinski <kuba(a)kernel.org>
> Signed-off-by: Edward Cree <ecree.xilinx(a)gmail.com>
> Reviewed-by: Przemek Kitszel <przemyslaw.kitszel(a)intel.com>
> Link: https://patch.msgid.link/293a562278371de7534ed1eb17531838ca090633.171950223…
> Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
> Stable-dep-of: 7195f0ef7f5b ("ethtool: fix setting key and resetting indir at once")
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
As far as I can tell, 7195f0ef7f5b should backport fairly cleanly
to 6.10 with only simple textual fuzz.
It should not be necessary to backport the "ethtool: track custom
RSS contexts in the core" series to support this.
The above NAK also applies to the backports of:
net-ethtool-attach-an-xarray-of-custom-rss-contexts-.patch
net-ethtool-record-custom-rss-contexts-in-the-xarray.patch
net-ethtool-add-a-mutex-protecting-rss-contexts.patch
which were notified at the same time.
-ed
The result of multiplication between values derived from functions
dir_buckets() and bucket_blocks() *could* technically reach
2^30 * 2^2 = 2^32.
While unlikely to happen, it is prudent to ensure that it will not
lead to integer overflow. Thus, use mul_u32_u32() as it's more
appropriate to mitigate the issue.
Found by Linux Verification Center (linuxtesting.org) with static
analysis tool SVACE.
Fixes: 3843154598a0 ("f2fs: introduce large directory support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Nikita Zhandarovich <n.zhandarovich(a)fintech.ru>
---
fs/f2fs/dir.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index cbd7a5e96a37..14900ca8a9ff 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -166,7 +166,8 @@ static unsigned long dir_block_index(unsigned int level,
unsigned long bidx = 0;
for (i = 0; i < level; i++)
- bidx += dir_buckets(i, dir_level) * bucket_blocks(i);
+ bidx += mul_u32_u32(dir_buckets(i, dir_level),
+ bucket_blocks(i));
bidx += idx * bucket_blocks(level);
return bidx;
}