The si->lock must be held when deleting the si from
the available list. Otherwise, another thread can
re-add the si to the available list, which can lead
to memory corruption. The only place we have found
where this happens is in the swapoff path. This case
can be described as below:
core 0 core 1
swapoff
del_from_avail_list(si) waiting
try lock si->lock acquire swap_avail_lock
and re-add si into
swap_avail_head
acquire si->lock but
missing si already be
added again, and continuing
to clear SWP_WRITEOK, etc.
It can be easily found a massive warning messages can
be triggered inside get_swap_pages() by some special
cases, for example, we call madvise(MADV_PAGEOUT) on
blocks of touched memory concurrently, meanwhile, run
much swapon-swapoff operations (e.g. stress-ng-swap).
However, in the worst case, panic can be caused by the
above scene. In swapoff(), the memory used by si could
be kept in swap_info[] after turning off a swap. This
means memory corruption will not be caused immediately
until allocated and reset for a new swap in the swapon
path. A panic message caused:
(with CONFIG_PLIST_DEBUG enabled)
------------[ cut here ]------------
top: 00000000e58a3003, n: 0000000013e75cda, p: 000000008cd4451a
prev: 0000000035b1e58a, n: 000000008cd4451a, p: 000000002150ee8d
next: 000000008cd4451a, n: 000000008cd4451a, p: 000000008cd4451a
WARNING: CPU: 21 PID: 1843 at lib/plist.c:60 plist_check_prev_next_node+0x50/0x70
Modules linked in: rfkill(E) crct10dif_ce(E)...
CPU: 21 PID: 1843 Comm: stress-ng Kdump: ... 5.10.134+
Hardware name: Alibaba Cloud ECS, BIOS 0.0.0 02/06/2015
pstate: 60400005 (nZCv daif +PAN -UAO -TCO BTYPE=--)
pc : plist_check_prev_next_node+0x50/0x70
lr : plist_check_prev_next_node+0x50/0x70
sp : ffff0018009d3c30
x29: ffff0018009d3c40 x28: ffff800011b32a98
x27: 0000000000000000 x26: ffff001803908000
x25: ffff8000128ea088 x24: ffff800011b32a48
x23: 0000000000000028 x22: ffff001800875c00
x21: ffff800010f9e520 x20: ffff001800875c00
x19: ffff001800fdc6e0 x18: 0000000000000030
x17: 0000000000000000 x16: 0000000000000000
x15: 0736076307640766 x14: 0730073007380731
x13: 0736076307640766 x12: 0730073007380731
x11: 000000000004058d x10: 0000000085a85b76
x9 : ffff8000101436e4 x8 : ffff800011c8ce08
x7 : 0000000000000000 x6 : 0000000000000001
x5 : ffff0017df9ed338 x4 : 0000000000000001
x3 : ffff8017ce62a000 x2 : ffff0017df9ed340
x1 : 0000000000000000 x0 : 0000000000000000
Call trace:
plist_check_prev_next_node+0x50/0x70
plist_check_head+0x80/0xf0
plist_add+0x28/0x140
add_to_avail_list+0x9c/0xf0
_enable_swap_info+0x78/0xb4
__do_sys_swapon+0x918/0xa10
__arm64_sys_swapon+0x20/0x30
el0_svc_common+0x8c/0x220
do_el0_svc+0x2c/0x90
el0_svc+0x1c/0x30
el0_sync_handler+0xa8/0xb0
el0_sync+0x148/0x180
irq event stamp: 2082270
Now, si->lock locked before calling 'del_from_avail_list()'
to make sure other thread see the si had been deleted
and SWP_WRITEOK cleared together, will not reinsert again.
This problem exists in versions after stable 5.10.y.
Cc: stable(a)vger.kernel.org
Tested-by: Yongchen Yin <wb-yyc939293(a)alibaba-inc.com>
Signed-off-by: Rongwei Wang <rongwei.wang(a)linux.alibaba.com>
---
mm/swapfile.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 62ba2bf577d7..2c718f45745f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -679,6 +679,7 @@ static void __del_from_avail_list(struct swap_info_struct *p)
{
int nid;
+ assert_spin_locked(&p->lock);
for_each_node(nid)
plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
}
@@ -2434,8 +2435,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
- del_from_avail_list(p);
spin_lock(&p->lock);
+ del_from_avail_list(p);
if (p->prio < 0) {
struct swap_info_struct *si = p;
int nid;
--
2.27.0
sfp->i2c_block_size is initialized at SFP module insertion in
sfp_sm_mod_probe(). Because of that, if SFP module was not inserted
since boot, ethtool -m leads to zero-length I2C read attempt.
# ethtool -m xge0
i2c i2c-3: adapter quirk: no zero length (addr 0x0050, size 0, read)
Cannot get Module EEPROM data: Operation not supported
If SFP module was plugged then removed at least once,
sfp->i2c_block_size will be initialized and ethtool -m will fail with
different error
# ethtool -m xge0
Cannot get Module EEPROM data: Remote I/O error
Fix this by initializing sfp->i2_block_size at struct sfp allocation
stage so ethtool -m with SFP module removed will fail the same way, i.e.
-EREMOTEIO, in both cases and without errors from I2C adapter.
Signed-off-by: Ivan Bornyakov <i.bornyakov(a)metrotek.ru>
Fixes: 0d035bed2a4a ("net: sfp: VSOL V2801F / CarlitoxxPro CPGOS03-0490 v2.0 workaround")
Cc: stable(a)vger.kernel.org
---
drivers/net/phy/sfp.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index 40c9a64c5e30..5663a184644d 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -212,6 +212,12 @@ static const enum gpiod_flags gpio_flags[] = {
#define SFP_PHY_ADDR 22
#define SFP_PHY_ADDR_ROLLBALL 17
+/* SFP_EEPROM_BLOCK_SIZE is the size of data chunk to read the EEPROM
+ * at a time. Some SFP modules and also some Linux I2C drivers do not like
+ * reads longer than 16 bytes.
+ */
+#define SFP_EEPROM_BLOCK_SIZE 16
+
struct sff_data {
unsigned int gpios;
bool (*module_supported)(const struct sfp_eeprom_id *id);
@@ -1928,11 +1934,7 @@ static int sfp_sm_mod_probe(struct sfp *sfp, bool report)
u8 check;
int ret;
- /* Some SFP modules and also some Linux I2C drivers do not like reads
- * longer than 16 bytes, so read the EEPROM in chunks of 16 bytes at
- * a time.
- */
- sfp->i2c_block_size = 16;
+ sfp->i2c_block_size = SFP_EEPROM_BLOCK_SIZE;
ret = sfp_read(sfp, false, 0, &id.base, sizeof(id.base));
if (ret < 0) {
@@ -2615,6 +2617,7 @@ static struct sfp *sfp_alloc(struct device *dev)
return ERR_PTR(-ENOMEM);
sfp->dev = dev;
+ sfp->i2c_block_size = SFP_EEPROM_BLOCK_SIZE;
mutex_init(&sfp->sm_mutex);
mutex_init(&sfp->st_mutex);
--
2.39.2
The patch titled
Subject: mm/huge_memory.c: warn with pr_warn_ratelimited instead of VM_WARN_ON_ONCE_FOLIO
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-huge_memoryc-warn-with-pr_warn_ratelimited-instead-of-vm_warn_on_once_folio.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Subject: mm/huge_memory.c: warn with pr_warn_ratelimited instead of VM_WARN_ON_ONCE_FOLIO
Date: Thu, 6 Apr 2023 17:20:04 +0900
split_huge_page_to_list() WARNs when called for huge zero pages, which
sounds to me too harsh because it does not imply a kernel bug, but just
notifies the event to admins. On the other hand, this is considered as
critical by syzkaller and makes its testing less efficient, which seems to
me harmful.
So replace the VM_WARN_ON_ONCE_FOLIO with pr_warn_ratelimited.
Link: https://lkml.kernel.org/r/20230406082004.2185420-1-naoya.horiguchi@linux.dev
Fixes: 478d134e9506 ("mm/huge_memory: do not overkill when splitting huge_zero_page")
Signed-off-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Reported-by: syzbot+07a218429c8d19b1fb25(a)syzkaller.appspotmail.com
Link: https://lore.kernel.org/lkml/000000000000a6f34a05e6efcd01@google.com/
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Tetsuo Handa <penguin-kernel(a)i-love.sakura.ne.jp>
Cc: Xu Yu <xuyu(a)linux.alibaba.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/huge_memory.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/mm/huge_memory.c~mm-huge_memoryc-warn-with-pr_warn_ratelimited-instead-of-vm_warn_on_once_folio
+++ a/mm/huge_memory.c
@@ -2665,9 +2665,10 @@ int split_huge_page_to_list(struct page
VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
is_hzp = is_huge_zero_page(&folio->page);
- VM_WARN_ON_ONCE_FOLIO(is_hzp, folio);
- if (is_hzp)
+ if (is_hzp) {
+ pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
return -EBUSY;
+ }
if (folio_test_writeback(folio))
return -EBUSY;
_
Patches currently in -mm which might be from naoya.horiguchi(a)nec.com are
mm-huge_memoryc-warn-with-pr_warn_ratelimited-instead-of-vm_warn_on_once_folio.patch
We keep track of different types of reclaimed pages through
reclaim_state->reclaimed_slab, and we add them to the reported number
of reclaimed pages. For non-memcg reclaim, this makes sense. For memcg
reclaim, we have no clue if those pages are charged to the memcg under
reclaim.
Slab pages are shared by different memcgs, so a freed slab page may have
only been partially charged to the memcg under reclaim. The same goes for
clean file pages from pruned inodes (on highmem systems) or xfs buffer
pages, there is no simple way to currently link them to the memcg under
reclaim.
Stop reporting those freed pages as reclaimed pages during memcg reclaim.
This should make the return value of writing to memory.reclaim, and may
help reduce unnecessary reclaim retries during memcg charging. Writing to
memory.reclaim on the root memcg is considered as cgroup_reclaim(), but
for this case we want to include any freed pages, so use the
global_reclaim() check instead of !cgroup_reclaim().
Generally, this should make the return value of
try_to_free_mem_cgroup_pages() more accurate. In some limited cases (e.g.
freed a slab page that was mostly charged to the memcg under reclaim),
the return value of try_to_free_mem_cgroup_pages() can be underestimated,
but this should be fine. The freed pages will be uncharged anyway, and we
can charge the memcg the next time around as we usually do memcg reclaim
in a retry loop.
The next patch performs some cleanups around reclaim_state and adds an
elaborate comment explaining this to the code. This patch is kept
minimal for easy backporting.
Signed-off-by: Yosry Ahmed <yosryahmed(a)google.com>
Cc: stable(a)vger.kernel.org
---
global_reclaim(sc) does not exist in kernels before 6.3. It can be
replaced with:
!cgroup_reclaim(sc) || mem_cgroup_is_root(sc->target_mem_cgroup)
---
mm/vmscan.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9c1c5e8b24b8f..c82bd89f90364 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5346,8 +5346,10 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
- sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
- current->reclaim_state->reclaimed_slab = 0;
+ if (global_reclaim(sc)) {
+ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
+ current->reclaim_state->reclaimed_slab = 0;
+ }
return success ? MEMCG_LRU_YOUNG : 0;
}
@@ -6472,7 +6474,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
shrink_node_memcgs(pgdat, sc);
- if (reclaim_state) {
+ if (reclaim_state && global_reclaim(sc)) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
--
2.40.0.348.gf938b09366-goog
When the loop over the VMA is terminated early due to an error, the
return code could be overwritten with ENOMEM. Fix the return code by
only setting the error on early loop termination when the error is not
set.
Fixes: 2286a6914c77 ("mm: change mprotect_fixup to vma iterator")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
---
mm/mprotect.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 13e84d8c0797..36351a00c0e8 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -838,7 +838,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
}
tlb_finish_mmu(&tlb);
- if (vma_iter_end(&vmi) < end)
+ if (!error && vma_iter_end(&vmi) < end)
error = -ENOMEM;
out:
--
2.39.2
The patch titled
Subject: mm/mprotect: fix do_mprotect_pkey() return on error
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-mprotect-fix-do_mprotect_pkey-return-on-error.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "Liam R. Howlett" <Liam.Howlett(a)oracle.com>
Subject: mm/mprotect: fix do_mprotect_pkey() return on error
Date: Thu, 6 Apr 2023 15:30:50 -0400
When the loop over the VMA is terminated early due to an error, the return
code could be overwritten with ENOMEM. Fix the return code by only
setting the error on early loop termination when the error is not set.
User-visible effects include: attempts to run mprotect() against a special
mapping or with a poorly-aligned hugetlb address should return -EINVAL,
but they presently return -ENOMEM.
Link: https://lkml.kernel.org/r/20230406193050.1363476-1-Liam.Howlett@oracle.com
Fixes: 2286a6914c77 ("mm: change mprotect_fixup to vma iterator")
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mprotect.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/mprotect.c~mm-mprotect-fix-do_mprotect_pkey-return-on-error
+++ a/mm/mprotect.c
@@ -838,7 +838,7 @@ static int do_mprotect_pkey(unsigned lon
}
tlb_finish_mmu(&tlb);
- if (vma_iter_end(&vmi) < end)
+ if (!error && vma_iter_end(&vmi) < end)
error = -ENOMEM;
out:
_
Patches currently in -mm which might be from Liam.Howlett(a)oracle.com are
mm-mprotect-fix-do_mprotect_pkey-return-on-error.patch
By default the indirect state sampler data (border colors) are stored
in the same heap as the SAMPLER_STATE structure. For userspace drivers
that can be 2 different heaps (dynamic state heap & bindless sampler
state heap). This means that border colors have to copied in 2
different places so that the same SAMPLER_STATE structure find the
right data.
This change is forcing the indirect state sampler data to only be in
the dynamic state pool (more convinient for userspace drivers, they
only have to have one copy of the border colors). This is reproducing
the behavior of the Windows drivers.
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin(a)intel.com>
Cc: stable(a)vger.kernel.org
---
drivers/gpu/drm/i915/gt/intel_gt_regs.h | 1 +
drivers/gpu/drm/i915/gt/intel_workarounds.c | 17 +++++++++++++++++
2 files changed, 18 insertions(+)
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
index 08d76aa06974c..1aaa471d08c56 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
@@ -1141,6 +1141,7 @@
#define ENABLE_SMALLPL REG_BIT(15)
#define SC_DISABLE_POWER_OPTIMIZATION_EBB REG_BIT(9)
#define GEN11_SAMPLER_ENABLE_HEADLESS_MSG REG_BIT(5)
+#define GEN11_INDIRECT_STATE_BASE_ADDR_OVERRIDE REG_BIT(0)
#define GEN9_HALF_SLICE_CHICKEN7 MCR_REG(0xe194)
#define DG2_DISABLE_ROUND_ENABLE_ALLOW_FOR_SSLA REG_BIT(15)
diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
index 32aa1647721ae..734b64e714647 100644
--- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
@@ -2542,6 +2542,23 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
ENABLE_SMALLPL);
}
+ if (GRAPHICS_VER(i915) >= 11) {
+ /* This is not a Wa (although referred to as
+ * WaSetInidrectStateOverride in places), this allows
+ * applications that reference sampler states through
+ * the BindlessSamplerStateBaseAddress to have their
+ * border color relative to DynamicStateBaseAddress
+ * rather than BindlessSamplerStateBaseAddress.
+ *
+ * Otherwise SAMPLER_STATE border colors have to be
+ * copied in multiple heaps (DynamicStateBaseAddress &
+ * BindlessSamplerStateBaseAddress)
+ */
+ wa_mcr_masked_en(wal,
+ GEN10_SAMPLER_MODE,
+ GEN11_INDIRECT_STATE_BASE_ADDR_OVERRIDE);
+ }
+
if (GRAPHICS_VER(i915) == 11) {
/* This is not an Wa. Enable for better image quality */
wa_masked_en(wal,
--
2.34.1
The rust_fmt_argument function is called from printk() to handle the %pA
format specifier.
Since it's called from C, we should mark it extern "C" to make sure it's
ABI compatible.
Cc: stable(a)vger.kernel.org
Fixes: 247b365dc8dc ("rust: add `kernel` crate")
Signed-off-by: David Gow <davidgow(a)google.com>
---
See discussion in:
https://github.com/Rust-for-Linux/linux/pull/967
and
https://github.com/Rust-for-Linux/linux/pull/966
---
rust/kernel/print.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/rust/kernel/print.rs b/rust/kernel/print.rs
index 30103325696d..ec457f0952fe 100644
--- a/rust/kernel/print.rs
+++ b/rust/kernel/print.rs
@@ -18,7 +18,7 @@ use crate::bindings;
// Called from `vsprintf` with format specifier `%pA`.
#[no_mangle]
-unsafe fn rust_fmt_argument(buf: *mut c_char, end: *mut c_char, ptr: *const c_void) -> *mut c_char {
+unsafe extern "C" fn rust_fmt_argument(buf: *mut c_char, end: *mut c_char, ptr: *const c_void) -> *mut c_char {
use fmt::Write;
// SAFETY: The C contract guarantees that `buf` is valid if it's less than `end`.
let mut w = unsafe { RawFormatter::from_ptrs(buf.cast(), end.cast()) };
--
2.39.1.581.gbfd45094c4-goog